告别识别率焦虑:视频 AI 工程化实战 —— 检测→判定→聚合→治理全链路拆解

张开发
2026/6/26 18:54:42 15 分钟阅读
告别识别率焦虑:视频 AI 工程化实战 —— 检测→判定→聚合→治理全链路拆解
## 背景

很多视频 AI 项目上线失败,不是识别率不够,而是工程能力缺失:无法批量跑、无法复盘、无法控成本。vl_video(本人实现的一套方案)的价值,是把识别问题做成了工程流水线。本文不列接口清单,直接拆架构与关键代码,给你一套可迁移的方法。

## 架构决策

主链路按“检测 - 判定 - 聚合 - 治理”四段设计:

- 检测:颜色 + ROI + 候选会话。
- 判定:多轮模型调用 + 投票 + 二次复核。
- 聚合:片段事件映射回整视频时间轴。
- 治理:评测指标、成本统计、调试资产。

## trade-off

- 单体大模型直看整段 vs 候选切片后识别:前者开发快,后者成本与稳定性更优,适合持续运营。
- 强依赖检测模型 vs 轻量前景差分:前者准确,后者便宜,工程上应保留双模并支持切换和回退。
- 追求吞吐 vs 追求可解释:吞吐导向会牺牲复核链路,上线场景更应优先可解释和可审计。

## 失败复盘

- 候选过密导致模型预算失控。修复:按候选分数分配调用次数,并设置单视频硬预算。
- 模型返回格式漂移导致解析失败。修复:Prompt 强约束 + 三层 JSON 提取兜底。
- 聚合后计数正确但事件顺序异常。修复:事件统一映射绝对时间再排序,禁止窗口内局部顺序直接拼接。

## 案例代码

```python
# 节选自 vl_video/src/core/analyzer.py
class VideoAnalyzer:
    CONFIDENCE_WEIGHT = {"high": 3.0, "medium": 2.0, "low": 1.0}

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self._multimodal_cls = None
        detector_config = CandidateDetectorConfig(
            sample_fps=config.candidate_sample_fps,
            sample_width=config.candidate_sample_width,
            absence_duration_sec=config.candidate_min_gap_sec,
            gap_fill_sec=config.candidate_gap_fill_sec,
            min_presence_sec=config.candidate_min_presence_sec,
            min_segment_sec=config.candidate_min_segment_sec,
            max_windows=config.max_candidate_windows_per_video,
            enable_light_normalization=config.enable_light_normalization,
            diff_threshold=config.candidate_peak_threshold,
            long_segment_threshold_sec=config.long_segment_threshold_sec,
            long_segment_search_window_sec=config.long_segment_search_window_sec,
            long_segment_min_side_sec=config.long_segment_min_side_sec,
            long_segment_quiet_sec=config.long_segment_quiet_sec,
            detection_method=config.detection_method,
            enable_yolo_person_detection=config.enable_yolo_person_detection,
            yolo_model_path=config.yolo_model_path,
            yolo_conf_threshold=config.yolo_conf_threshold,
            yolo_imgsz=config.yolo_imgsz,
            yolo_device=config.yolo_device,
            person_near_cabinet_padding=config.person_near_cabinet_padding,
            allow_person_detection_fallback=config.allow_person_detection_fallback,
        )
        self.candidate_detector = CandidateDetector(detector_config)
        self.cabinet_locator = CabinetLocator(
            probe_frames=config.cabinet_probe_frames,
            max_regions=config.cabinet_max_regions,
        )
        self.window_saver = WindowSaver(
            debug_root=config.debug_output_dir,
            save_windows=config.save_candidate_windows,
        )
        self.event_aggregator = EventAggregator()
```

设计意图:配置前移,构建统一“可调参数面”。

风险点:配置膨胀会增加误配置概率,需配套校验与默认值策略。

```python
# 节选自 vl_video/src/detection/candidate_detector.py
def _detect_presence(self, sampled_rgb, sampled_gray, roi_payload):
    if self.config.detection_method == "foreground_diff":
        return self._detect_presence_with_heuristic(sampled_gray, roi_payload)
    if self.config.enable_yolo_person_detection:
        try:
            return self._detect_presence_with_yolo(sampled_rgb, sampled_gray, roi_payload)
        except Exception as exc:
            if not self.config.allow_person_detection_fallback:
                raise RuntimeError(
                    "YOLO 人体检测依赖不可用,请先安装 ultralytics、torch 或将 detection_method 设为 foreground_diff 再运行。"
                ) from exc
            return self._detect_presence_with_heuristic(sampled_gray, roi_payload)
    return self._detect_presence_with_heuristic(sampled_gray, roi_payload)

def detect(self, video_path: str, roi_payload=None) -> dict:
    metadata = probe_video(video_path)
    sampled_rgb = sample_rgb_frames(
        video_path=video_path,
        sample_fps=self.config.sample_fps,
        sample_width=self.config.sample_width,
        metadata=metadata,
    )
    if sampled_rgb.shape[0] < 2:
        return {
            "metadata": metadata,
            "score_series": [],
            "windows": [],
            "presence_flags": [],
            "near_cabinet_flags": [],
            "motion_scores": [],
            "cabinet_activity_scores": [],
        }
```

设计意图:检测策略模块化,让精度模式和资源模式可热切换。

风险点:回退路径如果无日志,线上会出现“悄悄降级”、难以排障的问题。

```python
# 节选自 vl_video/src/core/event_aggregator.py
def aggregate(self, analyzed_segments):
    final_events = []
    uncertain_events = []
    take_count = 0
    put_count = 0
    sorted_segments = sorted(analyzed_segments, key=lambda item: item.get("start_sec", 0))
    for segment in sorted_segments:
        take_count += int(segment.get("take_count", 0))
        put_count += int(segment.get("put_count", 0))
        duration = max(segment.get("end_sec", 0) - segment.get("start_sec", 0), 0.1)
        fps = segment.get("fps", 0)
        for event in segment.get("events", []):
            center_sec = segment["start_sec"] + duration * float(event.get("relative_position", 0.5))
            event_record = {
                "label": event["label"],
                "confidence": event["confidence"],
                "evidence": event["evidence"],
                "description": event["description"],
                "start_sec": center_sec,
                "end_sec": center_sec,
                "start_frame": int(round(center_sec * fps)),
                "end_frame": int(round(center_sec * fps)),
                "source_window_ids": [segment["window_id"]],
                "debug_paths": [segment.get("clip_path", "")],
            }
            event_record["event_id"] = f"event_{len(final_events) + 1:03d}"
            final_events.append(event_record)
    final_events.sort(key=lambda e: e["start_sec"])
    logic_trace = " > ".join([f"{e['label']}{e['start_sec']:.2f}s" for e in final_events])
    return {
        "logic_trace": logic_trace,
        "take_battery_num": take_count,
        "put_battery_num": put_count,
        "events": final_events,
        "uncertain_events": uncertain_events,
    }
```

设计意图:把模型输出转换成审计友好的时间线证据。

风险点:若 fps/时间基准不一致,会导致跨窗口时间错位。

## 总结

这套主链路可复用到多数视频 AI 场景,关键不是框架名,而是设计原则:

- 先降输入熵,再做语义判定。
- 把模型不确定性留在系统内部消化。
- 输出必须可复核,才能长期运营。

更多文章