
ByteTrack causing bottleneck during object segmentation + tracking #458

@RM503


Hi all,

I am working on a project for tracking excavators at a construction site using RFDETRSegPreview and ByteTrack on some custom data. Detection and segmentation work fine. However, when I first ran inference on a 34 s sample video, the total time was around 50 s, even with the video downsampled to 15 fps, and I identified tracking as the bottleneck. Can anyone suggest any improvements? Here are the relevant methods in my inference class:

def _track_with_bytetrack(self, detections: sv.Detections) -> sv.Detections:
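        # Advance the tracker even on empty frames so lost tracks age out correctly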
        if len(detections) == 0:
            self.tracker.update_with_detections(detections)
            return detections

        detections = self._nms(detections)
        tracked = self.tracker.update_with_detections(detections)

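        # ByteTrack may return tracked boxes without masks (depends on the
        # supervision version, hence the guards below), so re-attach segmentation
        # masks by IoU-matching each track back to a detection, per class when
        # class ids are available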
        if detections.mask is None:
            return tracked
        if tracked.mask is not None:
            return tracked
        if len(tracked) == 0:
            return tracked

        det_boxes = detections.xyxy.astype(np.float32, copy=False)
        trk_boxes = tracked.xyxy.astype(np.float32, copy=False)

        if detections.class_id is not None and tracked.class_id is not None:
            det_cls = detections.class_id
            trk_cls = tracked.class_id
            tracked_masks = [None] * len(tracked)
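            # One slot per track; filled only when a same-class detection matches above mask_match_iou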

            for c in np.intersect1d(np.unique(det_cls), np.unique(trk_cls)):
                det_idx = np.where(det_cls == c)[0]
                trk_idx = np.where(trk_cls == c)[0]
                if det_idx.size == 0 or trk_idx.size == 0:
                    continue

                ious = _pairwise_iou(det_boxes[det_idx], trk_boxes[trk_idx])  # (len(trk_idx), len(det_idx))
                best_det_local = np.argmax(ious, axis=1)
                best_iou = ious[np.arange(ious.shape[0]), best_det_local]
                best_det = det_idx[best_det_local]

                for ti, di, iou in zip(trk_idx, best_det, best_iou):
                    if iou >= self.mask_match_iou:
                        tracked_masks[int(ti)] = detections.mask[int(di)]
        else:
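            # Class-agnostic fallback: match each track to its best-IoU detection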
            ious = _pairwise_iou(det_boxes, trk_boxes)  # (T,N)
            best_det = np.argmax(ious, axis=1)               # (T,)
            best_iou = ious[np.arange(ious.shape[0]), best_det]

            tracked_masks = [
                detections.mask[int(di)] if float(iou) >= self.mask_match_iou else None
                for di, iou in zip(best_det, best_iou)
            ]

        # Attach masks only if every track matched a detection; np.stack keeps
        # the (N, H, W) layout that sv.Detections.mask expects
        tracked.mask = np.stack(tracked_masks) if all(m is not None for m in tracked_masks) else None
        return tracked

def _process_video(self, model: Any, write_video: bool=True, stream: bool=False) -> Optional[Generator[tuple[int, Any, np.ndarray], None, None]]:
        """
        This function processes videos for inference based on the desired frame rate
        initialized with the class.
        """
        def _runner() -> Generator[tuple[int, Any, np.ndarray], None, None]:
            # Initialize as None so these can be referenced for cleanup
            # in the finally block even if the try block fails early
            cap = None
            out = None

            frame_rgb = None
            raw_preds = None
            detections = None
            tracked = None
            centroids = None

            bbox_annotator = None
            mask_annotator = None
            label_annotator = None

            try:
                cap = cv2.VideoCapture(self.input_path)
                if not cap.isOpened():
                    raise RuntimeError(f"Error opening video file: {self.input_path}")

                # Downsampling
                target_fps = 15.0
                fps_in = cap.get(cv2.CAP_PROP_FPS)
                fps_in = float(fps_in) if fps_in and fps_in > 0 else target_fps

                # Choose a frame step so that fps_out approximates target_fps;
                # the VideoWriter's fps and the frame-skipping step must agree
                step = max(1, int(round(fps_in / target_fps)))
                fps_out = fps_in / step
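                # e.g. fps_in=60 -> step=4, fps_out=15.0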

                # Align ByteTrack's frame rate with fps_out (attribute name varies by version)
                if hasattr(self.tracker, "frame_rate"):
                    self.tracker.frame_rate = int(round(fps_out))
                if hasattr(self.tracker, "fps"):
                    self.tracker.fps = int(round(fps_out))

                output_name = Path(self.input_path).stem + "_seg" + Path(self.input_path).suffix
                out_path = str(Path(self.output_dir) / output_name)

                if write_video:
                    out = cv2.VideoWriter(
                        out_path,
                        cv2.VideoWriter_fourcc(*"mp4v"),
                        fps_out,
                        self.resized_dims,
                    )

                # Initialize annotators
                bbox_annotator = sv.BoxAnnotator()
                mask_annotator = sv.MaskAnnotator()
                label_annotator = sv.LabelAnnotator()

                if hasattr(model, "optimize_for_inference"):
                    model.optimize_for_inference()

                logging.info(
                    f"Running inference on video: {Path(self.input_path).name} | "
                    f"fps_in={fps_in:.2f}, target_fps={target_fps:.2f}, step={step}, fps_out={fps_out:.2f}"
                )

                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                frame_idx = 0

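                # inference_mode + bfloat16 autocast keep per-frame prediction latency down on the GPU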
                with (
                    torch.inference_mode(),
                    torch.autocast("cuda", dtype=torch.bfloat16),
                    tqdm.tqdm(total=total_frames, desc="Tracking frames", colour="green") as pbar
                ):
                    timings = {}  # store per-stage processing times for benchmarking
                    n = 0
                    while True:
                        with timer("read", timings):
                            ret, frame = cap.read()
                            
                            if not ret:
                                break

                        pbar.update(1)

                        # Skip frames to downsample (these frames "do not exist" in output timeline)
                        if frame_idx % step != 0:
                            frame_idx += 1
                            continue
                        
                        with timer("pre", timings):
                            frame_rgb = self._process_frame(frame, resized_dims=self.resized_dims)

                        with timer("predict", timings):
                            raw_preds = model.predict(frame_rgb, threshold=self.threshold)

                        with timer("detections", timings):
                            detections = self._to_sv_detections(raw_preds)
                        with timer("track_with_bytetrack", timings):
                            tracked = self._track_with_bytetrack(detections)
                        with timer("track_centroid", timings):
                            centroids = self.centroid_tracker.update(tracked, frame_idx)

                        with timer("annotations", timings):
                            if len(tracked) > 0:
                                labels = self._labels_for(tracked)
                                annotated = bbox_annotator.annotate(scene=frame_rgb, detections=tracked)

                                # masks only exist on inference frames (fine, because we downsampled)
                                if tracked.mask is not None:
                                    annotated = mask_annotator.annotate(scene=annotated, detections=tracked)

                                if labels:
                                    annotated = label_annotator.annotate(
                                        scene=annotated, detections=tracked, labels=labels
                                    )
                            else:
                                annotated = frame_rgb

                        with timer("write", timings):
                            if out is not None:
                                out.write(cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))

                        if stream:
                            yield frame_idx, centroids, annotated
  
                        n += 1
                        frame_idx += 1

                    print("frames inferred:", n)
                    for name, total_time in timings.items():
                        print(f"avg {name:12s}: {total_time/max(n,1):.6f}")

                if out is not None:
                    logging.info(f"Saved output video to: {out_path}")

            finally:
                try:
                    if cap is not None:
                        cap.release()
                except Exception:
                    pass

                try:
                    if out is not None:
                        out.release()
                except Exception:
                    pass

                try:
                    if hasattr(self, "centroid_tracker") and self.centroid_tracker is not None:
                        self.centroid_tracker.close()
                except Exception:
                    pass
                # Release per-frame references after inference is done
                try:
                    del frame_rgb, raw_preds, detections, tracked, centroids
                except Exception:
                    pass

                try:
                    del bbox_annotator, mask_annotator, label_annotator
                except Exception:
                    pass

                gc.collect()

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()

        if stream:
            return _runner()

        for _ in _runner():
            pass

        return None
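
For completeness, `timer` and `_pairwise_iou` are small helpers that aren't shown above: `timer` is a context manager that accumulates elapsed wall-clock time per stage, and `_pairwise_iou` computes a vectorized IoU matrix between the two box sets. They look roughly like this (reconstructed sketch; the exact implementations may differ slightly):

import time
from contextlib import contextmanager

import numpy as np

@contextmanager
def timer(name: str, timings: dict):
    # Accumulate elapsed wall-clock time for this stage under `name`
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[name] = timings.get(name, 0.0) + (time.perf_counter() - start)

def _pairwise_iou(det_boxes: np.ndarray, trk_boxes: np.ndarray) -> np.ndarray:
    # Vectorized IoU between detections (D, 4) and tracks (T, 4) in xyxy
    # format; returns a (T, D) matrix, matching the shape comments above
    tl = np.maximum(trk_boxes[:, None, :2], det_boxes[None, :, :2])  # (T, D, 2)
    br = np.minimum(trk_boxes[:, None, 2:], det_boxes[None, :, 2:])  # (T, D, 2)
    wh = np.clip(br - tl, 0.0, None)
    inter = wh[..., 0] * wh[..., 1]
    det_area = (det_boxes[:, 2] - det_boxes[:, 0]) * (det_boxes[:, 3] - det_boxes[:, 1])
    trk_area = (trk_boxes[:, 2] - trk_boxes[:, 0]) * (trk_boxes[:, 3] - trk_boxes[:, 1])
    union = trk_area[:, None] + det_area[None, :] - inter
    return inter / np.maximum(union, 1e-9)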

For reference, here are average per-frame execution timings for the various stages of reading, inference, tracking, and annotation:

Tracking frames: 100%|██████████| 2056/2056 [00:50<00:00, 40.71it/s]
INFO:root:Saved output video to: /content/drive/MyDrive/excavation_monitoring/sample_inference/excavator_vid_seg.mp4
frames inferred: 514
avg read        : 0.010707
avg pre         : 0.000793
avg predict     : 0.030293
avg detections  : 0.000008
avg track_with_bytetrack: 0.049681
avg track_centroid: 0.002220
avg annotations : 0.002100
avg write       : 0.001900
