
ByteTrack causing bottleneck during object segmentation + tracking #458

@RM503


Hi all,

I am working on a project for tracking excavators at a construction site using RFDETRSegPreview and ByteTrack on some custom data. Detection and segmentation work fine. However, when I first ran inference on a 34 s sample video, the total time was around 50 s, even with the video downsampled to 15 fps, and I identified tracking as the bottleneck. Can anyone suggest any improvements? Here are the relevant methods in my inference class:

def _track_with_bytetrack(self, detections: sv.Detections) -> sv.Detections:
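        # Advance the tracker even on empty frames so lost tracks age out correctly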
        if len(detections) == 0:
            self.tracker.update_with_detections(detections)
            return detections

        detections = self._nms(detections)
        tracked = self.tracker.update_with_detections(detections)

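        # ByteTrack may return tracked boxes without masks (depends on the
        # supervision version, hence the guards below), so re-attach segmentation
        # masks by IoU-matching each track back to a detection, per class when
        # class ids are available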
        if detections.mask is None:
            return tracked
        if tracked.mask is not None:
            return tracked
        if len(tracked) == 0:
            return tracked

        det_boxes = detections.xyxy.astype(np.float32, copy=False)
        trk_boxes = tracked.xyxy.astype(np.float32, copy=False)

        if detections.class_id is not None and tracked.class_id is not None:
            det_cls = detections.class_id
            trk_cls = tracked.class_id
            tracked_masks = [None] * len(tracked)
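            # One slot per track; filled only when a same-class detection matches above mask_match_iou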

            for c in np.intersect1d(np.unique(det_cls), np.unique(trk_cls)):
                det_idx = np.where(det_cls == c)[0]
                trk_idx = np.where(trk_cls == c)[0]
                if det_idx.size == 0 or trk_idx.size == 0:
                    continue

                ious = _pairwise_iou(det_boxes[det_idx], trk_boxes[trk_idx])  # (len(trk_idx), len(det_idx))
                best_det_local = np.argmax(ious, axis=1)
                best_iou = ious[np.arange(ious.shape[0]), best_det_local]
                best_det = det_idx[best_det_local]

                for ti, di, iou in zip(trk_idx, best_det, best_iou):
                    if iou >= self.mask_match_iou:
                        tracked_masks[int(ti)] = detections.mask[int(di)]
        else:
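            # Class-agnostic fallback: match each track to its best-IoU detection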
            ious = _pairwise_iou(det_boxes, trk_boxes)  # (T,N)
            best_det = np.argmax(ious, axis=1)               # (T,)
            best_iou = ious[np.arange(ious.shape[0]), best_det]

            tracked_masks = [
                detections.mask[int(di)] if float(iou) >= self.mask_match_iou else None
                for di, iou in zip(best_det, best_iou)
            ]

        # Attach masks only if every track matched a detection; np.stack keeps
        # the (N, H, W) layout that sv.Detections.mask expects
        tracked.mask = np.stack(tracked_masks) if all(m is not None for m in tracked_masks) else None
        return tracked

def _process_video(self, model: Any, write_video: bool=True, stream: bool=False) -> Optional[Generator[tuple[int, Any, np.ndarray], None, None]]:
        """
        This function processes videos for inference based on the desired frame rate
        initialized with the class.
        """
        def _runner() -> Generator[tuple[int, Any, np.ndarray], None, None]:
            # Initialize as None so these can be referenced for cleanup
            # in the finally block even if the try block fails early
            cap = None
            out = None

            frame_rgb = None
            raw_preds = None
            detections = None
            tracked = None
            centroids = None

            bbox_annotator = None
            mask_annotator = None
            label_annotator = None

            try:
                cap = cv2.VideoCapture(self.input_path)
                if not cap.isOpened():
                    raise RuntimeError(f"Error opening video file: {self.input_path}")

                # Downsampling
                target_fps = 15.0
                fps_in = cap.get(cv2.CAP_PROP_FPS)
                fps_in = float(fps_in) if fps_in and fps_in > 0 else target_fps

                # Choose a frame step so that fps_out approximates target_fps;
                # the VideoWriter's fps and the frame-skipping step must agree
                step = max(1, int(round(fps_in / target_fps)))
                fps_out = fps_in / step
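                # e.g. fps_in=60 -> step=4, fps_out=15.0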

                # Align ByteTrack's frame rate with fps_out (attribute name varies by version)
                if hasattr(self.tracker, "frame_rate"):
                    self.tracker.frame_rate = int(round(fps_out))
                if hasattr(self.tracker, "fps"):
                    self.tracker.fps = int(round(fps_out))

                output_name = Path(self.input_path).stem + "_seg" + Path(self.input_path).suffix
                out_path = str(Path(self.output_dir) / output_name)

                if write_video:
                    out = cv2.VideoWriter(
                        out_path,
                        cv2.VideoWriter_fourcc(*"mp4v"),
                        fps_out,
                        self.resized_dims,
                    )

                # Initialize annotators
                bbox_annotator = sv.BoxAnnotator()
                mask_annotator = sv.MaskAnnotator()
                label_annotator = sv.LabelAnnotator()

                if hasattr(model, "optimize_for_inference"):
                    model.optimize_for_inference()

                logging.info(
                    f"Running inference on video: {Path(self.input_path).name} | "
                    f"fps_in={fps_in:.2f}, target_fps={target_fps:.2f}, step={step}, fps_out={fps_out:.2f}"
                )

                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                frame_idx = 0

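                # inference_mode + bfloat16 autocast keep per-frame prediction latency down on the GPU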
                with (
                    torch.inference_mode(),
                    torch.autocast("cuda", dtype=torch.bfloat16),
                    tqdm.tqdm(total=total_frames, desc="Tracking frames", colour="green") as pbar
                ):
                    timings = {}  # store per-stage processing times for benchmarking
                    n = 0
                    while True:
                        with timer("read", timings):
                            ret, frame = cap.read()
                            
                            if not ret:
                                break

                        pbar.update(1)

                        # Skip frames to downsample (these frames "do not exist" in output timeline)
                        if frame_idx % step != 0:
                            frame_idx += 1
                            continue
                        
                        with timer("pre", timings):
                            frame_rgb = self._process_frame(frame, resized_dims=self.resized_dims)

                        with timer("predict", timings):
                            raw_preds = model.predict(frame_rgb, threshold=self.threshold)

                        with timer("detections", timings):
                            detections = self._to_sv_detections(raw_preds)
                        with timer("track_with_bytetrack", timings):
                            tracked = self._track_with_bytetrack(detections)
                        with timer("track_centroid", timings):
                            centroids = self.centroid_tracker.update(tracked, frame_idx)

                        with timer("annotations", timings):
                            if len(tracked) > 0:
                                labels = self._labels_for(tracked)
                                annotated = bbox_annotator.annotate(scene=frame_rgb, detections=tracked)

                                # masks only exist on inference frames (fine, because we downsampled)
                                if tracked.mask is not None:
                                    annotated = mask_annotator.annotate(scene=annotated, detections=tracked)

                                if labels:
                                    annotated = label_annotator.annotate(
                                        scene=annotated, detections=tracked, labels=labels
                                    )
                            else:
                                annotated = frame_rgb

                        with timer("write", timings):
                            if out is not None:
                                out.write(cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))

                        if stream:
                            yield frame_idx, centroids, annotated
  
                        n += 1
                        frame_idx += 1

                    print("frames inferred:", n)
                    for name, total_time in timings.items():
                        print(f"avg {name:12s}: {total_time/max(n,1):.6f}")

                if out is not None:
                    logging.info(f"Saved output video to: {out_path}")

            finally:
                try:
                    if cap is not None:
                        cap.release()
                except Exception:
                    pass

                try:
                    if out is not None:
                        out.release()
                except Exception:
                    pass

                try:
                    if hasattr(self, "centroid_tracker") and self.centroid_tracker is not None:
                        self.centroid_tracker.close()
                except Exception:
                    pass
                # Release per-frame references after inference is done
                try:
                    del frame_rgb, raw_preds, detections, tracked, centroids
                except Exception:
                    pass

                try:
                    del bbox_annotator, mask_annotator, label_annotator
                except Exception:
                    pass

                gc.collect()

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()

        if stream:
            return _runner()

        for _ in _runner():
            pass

        return None
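
For completeness, `timer` and `_pairwise_iou` are small helpers that aren't shown above: `timer` is a context manager that accumulates elapsed wall-clock time per stage, and `_pairwise_iou` computes a vectorized IoU matrix between the two box sets. They look roughly like this (reconstructed sketch; the exact implementations may differ slightly):

import time
from contextlib import contextmanager

import numpy as np

@contextmanager
def timer(name: str, timings: dict):
    # Accumulate elapsed wall-clock time for this stage under `name`
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[name] = timings.get(name, 0.0) + (time.perf_counter() - start)

def _pairwise_iou(det_boxes: np.ndarray, trk_boxes: np.ndarray) -> np.ndarray:
    # Vectorized IoU between detections (D, 4) and tracks (T, 4) in xyxy
    # format; returns a (T, D) matrix, matching the shape comments above
    tl = np.maximum(trk_boxes[:, None, :2], det_boxes[None, :, :2])  # (T, D, 2)
    br = np.minimum(trk_boxes[:, None, 2:], det_boxes[None, :, 2:])  # (T, D, 2)
    wh = np.clip(br - tl, 0.0, None)
    inter = wh[..., 0] * wh[..., 1]
    det_area = (det_boxes[:, 2] - det_boxes[:, 0]) * (det_boxes[:, 3] - det_boxes[:, 1])
    trk_area = (trk_boxes[:, 2] - trk_boxes[:, 0]) * (trk_boxes[:, 3] - trk_boxes[:, 1])
    union = trk_area[:, None] + det_area[None, :] - inter
    return inter / np.maximum(union, 1e-9)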

For reference, here are average per-frame execution timings for the various stages of reading, inference, tracking, and annotation:

Tracking frames: 100%|██████████| 2056/2056 [00:50<00:00, 40.71it/s]
INFO:root:Saved output video to: /content/drive/MyDrive/excavation_monitoring/sample_inference/excavator_vid_seg.mp4
frames inferred: 514
avg read        : 0.010707
avg pre         : 0.000793
avg predict     : 0.030293
avg detections  : 0.000008
avg track_with_bytetrack: 0.049681
avg track_centroid: 0.002220
avg annotations : 0.002100
avg write       : 0.001900
