4 changes: 4 additions & 0 deletions .gitignore
@@ -1,2 +1,6 @@
output/
mot_benchmark
.DS_Store
.idea/
env/
*.pyc
6 changes: 6 additions & 0 deletions README.md
@@ -6,6 +6,12 @@ See an example [video here](https://motchallenge.net/movies/ETH-Linthescher-SORT

By Alex Bewley

### jkschin comments
1. I first used [this](https://github.com/ultralytics/yolov3) YOLOv3 repository to convert a YOLOv3 model into ONNX.
2. For convenience, download [yolov3-10.onnx](https://drive.google.com/file/d/1m8DTvn4EByXyLr4alSSFon-UzihmhpoV/view?usp=sharing).
3. The key thing about these papers is that they are written for research purposes: detections are taken in as a text file. `onnxyolo.py` instead uses the ONNX model to detect objects and then passes those detections into SORT; a minimal end-to-end sketch follows this list.
4. I tried multiple C++ repositories and found this one the most promising: https://github.com/yasenh/sort-cpp.
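
Putting the pieces together, here is a minimal sketch of the intended pipeline. The weight path and the `mot_benchmark` frame layout match what `onnxyolo.py` and `sort.py` hard-code; the sequence name and frame range are illustrative.

```python
import numpy as np

import onnxyolo
from sort import Sort

sess = onnxyolo.get_sess()   # loads ./weights/yolov3-10.onnx
mot_tracker = Sort()

for frame in range(1, 11):   # illustrative: first 10 frames of one sequence
    p = 'mot_benchmark/train/ETH-Bahnhof/img1/%06d.jpg' % frame
    dets = onnxyolo.get_dets(p, sess)   # rows of [x1, y1, x2, y2, score]
    if dets.size == 0:
        dets = np.empty((0, 5))         # SORT expects an (N, 5) array even with no detections
    tracks = mot_tracker.update(dets)   # rows of [x1, y1, x2, y2, track_id]
    print(frame, tracks)
```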

### Introduction

SORT is a barebones implementation of a visual multiple object tracking framework based on rudimentary data association and state estimation techniques. It is designed for online tracking applications where only past and current frames are available, and the method produces object identities on the fly. While this minimalistic tracker doesn't handle occlusion or re-entering objects, its purpose is to serve as a baseline and testbed for the development of future trackers.
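
At its core the tracker exposes a single `update` call per frame. A minimal sketch of that contract, using hypothetical hand-written boxes (the `Sort` class and its defaults are defined in `sort.py`):

```python
import numpy as np
from sort import Sort

mot_tracker = Sort()  # default max_age / min_hits from sort.py

# Detections for one frame: one row per box, [x1, y1, x2, y2, score].
dets = np.array([[100., 80., 180., 260., 0.9],
                 [300., 120., 360., 300., 0.8]])

tracks = mot_tracker.update(dets)  # rows of [x1, y1, x2, y2, track_id]
print(tracks)
```

Calling `update` once per frame, even with an empty `(0, 5)` array, keeps the internal Kalman filters and track identities consistent across frames.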
69 changes: 69 additions & 0 deletions onnxyolo.py
@@ -0,0 +1,69 @@
import onnxruntime as rt
import random
import numpy as np
from PIL import Image

names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

# this function is from yolo3.utils.letterbox_image
def letterbox_image(image, size):
    '''resize image with unchanged aspect ratio using padding'''
    iw, ih = image.size
    w, h = size
    scale = min(w/iw, h/ih)
    nw = int(iw*scale)
    nh = int(ih*scale)

    image = image.resize((nw, nh), Image.BICUBIC)
    new_image = Image.new('RGB', size, (128, 128, 128))
    new_image.paste(image, ((w-nw)//2, (h-nh)//2))
    return new_image

def preprocess(img):
    model_image_size = (416, 416)
    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
    image_data = np.array(boxed_image, dtype='float32')
    image_data /= 255.                                # scale pixel values to [0, 1]
    image_data = np.transpose(image_data, [2, 0, 1])  # HWC -> CHW
    image_data = np.expand_dims(image_data, 0)        # add batch dimension
    return image_data
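
# Example of the resulting shapes (hypothetical 1280x720 frame): letterbox_image
# scales it to 416x234, pads with gray to 416x416, and preprocess() returns a
# float32 array of shape (1, 3, 416, 416) with values in [0, 1].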

def get_sess():
    sess = rt.InferenceSession("./weights/yolov3-10.onnx")
    # sess = rt.InferenceSession("./weights/yolov5s.onnx")
    # sess = rt.InferenceSession("weights/yolov3-spp-ultralytics.onnx")
    input_name = sess.get_inputs()[0].name
    print("input name", input_name)
    input_shape = sess.get_inputs()[0].shape
    print("input shape", input_shape)
    input_type = sess.get_inputs()[0].type
    print("input type", input_type)
    output_name = sess.get_outputs()[0].name
    print("output name", output_name)
    output_shape = sess.get_outputs()[0].shape
    print("output shape", output_shape)
    output_type = sess.get_outputs()[0].type
    print("output type", output_type)
    return sess
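
# For yolov3-10.onnx from the ONNX model zoo, the prints above should show
# (roughly) an input "input_1" of float32 (1, 3, 416, 416) and a first output of
# candidate boxes shaped (1, n_candidates, 4). Note the model also takes a second
# input, "image_shape" of float32 (1, 2), which get_dets() supplies below.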

def get_dets(p, sess):
    image = Image.open(p)
    # input
    image_data = preprocess(image)
    image_size = np.array([image.size[1], image.size[0]], dtype=np.float32).reshape(1, 2)
    print("Image Shape: ", image_size)
    # The model runs NMS internally and returns three tensors: boxes of shape
    # (1, n_candidates, 4) in [y1, x1, y2, x2] order, scores of shape
    # (1, 80, n_candidates), and selected indices of shape (num_detections, 3),
    # where each row is [batch, class, box_index].
    preds = sess.run(
        ["yolonms_layer_1/ExpandDims_1:0",
         "yolonms_layer_1/ExpandDims_3:0",
         "yolonms_layer_1/concat_2:0"], {"input_1": image_data, "image_shape": image_size})
    print("Preds Shape: ", preds[0].shape)
    print("Preds Shape: ", preds[1].shape)
    print("Preds Shape: ", preds[2].shape)
    print(preds[2])
    dets = []
    for _, cls, idx in preds[2]:
        y1, x1, y2, x2 = preds[0][0][idx]
        score = preds[1][0][cls][idx]
        dets.append([x1, y1, x2, y2, score])  # reorder to [x1, y1, x2, y2, score] for SORT
    return np.array(dets)
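
# Hypothetical smoke test, useful for verifying the ONNX export before wiring
# the detector into SORT (assumes some local image, e.g. data/sample.jpg):
#   sess = get_sess()
#   dets = get_dets('data/sample.jpg', sess)
#   print(dets)  # one row per detection: [x1, y1, x2, y2, score]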
18 changes: 17 additions & 1 deletion requirements.txt
@@ -1,3 +1,19 @@
# Original Requirements
filterpy==1.4.5
scikit-image==0.14.0
scikit-image==0.14.2
lap==0.4.0

# Requirements from YOLOv5 Repository
# Not all of them are used, but keeping them here for now.
Cython
matplotlib>=3.2.2
numpy>=1.18.5
opencv-python>=4.1.2
pillow
# pycocotools>=2.0
PyYAML>=5.3
scipy>=1.4.1
tensorboard>=2.2
torch>=1.6.0
torchvision>=0.7.0
tqdm>=4.41.0
10 changes: 8 additions & 2 deletions sort.py
@@ -30,6 +30,8 @@
import argparse
from filterpy.kalman import KalmanFilter

import onnxyolo

np.random.seed(0)


@@ -276,6 +278,7 @@ def parse_args():
  total_time = 0.0
  total_frames = 0
  colours = np.random.rand(32, 3)  # used only for display
  sess = onnxyolo.get_sess()
  if(display):
    if not os.path.exists('mot_benchmark'):
      print('\n\tERROR: mot_benchmark link not found!\n\n Create a symbolic link to the MOT benchmark\n (https://motchallenge.net/data/2D_MOT_2015/#download). E.g.:\n\n $ ln -s /path/to/MOT2015_challenge/2DMOT2015 mot_benchmark\n\n')
@@ -298,8 +301,11 @@ def parse_args():
print("Processing %s."%(seq))
for frame in range(int(seq_dets[:,0].max())):
frame += 1 #detection and frame numbers begin at 1
dets = seq_dets[seq_dets[:, 0]==frame, 2:7]
dets[:, 2:4] += dets[:, 0:2] #convert to [x1,y1,w,h] to [x1,y1,x2,y2]
p = 'mot_benchmark/%s/%s/img1/%06d.jpg' % (phase, seq, frame)
dets = onnxyolo.get_dets(p ,sess)
print(dets)
# dets = seq_dets[seq_dets[:, 0]==frame, 2:7]
# dets[:, 2:4] += dets[:, 0:2] #convert to [x1,y1,w,h] to [x1,y1,x2,y2]
total_frames += 1

if(display):