4 changes: 4 additions & 0 deletions .gitignore
@@ -1,2 +1,6 @@
output/
mot_benchmark
.DS_Store
.idea/
env/
*.pyc
6 changes: 6 additions & 0 deletions README.md
@@ -6,6 +6,12 @@ See an example [video here](https://motchallenge.net/movies/ETH-Linthescher-SORT

By Alex Bewley

### jkschin comments
1. I first used [this](https://github.com/ultralytics/yolov3) YOLOv3 repository to convert a YOLOv3 model into ONNX.
2. For convenience, download [yolov3-10.onnx](https://drive.google.com/file/d/1m8DTvn4EByXyLr4alSSFon-UzihmhpoV/view?usp=sharing).
3. The key thing about these papers is that they are written for research purposes: detections are taken in as a text file. `onnxyolo.py` instead uses the ONNX model to detect objects and then passes those detections into SORT; a minimal end-to-end sketch follows this list.
4. I tried multiple C++ repositories and found this one the most promising: https://github.com/yasenh/sort-cpp.
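
Putting the pieces together, here is a minimal sketch of the intended pipeline. The weight path and the `mot_benchmark` frame layout match what `onnxyolo.py` and `sort.py` hard-code; the sequence name and frame range are illustrative.

```python
import numpy as np

import onnxyolo
from sort import Sort

sess = onnxyolo.get_sess()   # loads ./weights/yolov3-10.onnx
mot_tracker = Sort()

for frame in range(1, 11):   # illustrative: first 10 frames of one sequence
    p = 'mot_benchmark/train/ETH-Bahnhof/img1/%06d.jpg' % frame
    dets = onnxyolo.get_dets(p, sess)   # rows of [x1, y1, x2, y2, score]
    if dets.size == 0:
        dets = np.empty((0, 5))         # SORT expects an (N, 5) array even with no detections
    tracks = mot_tracker.update(dets)   # rows of [x1, y1, x2, y2, track_id]
    print(frame, tracks)
```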

### Introduction

SORT is a barebones implementation of a visual multiple object tracking framework based on rudimentary data association and state estimation techniques. It is designed for online tracking applications where only past and current frames are available, and the method produces object identities on the fly. While this minimalistic tracker doesn't handle occlusion or re-entering objects, its purpose is to serve as a baseline and testbed for the development of future trackers.
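
At its core the tracker exposes a single `update` call per frame. A minimal sketch of that contract, using hypothetical hand-written boxes (the `Sort` class and its defaults are defined in `sort.py`):

```python
import numpy as np
from sort import Sort

mot_tracker = Sort()  # default max_age / min_hits from sort.py

# Detections for one frame: one row per box, [x1, y1, x2, y2, score].
dets = np.array([[100., 80., 180., 260., 0.9],
                 [300., 120., 360., 300., 0.8]])

tracks = mot_tracker.update(dets)  # rows of [x1, y1, x2, y2, track_id]
print(tracks)
```

Calling `update` once per frame, even with an empty `(0, 5)` array, keeps the internal Kalman filters and track identities consistent across frames.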
69 changes: 69 additions & 0 deletions onnxyolo.py
@@ -0,0 +1,69 @@
import onnxruntime as rt
import random
import numpy as np
from PIL import Image

names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

# this function is from yolo3.utils.letterbox_image
def letterbox_image(image, size):
    '''resize image with unchanged aspect ratio using padding'''
    iw, ih = image.size
    w, h = size
    scale = min(w/iw, h/ih)
    nw = int(iw*scale)
    nh = int(ih*scale)

    image = image.resize((nw, nh), Image.BICUBIC)
    new_image = Image.new('RGB', size, (128, 128, 128))
    new_image.paste(image, ((w-nw)//2, (h-nh)//2))
    return new_image

def preprocess(img):
    model_image_size = (416, 416)
    boxed_image = letterbox_image(img, tuple(reversed(model_image_size)))
    image_data = np.array(boxed_image, dtype='float32')
    image_data /= 255.                                # scale pixel values to [0, 1]
    image_data = np.transpose(image_data, [2, 0, 1])  # HWC -> CHW
    image_data = np.expand_dims(image_data, 0)        # add batch dimension
    return image_data
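
# Example of the resulting shapes (hypothetical 1280x720 frame): letterbox_image
# scales it to 416x234, pads with gray to 416x416, and preprocess() returns a
# float32 array of shape (1, 3, 416, 416) with values in [0, 1].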

def get_sess():
    sess = rt.InferenceSession("./weights/yolov3-10.onnx")
    # sess = rt.InferenceSession("./weights/yolov5s.onnx")
    # sess = rt.InferenceSession("weights/yolov3-spp-ultralytics.onnx")
    input_name = sess.get_inputs()[0].name
    print("input name", input_name)
    input_shape = sess.get_inputs()[0].shape
    print("input shape", input_shape)
    input_type = sess.get_inputs()[0].type
    print("input type", input_type)
    output_name = sess.get_outputs()[0].name
    print("output name", output_name)
    output_shape = sess.get_outputs()[0].shape
    print("output shape", output_shape)
    output_type = sess.get_outputs()[0].type
    print("output type", output_type)
    return sess
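
# For yolov3-10.onnx from the ONNX model zoo, the prints above should show
# (roughly) an input "input_1" of float32 (1, 3, 416, 416) and a first output of
# candidate boxes shaped (1, n_candidates, 4). Note the model also takes a second
# input, "image_shape" of float32 (1, 2), which get_dets() supplies below.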

def get_dets(p, sess):
    image = Image.open(p)
    # input
    image_data = preprocess(image)
    image_size = np.array([image.size[1], image.size[0]], dtype=np.float32).reshape(1, 2)
    print("Image Shape: ", image_size)
    # The model runs NMS internally and returns three tensors: boxes of shape
    # (1, n_candidates, 4) in [y1, x1, y2, x2] order, scores of shape
    # (1, 80, n_candidates), and selected indices of shape (num_detections, 3),
    # where each row is [batch, class, box_index].
    preds = sess.run(
        ["yolonms_layer_1/ExpandDims_1:0",
         "yolonms_layer_1/ExpandDims_3:0",
         "yolonms_layer_1/concat_2:0"], {"input_1": image_data, "image_shape": image_size})
    print("Preds Shape: ", preds[0].shape)
    print("Preds Shape: ", preds[1].shape)
    print("Preds Shape: ", preds[2].shape)
    print(preds[2])
    dets = []
    for _, cls, idx in preds[2]:
        y1, x1, y2, x2 = preds[0][0][idx]
        score = preds[1][0][cls][idx]
        dets.append([x1, y1, x2, y2, score])  # reorder to [x1, y1, x2, y2, score] for SORT
    return np.array(dets)
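
# Hypothetical smoke test, useful for verifying the ONNX export before wiring
# the detector into SORT (assumes some local image, e.g. data/sample.jpg):
#   sess = get_sess()
#   dets = get_dets('data/sample.jpg', sess)
#   print(dets)  # one row per detection: [x1, y1, x2, y2, score]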
18 changes: 17 additions & 1 deletion requirements.txt
@@ -1,3 +1,19 @@
# Original Requirements
filterpy==1.4.5
scikit-image==0.14.0
scikit-image==0.14.2
lap==0.4.0

# Requirements from YOLOv5 Repository
# Not all of them are used, but keeping them here for now.
Cython
matplotlib>=3.2.2
numpy>=1.18.5
opencv-python>=4.1.2
pillow
# pycocotools>=2.0
PyYAML>=5.3
scipy>=1.4.1
tensorboard>=2.2
torch>=1.6.0
torchvision>=0.7.0
tqdm>=4.41.0
10 changes: 8 additions & 2 deletions sort.py
@@ -30,6 +30,8 @@
import argparse
from filterpy.kalman import KalmanFilter

import onnxyolo

np.random.seed(0)


@@ -276,6 +278,7 @@ def parse_args():
  total_time = 0.0
  total_frames = 0
  colours = np.random.rand(32, 3)  # used only for display
  sess = onnxyolo.get_sess()
  if(display):
    if not os.path.exists('mot_benchmark'):
      print('\n\tERROR: mot_benchmark link not found!\n\n Create a symbolic link to the MOT benchmark\n (https://motchallenge.net/data/2D_MOT_2015/#download). E.g.:\n\n $ ln -s /path/to/MOT2015_challenge/2DMOT2015 mot_benchmark\n\n')
@@ -298,8 +301,11 @@ def parse_args():
print("Processing %s."%(seq))
for frame in range(int(seq_dets[:,0].max())):
frame += 1 #detection and frame numbers begin at 1
dets = seq_dets[seq_dets[:, 0]==frame, 2:7]
dets[:, 2:4] += dets[:, 0:2] #convert to [x1,y1,w,h] to [x1,y1,x2,y2]
p = 'mot_benchmark/%s/%s/img1/%06d.jpg' % (phase, seq, frame)
dets = onnxyolo.get_dets(p ,sess)
print(dets)
# dets = seq_dets[seq_dets[:, 0]==frame, 2:7]
# dets[:, 2:4] += dets[:, 0:2] #convert to [x1,y1,w,h] to [x1,y1,x2,y2]
total_frames += 1

if(display):