add StrongSORT Tracker

This commit is contained in:
Pongsatorn Kanjanasantisak 2025-08-10 01:23:09 +07:00
parent ffc2e99678
commit b7d8b3266f
93 changed files with 20230 additions and 6 deletions


@@ -0,0 +1,2 @@
from . import args
from . import ocsort


@@ -0,0 +1,110 @@
import argparse
def make_parser():
parser = argparse.ArgumentParser("OC-SORT parameters")
# distributed
parser.add_argument("-b", "--batch-size", type=int, default=1, help="batch size")
parser.add_argument("-d", "--devices", default=None, type=int, help="device for training")
parser.add_argument("--local_rank", default=0, type=int, help="local rank for dist training")
parser.add_argument("--num_machines", default=1, type=int, help="num of node for training")
parser.add_argument("--machine_rank", default=0, type=int, help="node rank for multi-node training")
parser.add_argument(
"-f",
"--exp_file",
default=None,
type=str,
help="pls input your expriment description file",
)
parser.add_argument(
"--test",
dest="test",
default=False,
action="store_true",
help="Evaluating on test-dev set.",
)
parser.add_argument(
"opts",
help="Modify config options using the command-line",
default=None,
nargs=argparse.REMAINDER,
)
# det args
parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
parser.add_argument("--conf", default=0.1, type=float, help="test conf")
parser.add_argument("--nms", default=0.7, type=float, help="test nms threshold")
parser.add_argument("--tsize", default=[800, 1440], nargs="+", type=int, help="test img size")
parser.add_argument("--seed", default=None, type=int, help="eval seed")
# tracking args
parser.add_argument("--track_thresh", type=float, default=0.6, help="detection confidence threshold")
parser.add_argument(
"--iou_thresh",
type=float,
default=0.3,
help="the iou threshold in Sort for matching",
)
parser.add_argument("--min_hits", type=int, default=3, help="min hits to create track in SORT")
parser.add_argument(
"--inertia",
type=float,
default=0.2,
help="the weight of VDC term in cost matrix",
)
parser.add_argument(
"--deltat",
type=int,
default=3,
help="time step difference to estimate direction",
)
parser.add_argument("--track_buffer", type=int, default=30, help="the frames for keep lost tracks")
parser.add_argument(
"--match_thresh",
type=float,
default=0.9,
help="matching threshold for tracking",
)
parser.add_argument(
"--gt-type",
type=str,
default="_val_half",
help="suffix to find the gt annotation",
)
parser.add_argument("--public", action="store_true", help="use public detection")
parser.add_argument("--asso", default="iou", help="similarity function: iou/giou/diou/ciou/ctdis")
# for kitti/bdd100k inference with public detections
parser.add_argument(
"--raw_results_path",
type=str,
default="exps/permatrack_kitti_test/",
help="path to the raw tracking results from other tracks",
)
parser.add_argument("--out_path", type=str, help="path to save output results")
parser.add_argument(
"--hp",
action="store_true",
help="use head padding to add the missing objects during \
initializing the tracks (offline).",
)
# for demo video
parser.add_argument("--demo_type", default="image", help="demo type, eg. image, video and webcam")
parser.add_argument("--path", default="./videos/demo.mp4", help="path to images or video")
parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
parser.add_argument(
"--save_result",
action="store_true",
help="whether to save the inference result of image/video",
)
parser.add_argument(
"--device",
default="gpu",
type=str,
help="device to run our model, can either be cpu or gpu",
)
return parser
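A minimal usage sketch for the parser above; the flag values here are illustrative, not recommended settings:

# Hypothetical invocation: parse a couple of tracking knobs and read back defaults.
args = make_parser().parse_args(["--track_thresh", "0.5", "--asso", "giou"])
print(args.track_thresh, args.iou_thresh, args.asso)  # 0.5 0.3 giou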


@@ -0,0 +1,445 @@
import os
import pdb
import numpy as np
from scipy.special import softmax
def iou_batch(bboxes1, bboxes2):
"""
From SORT: computes IoU between two sets of bboxes in the form [x1,y1,x2,y2]
"""
bboxes2 = np.expand_dims(bboxes2, 0)
bboxes1 = np.expand_dims(bboxes1, 1)
xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0])
yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1])
xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2])
yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3])
w = np.maximum(0.0, xx2 - xx1)
h = np.maximum(0.0, yy2 - yy1)
wh = w * h
o = wh / (
(bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
+ (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])
- wh
)
return o
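A tiny numeric sanity check for iou_batch (boxes illustrative): the second pair overlaps by half of each box, giving intersection 2 over union 6.

# Sketch: pairwise IoU of one detection against two trackers.
dets = np.array([[0, 0, 2, 2]])                 # one 2x2 box
trks = np.array([[0, 0, 2, 2], [1, 0, 3, 2]])   # identical box, half-overlapping box
print(iou_batch(dets, trks))                    # [[1.0, 0.3333]]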
def giou_batch(bboxes1, bboxes2):
"""
:param bboxes1: predicted bboxes, shape (N, 4) as (x1, y1, x2, y2)
:param bboxes2: ground-truth bboxes, shape (M, 4) as (x1, y1, x2, y2)
:return:
"""
# for details, see the GIoU paper: https://arxiv.org/pdf/1902.09630.pdf
# broadcast to an (N, M) pairwise comparison
bboxes2 = np.expand_dims(bboxes2, 0)
bboxes1 = np.expand_dims(bboxes1, 1)
xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0])
yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1])
xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2])
yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3])
w = np.maximum(0.0, xx2 - xx1)
h = np.maximum(0.0, yy2 - yy1)
wh = w * h
iou = wh / (
(bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
+ (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])
- wh
)
xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0])
yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1])
xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2])
yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3])
wc = xxc2 - xxc1
hc = yyc2 - yyc1
assert (wc > 0).all() and (hc > 0).all()
area_enclose = wc * hc
giou = iou - (area_enclose - wh) / area_enclose
giou = (giou + 1.0) / 2.0 # resize from (-1,1) to (0,1)
return giou
def diou_batch(bboxes1, bboxes2):
"""
:param bboxes1: predicted bboxes, shape (N, 4) as (x1, y1, x2, y2)
:param bboxes2: ground-truth bboxes, shape (M, 4) as (x1, y1, x2, y2)
:return:
"""
# for details, see the DIoU paper: https://arxiv.org/pdf/1911.08287.pdf
# broadcast to an (N, M) pairwise comparison
bboxes2 = np.expand_dims(bboxes2, 0)
bboxes1 = np.expand_dims(bboxes1, 1)
# calculate the intersection box
xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0])
yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1])
xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2])
yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3])
w = np.maximum(0.0, xx2 - xx1)
h = np.maximum(0.0, yy2 - yy1)
wh = w * h
iou = wh / (
(bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
+ (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])
- wh
)
centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0
centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0
centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0
centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0
inner_diag = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2
xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0])
yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1])
xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2])
yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3])
outer_diag = (xxc2 - xxc1) ** 2 + (yyc2 - yyc1) ** 2
diou = iou - inner_diag / outer_diag
return (diou + 1) / 2.0 # resize from (-1,1) to (0,1)
def ciou_batch(bboxes1, bboxes2):
"""
:param bboxes1: predicted bboxes, shape (N, 4) as (x1, y1, x2, y2)
:param bboxes2: ground-truth bboxes, shape (M, 4) as (x1, y1, x2, y2)
:return:
"""
# for details, see the CIoU paper: https://arxiv.org/pdf/1911.08287.pdf
# broadcast to an (N, M) pairwise comparison
bboxes2 = np.expand_dims(bboxes2, 0)
bboxes1 = np.expand_dims(bboxes1, 1)
# calculate the intersection box
xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0])
yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1])
xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2])
yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3])
w = np.maximum(0.0, xx2 - xx1)
h = np.maximum(0.0, yy2 - yy1)
wh = w * h
iou = wh / (
(bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
+ (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])
- wh
)
centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0
centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0
centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0
centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0
inner_diag = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2
xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0])
yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1])
xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2])
yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3])
outer_diag = (xxc2 - xxc1) ** 2 + (yyc2 - yyc1) ** 2
w1 = bboxes1[..., 2] - bboxes1[..., 0]
h1 = bboxes1[..., 3] - bboxes1[..., 1]
w2 = bboxes2[..., 2] - bboxes2[..., 0]
h2 = bboxes2[..., 3] - bboxes2[..., 1]
# prevent dividing over zero. add one pixel shift
h2 = h2 + 1.0
h1 = h1 + 1.0
arctan = np.arctan(w2 / h2) - np.arctan(w1 / h1)
v = (4 / (np.pi**2)) * (arctan**2)
S = 1 - iou
alpha = v / (S + v)
ciou = iou - inner_diag / outer_diag - alpha * v
return (ciou + 1) / 2.0 # resize from (-1,1) to (0,1)
def ct_dist(bboxes1, bboxes2):
"""
Measure the center distance between two sets of bounding boxes.
This is a coarse implementation; we don't recommend using it alone
for association, as it can be unstable and sensitive to frame rate
and object speed.
"""
bboxes2 = np.expand_dims(bboxes2, 0)
bboxes1 = np.expand_dims(bboxes1, 1)
centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0
centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0
centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0
centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0
ct_dist2 = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2
ct_dist = np.sqrt(ct_dist2)
# The linear rescaling is a naive version and needs more study
ct_dist = ct_dist / ct_dist.max()
return ct_dist.max() - ct_dist # resize to (0,1)
def speed_direction_batch(dets, tracks):
tracks = tracks[..., np.newaxis]
CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0
CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, (tracks[:, 1] + tracks[:, 3]) / 2.0
dx = CX1 - CX2
dy = CY1 - CY2
norm = np.sqrt(dx**2 + dy**2) + 1e-6
dx = dx / norm
dy = dy / norm
return dy, dx # size: num_track x num_det
def linear_assignment(cost_matrix):
try:
import lap
_, x, y = lap.lapjv(cost_matrix, extend_cost=True)
return np.array([[y[i], i] for i in x if i >= 0])  # (row, col) assignment pairs
except ImportError:
from scipy.optimize import linear_sum_assignment
x, y = linear_sum_assignment(cost_matrix)
return np.array(list(zip(x, y)))
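linear_assignment solves a min-cost matching, preferring lap.lapjv and falling back to SciPy's linear_sum_assignment; both paths return (row, col) pairs. A small sketch:

# Sketch: each row is assigned its cheapest column.
cost = np.array([[0.1, 0.9], [0.8, 0.2]])
print(linear_assignment(cost))  # [[0 0] [1 1]]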
def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3):
"""
Assigns detections to tracked objects (both represented as bounding boxes).
Returns 3 lists: matches, unmatched_detections and unmatched_trackers.
"""
if len(trackers) == 0:
return (
np.empty((0, 2), dtype=int),
np.arange(len(detections)),
np.empty((0, 5), dtype=int),
)
iou_matrix = iou_batch(detections, trackers)
if min(iou_matrix.shape) > 0:
a = (iou_matrix > iou_threshold).astype(np.int32)
if a.sum(1).max() == 1 and a.sum(0).max() == 1:
matched_indices = np.stack(np.where(a), axis=1)
else:
matched_indices = linear_assignment(-iou_matrix)
else:
matched_indices = np.empty(shape=(0, 2))
unmatched_detections = []
for d, det in enumerate(detections):
if d not in matched_indices[:, 0]:
unmatched_detections.append(d)
unmatched_trackers = []
for t, trk in enumerate(trackers):
if t not in matched_indices[:, 1]:
unmatched_trackers.append(t)
# filter out matched with low IOU
matches = []
for m in matched_indices:
if iou_matrix[m[0], m[1]] < iou_threshold:
unmatched_detections.append(m[0])
unmatched_trackers.append(m[1])
else:
matches.append(m.reshape(1, 2))
if len(matches) == 0:
matches = np.empty((0, 2), dtype=int)
else:
matches = np.concatenate(matches, axis=0)
return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
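A sketch of the IoU-only association round above (arrays illustrative): the single detection matches the first tracker and the distant second tracker stays unmatched.

# Sketch: one detection vs. two trackers, threshold 0.3.
dets = np.array([[0, 0, 2, 2]])
trks = np.array([[0, 0, 2, 2], [10, 10, 12, 12]])
m, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks, iou_threshold=0.3)
print(m, unmatched_dets, unmatched_trks)  # [[0 0]] [] [1]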
def compute_aw_max_metric(emb_cost, w_association_emb, bottom=0.5):
w_emb = np.full_like(emb_cost, w_association_emb)
for idx in range(emb_cost.shape[0]):
inds = np.argsort(-emb_cost[idx])
# If there's less than two matches, just keep original weight
if len(inds) < 2:
continue
if emb_cost[idx, inds[0]] == 0:
row_weight = 0
else:
row_weight = 1 - max((emb_cost[idx, inds[1]] / emb_cost[idx, inds[0]]) - bottom, 0) / (1 - bottom)
w_emb[idx] *= row_weight
for idj in range(emb_cost.shape[1]):
inds = np.argsort(-emb_cost[:, idj])
# If there's less than two matches, just keep original weight
if len(inds) < 2:
continue
if emb_cost[inds[0], idj] == 0:
col_weight = 0
else:
col_weight = 1 - max((emb_cost[inds[1], idj] / emb_cost[inds[0], idj]) - bottom, 0) / (1 - bottom)
w_emb[:, idj] *= col_weight
return w_emb * emb_cost
def associate(
detections, trackers, iou_threshold, velocities, previous_obs, vdc_weight, emb_cost, w_assoc_emb, aw_off, aw_param
):
if len(trackers) == 0:
return (
np.empty((0, 2), dtype=int),
np.arange(len(detections)),
np.empty((0, 5), dtype=int),
)
Y, X = speed_direction_batch(detections, previous_obs)
inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1]
inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1)
inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1)
diff_angle_cos = inertia_X * X + inertia_Y * Y
diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1)
diff_angle = np.arccos(diff_angle_cos)
diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi
valid_mask = np.ones(previous_obs.shape[0])
valid_mask[np.where(previous_obs[:, 4] < 0)] = 0
iou_matrix = iou_batch(detections, trackers)
scores = np.repeat(detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1)
# iou_matrix = iou_matrix * scores  # a trick that sometimes works; we don't encourage it
valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1)
angle_diff_cost = (valid_mask * diff_angle) * vdc_weight
angle_diff_cost = angle_diff_cost.T
angle_diff_cost = angle_diff_cost * scores
if min(iou_matrix.shape) > 0:
a = (iou_matrix > iou_threshold).astype(np.int32)
if a.sum(1).max() == 1 and a.sum(0).max() == 1:
matched_indices = np.stack(np.where(a), axis=1)
else:
if emb_cost is None:
emb_cost = 0
else:
emb_cost = emb_cost.cpu().numpy()
emb_cost[iou_matrix <= 0] = 0
if not aw_off:
emb_cost = compute_aw_max_metric(emb_cost, w_assoc_emb, bottom=aw_param)
else:
emb_cost *= w_assoc_emb
final_cost = -(iou_matrix + angle_diff_cost + emb_cost)
matched_indices = linear_assignment(final_cost)
else:
matched_indices = np.empty(shape=(0, 2))
unmatched_detections = []
for d, det in enumerate(detections):
if d not in matched_indices[:, 0]:
unmatched_detections.append(d)
unmatched_trackers = []
for t, trk in enumerate(trackers):
if t not in matched_indices[:, 1]:
unmatched_trackers.append(t)
# filter out matched with low IOU
matches = []
for m in matched_indices:
if iou_matrix[m[0], m[1]] < iou_threshold:
unmatched_detections.append(m[0])
unmatched_trackers.append(m[1])
else:
matches.append(m.reshape(1, 2))
if len(matches) == 0:
matches = np.empty((0, 2), dtype=int)
else:
matches = np.concatenate(matches, axis=0)
return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
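associate combines the IoU matrix, the velocity-direction-consistency (VDC) cost, and the optional embedding cost before solving the assignment. A sketch with embeddings disabled (emb_cost=None) and the VDC term masked out by the previous-observation flag:

# Sketch: detections carry a confidence column; previous_obs col 4 < 0 disables the VDC term.
dets = np.array([[0, 0, 2, 2, 0.9]])
trks = np.array([[0, 0, 2, 2, 0.0]])
velocities = np.zeros((1, 2))                  # (dy, dx) per tracker
previous_obs = np.array([[0, 0, 2, 2, -1]])
m, ud, ut = associate(dets, trks, 0.3, velocities, previous_obs, 0.2, None, 0.75, True, 0.5)
print(m)  # [[0 0]]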
def associate_kitti(detections, trackers, det_cates, iou_threshold, velocities, previous_obs, vdc_weight):
if len(trackers) == 0:
return (
np.empty((0, 2), dtype=int),
np.arange(len(detections)),
np.empty((0, 5), dtype=int),
)
"""
Cost from the velocity direction consistency
"""
Y, X = speed_direction_batch(detections, previous_obs)
inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1]
inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1)
inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1)
diff_angle_cos = inertia_X * X + inertia_Y * Y
diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1)
diff_angle = np.arccos(diff_angle_cos)
diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi
valid_mask = np.ones(previous_obs.shape[0])
valid_mask[np.where(previous_obs[:, 4] < 0)] = 0
valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1)
scores = np.repeat(detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1)
angle_diff_cost = (valid_mask * diff_angle) * vdc_weight
angle_diff_cost = angle_diff_cost.T
angle_diff_cost = angle_diff_cost * scores
"""
Cost from IoU
"""
iou_matrix = iou_batch(detections, trackers)
"""
With multiple categories, generate the cost for category mismatch
"""
num_dets = detections.shape[0]
num_trk = trackers.shape[0]
cate_matrix = np.zeros((num_dets, num_trk))
for i in range(num_dets):
for j in range(num_trk):
if det_cates[i] != trackers[j, 4]:
cate_matrix[i][j] = -1e6
cost_matrix = -iou_matrix - angle_diff_cost - cate_matrix
if min(iou_matrix.shape) > 0:
a = (iou_matrix > iou_threshold).astype(np.int32)
if a.sum(1).max() == 1 and a.sum(0).max() == 1:
matched_indices = np.stack(np.where(a), axis=1)
else:
matched_indices = linear_assignment(cost_matrix)
else:
matched_indices = np.empty(shape=(0, 2))
unmatched_detections = []
for d, det in enumerate(detections):
if d not in matched_indices[:, 0]:
unmatched_detections.append(d)
unmatched_trackers = []
for t, trk in enumerate(trackers):
if t not in matched_indices[:, 1]:
unmatched_trackers.append(t)
# filter out matched with low IOU
matches = []
for m in matched_indices:
if iou_matrix[m[0], m[1]] < iou_threshold:
unmatched_detections.append(m[0])
unmatched_trackers.append(m[1])
else:
matches.append(m.reshape(1, 2))
if len(matches) == 0:
matches = np.empty((0, 2), dtype=int)
else:
matches = np.concatenate(matches, axis=0)
return matches, np.array(unmatched_detections), np.array(unmatched_trackers)


@@ -0,0 +1,170 @@
import pdb
import pickle
import os
import cv2
import numpy as np
class CMCComputer:
def __init__(self, minimum_features=10, method="sparse"):
assert method in ["file", "sparse", "sift"]
os.makedirs("./cache", exist_ok=True)
self.cache_path = "./cache/affine_ocsort.pkl"
self.cache = {}
if os.path.exists(self.cache_path):
with open(self.cache_path, "rb") as fp:
self.cache = pickle.load(fp)
self.minimum_features = minimum_features
self.prev_img = None
self.prev_desc = None
self.sparse_flow_param = dict(
maxCorners=3000,
qualityLevel=0.01,
minDistance=1,
blockSize=3,
useHarrisDetector=False,
k=0.04,
)
self.file_computed = {}
self.comp_function = None
if method == "sparse":
self.comp_function = self._affine_sparse_flow
elif method == "sift":
self.comp_function = self._affine_sift
# Use the same precomputed CMC affine files as BoT-SORT
elif method == "file":
self.comp_function = self._affine_file
self.file_affines = {}
# Maps from tag name to file name
self.file_names = {}
# All the ablation file names
for f_name in os.listdir("./cache/cmc_files/MOT17_ablation/"):
# The tag that'll be passed into compute_affine based on image name
tag = f_name.replace("GMC-", "").replace(".txt", "") + "-FRCNN"
f_name = os.path.join("./cache/cmc_files/MOT17_ablation/", f_name)
self.file_names[tag] = f_name
for f_name in os.listdir("./cache/cmc_files/MOT20_ablation/"):
tag = f_name.replace("GMC-", "").replace(".txt", "")
f_name = os.path.join("./cache/cmc_files/MOT20_ablation/", f_name)
self.file_names[tag] = f_name
# All the test file names
for f_name in os.listdir("./cache/cmc_files/MOTChallenge/"):
tag = f_name.replace("GMC-", "").replace(".txt", "")
if "MOT17" in tag:
tag = tag + "-FRCNN"
# If it's an ablation one (not test) don't overwrite it
if tag in self.file_names:
continue
f_name = os.path.join("./cache/cmc_files/MOTChallenge/", f_name)
self.file_names[tag] = f_name
def compute_affine(self, img, bbox, tag):
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
if tag in self.cache:
A = self.cache[tag]
return A
mask = np.ones_like(img, dtype=np.uint8)
if bbox.shape[0] > 0:
bbox = np.round(bbox).astype(np.int32)
bbox[bbox < 0] = 0
for bb in bbox:
mask[bb[1] : bb[3], bb[0] : bb[2]] = 0
A = self.comp_function(img, mask, tag)
self.cache[tag] = A
return A
def _load_file(self, name):
affines = []
with open(self.file_names[name], "r") as fp:
for line in fp:
tokens = [float(f) for f in line.split("\t")[1:7]]
A = np.eye(2, 3)
A[0, 0] = tokens[0]
A[0, 1] = tokens[1]
A[0, 2] = tokens[2]
A[1, 0] = tokens[3]
A[1, 1] = tokens[4]
A[1, 2] = tokens[5]
affines.append(A)
self.file_affines[name] = affines
def _affine_file(self, frame, mask, tag):
name, num = tag.split(":")
if name not in self.file_affines:
self._load_file(name)
if name not in self.file_affines:
raise RuntimeError("Error loading file affines for CMC.")
return self.file_affines[name][int(num) - 1]
def _affine_sift(self, frame, mask, tag):
A = np.eye(2, 3)
detector = cv2.SIFT_create()
kp, desc = detector.detectAndCompute(frame, mask)
if self.prev_desc is None:
self.prev_desc = [kp, desc]
return A
if desc.shape[0] < self.minimum_features or self.prev_desc[1].shape[0] < self.minimum_features:
return A
bf = cv2.BFMatcher(cv2.NORM_L2)
matches = bf.knnMatch(self.prev_desc[1], desc, k=2)
good = []
for m, n in matches:
if m.distance < 0.7 * n.distance:
good.append(m)
if len(good) > self.minimum_features:
src_pts = np.float32([self.prev_desc[0][m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
dst_pts = np.float32([kp[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
A, _ = cv2.estimateAffinePartial2D(src_pts, dst_pts, method=cv2.RANSAC)
else:
print("Warning: not enough matching points")
if A is None:
A = np.eye(2, 3)
self.prev_desc = [kp, desc]
return A
def _affine_sparse_flow(self, frame, mask, tag):
# Initialize
A = np.eye(2, 3)
# find the keypoints
keypoints = cv2.goodFeaturesToTrack(frame, mask=mask, **self.sparse_flow_param)
# Handle first frame
if self.prev_img is None:
self.prev_img = frame
self.prev_desc = keypoints
return A
matched_kp, status, err = cv2.calcOpticalFlowPyrLK(self.prev_img, frame, self.prev_desc, None)
matched_kp = matched_kp.reshape(-1, 2)
status = status.reshape(-1).astype(bool)  # boolean mask of successfully tracked points
prev_points = self.prev_desc.reshape(-1, 2)
prev_points = prev_points[status]
curr_points = matched_kp[status]
# Find rigid matrix
if prev_points.shape[0] > self.minimum_features:
A, _ = cv2.estimateAffinePartial2D(prev_points, curr_points, method=cv2.RANSAC)
else:
print("Warning: not enough matching points")
if A is None:
A = np.eye(2, 3)
self.prev_img = frame
self.prev_desc = keypoints
return A
def dump_cache(self):
with open(self.cache_path, "wb") as fp:
pickle.dump(self.cache, fp)
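A hedged usage sketch for the camera-motion-compensation helper above; frame_bgr and det_boxes_xyxy are placeholders for a BGR frame and an (N, 4) box array, and the tag doubles as the cache key:

# Sketch: estimate the inter-frame affine, masking out detection regions.
cmc = CMCComputer(method="sparse")
A = cmc.compute_affine(frame_bgr, det_boxes_xyxy, tag="seq01:000001")  # 2x3 affine
cmc.dump_cache()  # persists ./cache/affine_ocsort.pkl for later runs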


@@ -0,0 +1,12 @@
# Trial number: 137
# HOTA, MOTA, IDF1: [55.567]
deepocsort:
asso_func: giou
conf_thres: 0.5122620708221085
delta_t: 1
det_thresh: 0
inertia: 0.3941737016672115
iou_thresh: 0.22136877277096445
max_age: 50
min_hits: 1
use_byte: false
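These tuned values (the "Trial number" comment suggests a hyperparameter search) could be loaded and handed to the tracker; a minimal sketch, assuming PyYAML is available and the file is saved as deepocsort.yaml (a hypothetical path):

# Sketch: read the tuned hyperparameters back as a dict.
import yaml
with open("deepocsort.yaml") as fp:
    cfg = yaml.safe_load(fp)["deepocsort"]
print(cfg["asso_func"], cfg["conf_thres"])  # giou 0.5122...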


@@ -0,0 +1,116 @@
import pdb
from collections import OrderedDict
import os
import pickle
import torch
import cv2
import torchvision
import numpy as np
class EmbeddingComputer:
def __init__(self, dataset):
self.model = None
self.dataset = dataset
self.crop_size = (128, 384)
os.makedirs("./cache/embeddings/", exist_ok=True)
self.cache_path = "./cache/embeddings/{}_embedding.pkl"
self.cache = {}
self.cache_name = ""
def load_cache(self, path):
self.cache_name = path
cache_path = self.cache_path.format(path)
if os.path.exists(cache_path):
with open(cache_path, "rb") as fp:
self.cache = pickle.load(fp)
def compute_embedding(self, img, bbox, tag, is_numpy=True):
if self.cache_name != tag.split(":")[0]:
self.load_cache(tag.split(":")[0])
if tag in self.cache:
embs = self.cache[tag]
if embs.shape[0] != bbox.shape[0]:
raise RuntimeError(
"ERROR: The number of cached embeddings don't match the "
"number of detections.\nWas the detector model changed? Delete cache if so."
)
return embs
if self.model is None:
self.initialize_model()
# Make sure bbox is within image frame
if is_numpy:
h, w = img.shape[:2]
else:
h, w = img.shape[2:]
results = np.round(bbox).astype(np.int32)
results[:, 0] = results[:, 0].clip(0, w)
results[:, 1] = results[:, 1].clip(0, h)
results[:, 2] = results[:, 2].clip(0, w)
results[:, 3] = results[:, 3].clip(0, h)
# Generate all the crops
crops = []
for p in results:
if is_numpy:
crop = img[p[1] : p[3], p[0] : p[2]]
crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
crop = cv2.resize(crop, self.crop_size, interpolation=cv2.INTER_LINEAR)
crop = torch.as_tensor(crop.astype("float32").transpose(2, 0, 1))
crop = crop.unsqueeze(0)
else:
crop = img[:, :, p[1] : p[3], p[0] : p[2]]
crop = torchvision.transforms.functional.resize(crop, self.crop_size)
crops.append(crop)
crops = torch.cat(crops, dim=0)
# Create embeddings and l2 normalize them
with torch.no_grad():
crops = crops.cuda()
crops = crops.half()
embs = self.model(crops)
embs = torch.nn.functional.normalize(embs)
embs = embs.cpu().numpy()
self.cache[tag] = embs
return embs
def initialize_model(self):
"""
model = torchreid.models.build_model(name="osnet_ain_x1_0", num_classes=2510, loss="softmax", pretrained=False)
sd = torch.load("external/weights/osnet_ain_ms_d_c.pth.tar")["state_dict"]
new_state_dict = OrderedDict()
for k, v in sd.items():
name = k[7:] # remove `module.`
new_state_dict[name] = v
# load params
model.load_state_dict(new_state_dict)
model.eval()
model.cuda()
"""
if self.dataset == "mot17":
path = "external/weights/mot17_sbs_S50.pth"
elif self.dataset == "mot20":
path = "external/weights/mot20_sbs_S50.pth"
elif self.dataset == "dance":
path = None
else:
raise RuntimeError("Need the path for a new ReID model.")
# NOTE: FastReID is not imported in this file; in Deep OC-SORT it comes from
# external.adaptors.fastreid_adaptor import FastReID.
model = FastReID(path)
model.eval()
model.cuda()
model.half()
self.model = model
def dump_cache(self):
if self.cache_name:
with open(self.cache_path.format(self.cache_name), "wb") as fp:
pickle.dump(self.cache, fp)
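A usage sketch for the embedding cache above; frame_bgr and det_boxes_xyxy are placeholders, and the part of the tag before ':' selects the per-sequence cache file:

# Sketch: compute (and cache) appearance embeddings for one frame's detections.
embedder = EmbeddingComputer(dataset="mot17")
embs = embedder.compute_embedding(frame_bgr, det_boxes_xyxy, tag="MOT17-02:000001")
embedder.dump_cache()  # embs: (N, D) L2-normalized features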

File diff suppressed because it is too large


@@ -0,0 +1,670 @@
"""
This script is adapted from the SORT script by Alex Bewley alex@bewley.ai
"""
from __future__ import print_function
import pdb
import pickle
import cv2
import torch
import torchvision
import numpy as np
from .association import *
from .embedding import EmbeddingComputer
from .cmc import CMCComputer
from reid_multibackend import ReIDDetectMultiBackend
def k_previous_obs(observations, cur_age, k):
if len(observations) == 0:
return [-1, -1, -1, -1, -1]
for i in range(k):
dt = k - i
if cur_age - dt in observations:
return observations[cur_age - dt]
max_age = max(observations.keys())
return observations[max_age]
def convert_bbox_to_z(bbox):
"""
Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form
[x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is
the aspect ratio
"""
w = bbox[2] - bbox[0]
h = bbox[3] - bbox[1]
x = bbox[0] + w / 2.0
y = bbox[1] + h / 2.0
s = w * h # scale is just area
r = w / float(h + 1e-6)
return np.array([x, y, s, r]).reshape((4, 1))
def convert_bbox_to_z_new(bbox):
w = bbox[2] - bbox[0]
h = bbox[3] - bbox[1]
x = bbox[0] + w / 2.0
y = bbox[1] + h / 2.0
return np.array([x, y, w, h]).reshape((4, 1))
def convert_x_to_bbox_new(x):
x, y, w, h = x.reshape(-1)[:4]
return np.array([x - w / 2, y - h / 2, x + w / 2, y + h / 2]).reshape(1, 4)
def convert_x_to_bbox(x, score=None):
"""
Takes a bounding box in the centre form [x,y,s,r] and returns it in the form
[x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right
"""
w = np.sqrt(x[2] * x[3])
h = x[2] / w
if score is None:
return np.array([x[0] - w / 2.0, x[1] - h / 2.0, x[0] + w / 2.0, x[1] + h / 2.0]).reshape((1, 4))
else:
return np.array([x[0] - w / 2.0, x[1] - h / 2.0, x[0] + w / 2.0, x[1] + h / 2.0, score]).reshape((1, 5))
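A quick round-trip check of the two box parameterizations above (numbers illustrative):

# Sketch: [x1,y1,x2,y2] -> [cx,cy,s,r] -> back again.
z = convert_bbox_to_z([10, 20, 50, 100])  # cx=30, cy=60, s=3200, r=0.5
print(convert_x_to_bbox(z))               # [[ 10.  20.  50. 100.]]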
def speed_direction(bbox1, bbox2):
cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0
cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0
speed = np.array([cy2 - cy1, cx2 - cx1])
norm = np.sqrt((cy2 - cy1) ** 2 + (cx2 - cx1) ** 2) + 1e-6
return speed / norm
def new_kf_process_noise(w, h, p=1 / 20, v=1 / 160):
Q = np.diag(
((p * w) ** 2, (p * h) ** 2, (p * w) ** 2, (p * h) ** 2, (v * w) ** 2, (v * h) ** 2, (v * w) ** 2, (v * h) ** 2)
)
return Q
def new_kf_measurement_noise(w, h, m=1 / 20):
w_var = (m * w) ** 2
h_var = (m * h) ** 2
R = np.diag((w_var, h_var, w_var, h_var))
return R
class KalmanBoxTracker(object):
"""
This class represents the internal state of individual tracked objects observed as bbox.
"""
count = 0
def __init__(self, bbox, cls, delta_t=3, orig=False, emb=None, alpha=0, new_kf=False):
"""
Initialises a tracker using initial bounding box.
"""
# define constant velocity model
if not orig:
from .kalmanfilter import KalmanFilterNew as KalmanFilter
else:
from filterpy.kalman import KalmanFilter
self.cls = cls
self.conf = bbox[-1]
self.new_kf = new_kf
if new_kf:
self.kf = KalmanFilter(dim_x=8, dim_z=4)
self.kf.F = np.array(
[
# x y w h x' y' w' h'
[1, 0, 0, 0, 1, 0, 0, 0],
[0, 1, 0, 0, 0, 1, 0, 0],
[0, 0, 1, 0, 0, 0, 1, 0],
[0, 0, 0, 1, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 1],
]
)
self.kf.H = np.array(
[
[1, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0],
]
)
_, _, w, h = convert_bbox_to_z_new(bbox).reshape(-1)
self.kf.P = new_kf_process_noise(w, h)
self.kf.P[:4, :4] *= 4
self.kf.P[4:, 4:] *= 100
# Process and measurement uncertainty happen in functions
self.bbox_to_z_func = convert_bbox_to_z_new
self.x_to_bbox_func = convert_x_to_bbox_new
else:
self.kf = KalmanFilter(dim_x=7, dim_z=4)
self.kf.F = np.array(
[
# x y s r x' y' s'
[1, 0, 0, 0, 1, 0, 0],
[0, 1, 0, 0, 0, 1, 0],
[0, 0, 1, 0, 0, 0, 1],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 1],
]
)
self.kf.H = np.array(
[
[1, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
]
)
self.kf.R[2:, 2:] *= 10.0
self.kf.P[4:, 4:] *= 1000.0 # give high uncertainty to the unobservable initial velocities
self.kf.P *= 10.0
self.kf.Q[-1, -1] *= 0.01
self.kf.Q[4:, 4:] *= 0.01
self.bbox_to_z_func = convert_bbox_to_z
self.x_to_bbox_func = convert_x_to_bbox
self.kf.x[:4] = self.bbox_to_z_func(bbox)
self.time_since_update = 0
self.id = KalmanBoxTracker.count
KalmanBoxTracker.count += 1
self.history = []
self.hits = 0
self.hit_streak = 0
self.age = 0
"""
NOTE: [-1,-1,-1,-1,-1] is a compromise placeholder for the non-observation status, and the same
holds for the return value of k_previous_obs. It is ugly and I do not like it, but it lets us build
the observation array in a fast and unified way (see k_observations = np.array([k_previous_obs(...)]) below), so let's bear with it for now.
"""
# Used for OCR
self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder
# Used to output track after min_hits reached
self.history_observations = []
# Used for velocity
self.observations = dict()
self.velocity = None
self.delta_t = delta_t
self.emb = emb
self.frozen = False
def update(self, bbox, cls):
"""
Updates the state vector with observed bbox.
"""
if bbox is not None:
self.frozen = False
self.cls = cls
if self.last_observation.sum() >= 0:  # a previous observation exists
previous_box = None
for dt in range(self.delta_t, 0, -1):
if self.age - dt in self.observations:
previous_box = self.observations[self.age - dt]
break
if previous_box is None:
previous_box = self.last_observation
"""
Estimate the track speed direction with observations \Delta t steps away
"""
self.velocity = speed_direction(previous_box, bbox)
"""
Insert new observations. This is an ugly way to maintain both self.observations
and self.history_observations. Bear with it for the moment.
"""
self.last_observation = bbox
self.observations[self.age] = bbox
self.history_observations.append(bbox)
self.time_since_update = 0
self.history = []
self.hits += 1
self.hit_streak += 1
if self.new_kf:
R = new_kf_measurement_noise(self.kf.x[2, 0], self.kf.x[3, 0])
self.kf.update(self.bbox_to_z_func(bbox), R=R)
else:
self.kf.update(self.bbox_to_z_func(bbox))
else:
self.kf.update(bbox)
self.frozen = True
def update_emb(self, emb, alpha=0.9):
self.emb = alpha * self.emb + (1 - alpha) * emb
self.emb /= np.linalg.norm(self.emb)
def get_emb(self):
return self.emb.cpu()
def apply_affine_correction(self, affine):
m = affine[:, :2]
t = affine[:, 2].reshape(2, 1)
# For OCR
if self.last_observation.sum() > 0:
ps = self.last_observation[:4].reshape(2, 2).T
ps = m @ ps + t
self.last_observation[:4] = ps.T.reshape(-1)
# Apply to each box in the range of velocity computation
for dt in range(self.delta_t, -1, -1):
if self.age - dt in self.observations:
ps = self.observations[self.age - dt][:4].reshape(2, 2).T
ps = m @ ps + t
self.observations[self.age - dt][:4] = ps.T.reshape(-1)
# Also need to change kf state, but might be frozen
self.kf.apply_affine_correction(m, t, self.new_kf)
def predict(self):
"""
Advances the state vector and returns the predicted bounding box estimate.
"""
# Don't allow negative bounding boxes
if self.new_kf:
if self.kf.x[2] + self.kf.x[6] <= 0:
self.kf.x[6] = 0
if self.kf.x[3] + self.kf.x[7] <= 0:
self.kf.x[7] = 0
# Stop velocity, will update in kf during OOS
if self.frozen:
self.kf.x[6] = self.kf.x[7] = 0
Q = new_kf_process_noise(self.kf.x[2, 0], self.kf.x[3, 0])
else:
if (self.kf.x[6] + self.kf.x[2]) <= 0:
self.kf.x[6] *= 0.0
Q = None
self.kf.predict(Q=Q)
self.age += 1
if self.time_since_update > 0:
self.hit_streak = 0
self.time_since_update += 1
self.history.append(self.x_to_bbox_func(self.kf.x))
return self.history[-1]
def get_state(self):
"""
Returns the current bounding box estimate.
"""
return self.x_to_bbox_func(self.kf.x)
def mahalanobis(self, bbox):
"""Should be run after a predict() call for accuracy."""
return self.kf.md_for_measurement(self.bbox_to_z_func(bbox))
"""
We support multiple ways for association cost calculation, by default
we use IoU. GIoU may have better performance in some situations. Note that
we only roughly normalize the costs of all methods to (0,1), which may not be
the best practice.
"""
ASSO_FUNCS = {
"iou": iou_batch,
"giou": giou_batch,
"ciou": ciou_batch,
"diou": diou_batch,
"ct_dist": ct_dist,
}
class OCSort(object):
def __init__(
self,
model_weights,
device,
fp16,
det_thresh,
max_age=30,
min_hits=3,
iou_threshold=0.3,
delta_t=3,
asso_func="iou",
inertia=0.2,
w_association_emb=0.75,
alpha_fixed_emb=0.95,
aw_param=0.5,
embedding_off=False,
cmc_off=False,
aw_off=False,
new_kf_off=False,
**kwargs
):
"""
Sets key parameters for SORT
"""
self.max_age = max_age
self.min_hits = min_hits
self.iou_threshold = iou_threshold
self.trackers = []
self.frame_count = 0
self.det_thresh = det_thresh
self.delta_t = delta_t
self.asso_func = ASSO_FUNCS[asso_func]
self.inertia = inertia
self.w_association_emb = w_association_emb
self.alpha_fixed_emb = alpha_fixed_emb
self.aw_param = aw_param
KalmanBoxTracker.count = 0
self.embedder = ReIDDetectMultiBackend(weights=model_weights, device=device, fp16=fp16)
self.cmc = CMCComputer()
self.embedding_off = embedding_off
self.cmc_off = cmc_off
self.aw_off = aw_off
self.new_kf_off = new_kf_off
def update(self, dets, img_numpy, tag='blub'):
"""
Params:
dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]
Requires: this method must be called once for each frame even with empty detections (use np.empty((0, 5)) for frames without detections).
Returns a similar array, where the last column is the object ID.
NOTE: The number of objects returned may differ from the number of detections provided.
"""
xyxys = dets[:, 0:4]
scores = dets[:, 4]
clss = dets[:, 5]
classes = clss.numpy()
xyxys = xyxys.numpy()
scores = scores.numpy()
dets = dets[:, 0:6].numpy()
remain_inds = scores > self.det_thresh
dets = dets[remain_inds]
self.frame_count += 1  # the min_hits gating below relies on this counter advancing
self.height, self.width = img_numpy.shape[:2]
# Rescale
#scale = min(img_tensor.shape[2] / img_numpy.shape[0], img_tensor.shape[3] / img_numpy.shape[1])
#dets[:, :4] /= scale
# Embedding
if self.embedding_off or dets.shape[0] == 0:
dets_embs = np.ones((dets.shape[0], 1))
else:
# (Ndets x X) [512, 1024, 2048]
#dets_embs = self.embedder.compute_embedding(img_numpy, dets[:, :4], tag)
dets_embs = self._get_features(dets[:, :4], img_numpy)
# CMC
if not self.cmc_off:
transform = self.cmc.compute_affine(img_numpy, dets[:, :4], tag)
for trk in self.trackers:
trk.apply_affine_correction(transform)
trust = (dets[:, 4] - self.det_thresh) / (1 - self.det_thresh)
af = self.alpha_fixed_emb
# From [self.alpha_fixed_emb, 1], goes to 1 as detector is less confident
dets_alpha = af + (1 - af) * (1 - trust)
# get predicted locations from existing trackers.
trks = np.zeros((len(self.trackers), 5))
trk_embs = []
to_del = []
ret = []
for t, trk in enumerate(trks):
pos = self.trackers[t].predict()[0]
trk[:] = [pos[0], pos[1], pos[2], pos[3], 0]
if np.any(np.isnan(pos)):
to_del.append(t)
else:
trk_embs.append(self.trackers[t].get_emb())
trks = np.ma.compress_rows(np.ma.masked_invalid(trks))
if len(trk_embs) > 0:
trk_embs = np.vstack(trk_embs)
else:
trk_embs = np.array(trk_embs)
for t in reversed(to_del):
self.trackers.pop(t)
velocities = np.array([trk.velocity if trk.velocity is not None else np.array((0, 0)) for trk in self.trackers])
last_boxes = np.array([trk.last_observation for trk in self.trackers])
k_observations = np.array([k_previous_obs(trk.observations, trk.age, self.delta_t) for trk in self.trackers])
"""
First round of association
"""
# (M detections X N tracks, final score)
if self.embedding_off or dets.shape[0] == 0 or trk_embs.shape[0] == 0:
stage1_emb_cost = None
else:
stage1_emb_cost = dets_embs @ trk_embs.T
matched, unmatched_dets, unmatched_trks = associate(
dets,
trks,
self.iou_threshold,
velocities,
k_observations,
self.inertia,
stage1_emb_cost,
self.w_association_emb,
self.aw_off,
self.aw_param,
)
for m in matched:
self.trackers[m[1]].update(dets[m[0], :5], dets[m[0], 5])
self.trackers[m[1]].update_emb(dets_embs[m[0]], alpha=dets_alpha[m[0]])
"""
Second round of association by OCR
"""
if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0:
left_dets = dets[unmatched_dets]
left_dets_embs = dets_embs[unmatched_dets]
left_trks = last_boxes[unmatched_trks]
left_trks_embs = trk_embs[unmatched_trks]
iou_left = self.asso_func(left_dets, left_trks)
# TODO: it may be better without this
emb_cost_left = left_dets_embs @ left_trks_embs.T
if self.embedding_off:
emb_cost_left = np.zeros_like(emb_cost_left)
iou_left = np.array(iou_left)
if iou_left.max() > self.iou_threshold:
"""
NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may
get a higher performance especially on MOT17/MOT20 datasets. But we keep it
uniform here for simplicity
"""
rematched_indices = linear_assignment(-iou_left)
to_remove_det_indices = []
to_remove_trk_indices = []
for m in rematched_indices:
det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[1]]
if iou_left[m[0], m[1]] < self.iou_threshold:
continue
self.trackers[trk_ind].update(dets[det_ind, :5], dets[det_ind, 5])
self.trackers[trk_ind].update_emb(dets_embs[det_ind], alpha=dets_alpha[det_ind])
to_remove_det_indices.append(det_ind)
to_remove_trk_indices.append(trk_ind)
unmatched_dets = np.setdiff1d(unmatched_dets, np.array(to_remove_det_indices))
unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices))
for m in unmatched_trks:
self.trackers[m].update(None, None)
# create and initialise new trackers for unmatched detections
for i in unmatched_dets:
trk = KalmanBoxTracker(
dets[i, :5], dets[i, 5], delta_t=self.delta_t, emb=dets_embs[i], alpha=dets_alpha[i], new_kf=not self.new_kf_off
)
self.trackers.append(trk)
i = len(self.trackers)
for trk in reversed(self.trackers):
if trk.last_observation.sum() < 0:
d = trk.get_state()[0]
else:
"""
it is optional whether to use the recent observation or the Kalman filter
prediction; we did not notice a significant difference here
"""
d = trk.last_observation[:4]
if (trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits):
# +1 as MOT benchmark requires positive
ret.append(np.concatenate((d, [trk.id + 1], [trk.cls], [trk.conf])).reshape(1, -1))
i -= 1
# remove dead tracklet
if trk.time_since_update > self.max_age:
self.trackers.pop(i)
if len(ret) > 0:
return np.concatenate(ret)
return np.empty((0, 7))  # match the 7-column rows appended above (x1, y1, x2, y2, id, cls, conf)
def _xywh_to_xyxy(self, bbox_xywh):
x, y, w, h = bbox_xywh
x1 = max(int(x - w / 2), 0)
x2 = min(int(x + w / 2), self.width - 1)
y1 = max(int(y - h / 2), 0)
y2 = min(int(y + h / 2), self.height - 1)
return x1, y1, x2, y2
def _get_features(self, bbox_xywh, ori_img):
im_crops = []
for box in bbox_xywh:
x1, y1, x2, y2 = self._xywh_to_xyxy(box)
im = ori_img[y1:y2, x1:x2]
im_crops.append(im)
if im_crops:
features = self.embedder(im_crops).cpu()
else:
features = np.array([])
return features
def update_public(self, dets, cates, scores):
self.frame_count += 1
det_scores = np.ones((dets.shape[0], 1))
dets = np.concatenate((dets, det_scores), axis=1)
remain_inds = scores > self.det_thresh
cates = cates[remain_inds]
dets = dets[remain_inds]
trks = np.zeros((len(self.trackers), 5))
to_del = []
ret = []
for t, trk in enumerate(trks):
pos = self.trackers[t].predict()[0]
cat = self.trackers[t].cate
trk[:] = [pos[0], pos[1], pos[2], pos[3], cat]
if np.any(np.isnan(pos)):
to_del.append(t)
trks = np.ma.compress_rows(np.ma.masked_invalid(trks))
for t in reversed(to_del):
self.trackers.pop(t)
velocities = np.array([trk.velocity if trk.velocity is not None else np.array((0, 0)) for trk in self.trackers])
last_boxes = np.array([trk.last_observation for trk in self.trackers])
k_observations = np.array([k_previous_obs(trk.observations, trk.age, self.delta_t) for trk in self.trackers])
matched, unmatched_dets, unmatched_trks = associate_kitti(
dets,
trks,
cates,
self.iou_threshold,
velocities,
k_observations,
self.inertia,
)
for m in matched:
self.trackers[m[1]].update(dets[m[0], :])
if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0:
"""
The re-association stage by OCR.
NOTE: at this stage, adding other strategies might continue to improve
the performance, such as the BYTE association from ByteTrack.
"""
left_dets = dets[unmatched_dets]
left_trks = last_boxes[unmatched_trks]
left_dets_c = left_dets.copy()
left_trks_c = left_trks.copy()
iou_left = self.asso_func(left_dets_c, left_trks_c)
iou_left = np.array(iou_left)
det_cates_left = cates[unmatched_dets]
trk_cates_left = trks[unmatched_trks][:, 4]
num_dets = unmatched_dets.shape[0]
num_trks = unmatched_trks.shape[0]
cate_matrix = np.zeros((num_dets, num_trks))
for i in range(num_dets):
for j in range(num_trks):
if det_cates_left[i] != trk_cates_left[j]:
"""
For some datasets, such as KITTI, there are multiple categories,
and we must avoid associating detections and tracks of different classes.
"""
cate_matrix[i][j] = -1e6
iou_left = iou_left + cate_matrix
if iou_left.max() > self.iou_threshold - 0.1:
rematched_indices = linear_assignment(-iou_left)
to_remove_det_indices = []
to_remove_trk_indices = []
for m in rematched_indices:
det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[1]]
if iou_left[m[0], m[1]] < self.iou_threshold - 0.1:
continue
self.trackers[trk_ind].update(dets[det_ind, :])
to_remove_det_indices.append(det_ind)
to_remove_trk_indices.append(trk_ind)
unmatched_dets = np.setdiff1d(unmatched_dets, np.array(to_remove_det_indices))
unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices))
for i in unmatched_dets:
trk = KalmanBoxTracker(dets[i, :], cates[i])  # KalmanBoxTracker requires the class as its second argument
trk.cate = cates[i]
self.trackers.append(trk)
i = len(self.trackers)
for trk in reversed(self.trackers):
if trk.last_observation.sum() > 0:
d = trk.last_observation[:4]
else:
d = trk.get_state()[0]
if trk.time_since_update < 1:
if (self.frame_count <= self.min_hits) or (trk.hit_streak >= self.min_hits):
# id+1 as MOT benchmark requires positive
ret.append(np.concatenate((d, [trk.id + 1], [trk.cls], [trk.conf])).reshape(1, -1))
if trk.hit_streak == self.min_hits:
# Head Padding (HP): recover the lost steps during initializing the track
for prev_i in range(self.min_hits - 1):
prev_observation = trk.history_observations[-(prev_i + 2)]
ret.append(
(
np.concatenate(
(
prev_observation[:4],
[trk.id + 1],
[trk.cls],
[trk.conf],
)
)
).reshape(1, -1)
)
i -= 1
if trk.time_since_update > self.max_age:
self.trackers.pop(i)
if len(ret) > 0:
return np.concatenate(ret)
return np.empty((0, 7))
def dump_cache(self):
self.cmc.dump_cache()
self.embedder.dump_cache()
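A hedged end-to-end sketch of driving the tracker; the weights path, device, and detector loop are assumptions, and update expects dets as an (N, 6) torch tensor of [x1, y1, x2, y2, score, class] (it calls .numpy() internally):

# Sketch: one tracking step per frame.
tracker = OCSort("osnet_x0_25_msmt17.pt", device=torch.device("cuda:0"), fp16=False, det_thresh=0.5)
for frame_bgr, dets in detection_stream:      # hypothetical detector loop
    online = tracker.update(dets, frame_bgr)  # rows: x1, y1, x2, y2, id, cls, conf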


@@ -0,0 +1,237 @@
import torch.nn as nn
import torch
from pathlib import Path
import numpy as np
from itertools import islice
import torchvision.transforms as transforms
import cv2
import sys
import torchvision.transforms as T
from collections import OrderedDict, namedtuple
import gdown
from os.path import exists as file_exists
from yolov8.ultralytics.yolo.utils.checks import check_requirements, check_version
from yolov8.ultralytics.yolo.utils import LOGGER
from trackers.strongsort.deep.reid_model_factory import (show_downloadeable_models, get_model_url, get_model_name,
download_url, load_pretrained_weights)
from trackers.strongsort.deep.models import build_model
def check_suffix(file='yolov5s.pt', suffix=('.pt',), msg=''):
# Check file(s) for acceptable suffix
if file and suffix:
if isinstance(suffix, str):
suffix = [suffix]
for f in file if isinstance(file, (list, tuple)) else [file]:
s = Path(f).suffix.lower() # file suffix
if len(s):
assert s in suffix, f"{msg}{f} acceptable suffix is {suffix}"
class ReIDDetectMultiBackend(nn.Module):
# ReID models MultiBackend class for python inference on various backends
def __init__(self, weights='osnet_x0_25_msmt17.pt', device=torch.device('cpu'), fp16=False):
super().__init__()
w = weights[0] if isinstance(weights, list) else weights
self.pt, self.jit, self.onnx, self.xml, self.engine, self.tflite = self.model_type(w) # get backend
self.fp16 = fp16
self.fp16 &= self.pt or self.jit or self.engine # FP16
# Build transform functions
self.device = device
self.image_size=(256, 128)
self.pixel_mean=[0.485, 0.456, 0.406]
self.pixel_std=[0.229, 0.224, 0.225]
self.transforms = []
self.transforms += [T.Resize(self.image_size)]
self.transforms += [T.ToTensor()]
self.transforms += [T.Normalize(mean=self.pixel_mean, std=self.pixel_std)]
self.preprocess = T.Compose(self.transforms)
self.to_pil = T.ToPILImage()
model_name = get_model_name(w)
if w.suffix == '.pt':
model_url = get_model_url(w)
if not file_exists(w) and model_url is not None:
gdown.download(model_url, str(w), quiet=False)
elif file_exists(w):
pass
else:
print(f'No URL associated to the chosen StrongSORT weights ({w}). Choose between:')
show_downloadeable_models()
exit()
# Build model
self.model = build_model(
model_name,
num_classes=1,
pretrained=not (w and w.is_file()),
use_gpu=device
)
if self.pt: # PyTorch
# populate model arch with weights
if w and w.is_file() and w.suffix == '.pt':
load_pretrained_weights(self.model, w)
self.model.to(device).eval()
self.model.half() if self.fp16 else self.model.float()
elif self.jit:
LOGGER.info(f'Loading {w} for TorchScript inference...')
self.model = torch.jit.load(w)
self.model.half() if self.fp16 else self.model.float()
elif self.onnx: # ONNX Runtime
LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
cuda = torch.cuda.is_available() and device.type != 'cpu'
#check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
import onnxruntime
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
self.session = onnxruntime.InferenceSession(str(w), providers=providers)
elif self.engine: # TensorRT
LOGGER.info(f'Loading {w} for TensorRT inference...')
import tensorrt as trt # https://developer.nvidia.com/nvidia-tensorrt-download
check_version(trt.__version__, '7.0.0', hard=True) # require tensorrt>=7.0.0
if device.type == 'cpu':
device = torch.device('cuda:0')
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
self.model_ = runtime.deserialize_cuda_engine(f.read())
self.context = self.model_.create_execution_context()
self.bindings = OrderedDict()
self.fp16 = False # default updated below
self.dynamic = False  # set True below for engines with dynamic input shapes
for index in range(self.model_.num_bindings):
name = self.model_.get_binding_name(index)
dtype = trt.nptype(self.model_.get_binding_dtype(index))
if self.model_.binding_is_input(index):
if -1 in tuple(self.model_.get_binding_shape(index)):  # dynamic input
self.dynamic = True
self.context.set_binding_shape(index, tuple(self.model_.get_profile_shape(0, index)[2]))
if dtype == np.float16:
self.fp16 = True
shape = tuple(self.context.get_binding_shape(index))
im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
self.bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items())
batch_size = self.bindings['images'].shape[0] # if dynamic, this is instead max batch size
elif self.xml: # OpenVINO
LOGGER.info(f'Loading {w} for OpenVINO inference...')
check_requirements(('openvino',)) # requires openvino-dev: https://pypi.org/project/openvino-dev/
from openvino.runtime import Core, Layout, get_batch
ie = Core()
if not Path(w).is_file(): # if not *.xml
w = next(Path(w).glob('*.xml')) # get *.xml file from *_openvino_model dir
network = ie.read_model(model=w, weights=Path(w).with_suffix('.bin'))
if network.get_parameters()[0].get_layout().empty:
network.get_parameters()[0].set_layout(Layout("NCWH"))
batch_dim = get_batch(network)
if batch_dim.is_static:
batch_size = batch_dim.get_length()
self.executable_network = ie.compile_model(network, device_name="CPU") # device_name="MYRIAD" for Intel NCS2
self.output_layer = next(iter(self.executable_network.outputs))
elif self.tflite:
LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
from tflite_runtime.interpreter import Interpreter, load_delegate
except ImportError:
import tensorflow as tf
Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
self.interpreter = Interpreter(model_path=w)  # use whichever Interpreter was resolved above
self.interpreter.allocate_tensors()
# Get input and output tensors.
self.input_details = self.interpreter.get_input_details()
self.output_details = self.interpreter.get_output_details()
# Test model on random input data.
input_data = np.array(np.random.random_sample((1,256,128,3)), dtype=np.float32)
self.interpreter.set_tensor(self.input_details[0]['index'], input_data)
self.interpreter.invoke()
# The function `get_tensor()` returns a copy of the tensor data.
output_data = self.interpreter.get_tensor(self.output_details[0]['index'])
else:
print('This model framework is not supported yet!')
exit()
@staticmethod
def model_type(p='path/to/model.pt'):
# Return model type from model path, i.e. path='path/to/model.onnx' -> type=onnx
from trackers.reid_export import export_formats
sf = list(export_formats().Suffix) # export suffixes
check_suffix(p, sf) # checks
types = [s in Path(p).name for s in sf]
return types
def _preprocess(self, im_batch):
images = []
for element in im_batch:
image = self.to_pil(element)
image = self.preprocess(image)
images.append(image)
images = torch.stack(images, dim=0)
images = images.to(self.device)
return images
def forward(self, im_batch):
# preprocess batch
im_batch = self._preprocess(im_batch)
# batch to half
if self.fp16 and im_batch.dtype != torch.float16:
im_batch = im_batch.half()
# batch processing
features = []
if self.pt:
features = self.model(im_batch)
elif self.jit: # TorchScript
features = self.model(im_batch)
elif self.onnx: # ONNX Runtime
im_batch = im_batch.cpu().numpy() # torch to numpy
features = self.session.run([self.session.get_outputs()[0].name], {self.session.get_inputs()[0].name: im_batch})[0]
elif self.engine: # TensorRT
if self.dynamic and im_batch.shape != self.bindings['images'].shape:
i_in, i_out = (self.model_.get_binding_index(x) for x in ('images', 'output'))
self.context.set_binding_shape(i_in, im_batch.shape) # reshape if dynamic
self.bindings['images'] = self.bindings['images']._replace(shape=im_batch.shape)
self.bindings['output'].data.resize_(tuple(self.context.get_binding_shape(i_out)))
s = self.bindings['images'].shape
assert im_batch.shape == s, f"input size {im_batch.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
self.binding_addrs['images'] = int(im_batch.data_ptr())
self.context.execute_v2(list(self.binding_addrs.values()))
features = self.bindings['output'].data
elif self.xml: # OpenVINO
im_batch = im_batch.cpu().numpy() # FP32
features = self.executable_network([im_batch])[self.output_layer]
else:
print('Framework not supported at the moment, we are working on it...')
exit()
if isinstance(features, (list, tuple)):
return self.from_numpy(features[0]) if len(features) == 1 else [self.from_numpy(x) for x in features]
else:
return self.from_numpy(features)
def from_numpy(self, x):
return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x
def warmup(self, imgsz=[(256, 128, 3)]):
# Warmup model by running inference once
warmup_types = self.pt, self.jit, self.onnx, self.engine, self.tflite
if any(warmup_types) and self.device.type != 'cpu':
im = [np.empty(*imgsz).astype(np.uint8)] # input
for _ in range(2 if self.jit else 1): #
self.forward(im) # warmup
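Finally, a usage sketch for the multi-backend extractor; the weights file is an assumption, and weights must be a Path since the loader inspects its suffix. Inputs are BGR crops as uint8 numpy arrays:

# Sketch: embed a batch of person crops with the backend picked from the file suffix.
reid = ReIDDetectMultiBackend(weights=Path('osnet_x0_25_msmt17.pt'), device=torch.device('cuda:0'), fp16=True)
reid.warmup()
feats = reid(crops)  # crops: list of HxWx3 uint8 arrays -> (N, D) feature tensor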