Find frames per track #65

Open
wants to merge 14 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -75,7 +75,7 @@ You may use [YOLO](https://docs.ultralytics.com/) to automatically perform detec
Detect objects with Ultralytics YOLO detections, apply SORT tracking and convert tracks to CVAT format.

```
detector2cvat --video path_to_videos --save path_to_save
detector2cvat --video path_to_videos --save path_to_save [--imshow]
```


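The new `[--imshow]` flag surfaces the `show` parameter this PR adds to `detector2cvat`. A minimal programmatic sketch of the same call, with hypothetical paths:

```python
# Programmatic equivalent of: detector2cvat --video videos/ --save out/
# Paths are hypothetical; the signature matches this PR's diff.
from kabr_tools.detector2cvat import detector2cvat

# show=False keeps the run headless; the CLI sets show=True only
# when --imshow is passed.
detector2cvat(path_to_videos="videos/", path_to_save="out/", show=False)
```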
10 changes: 5 additions & 5 deletions src/kabr_tools/cvat2slowfast.py
@@ -1,5 +1,6 @@
import os
import sys
from typing import Optional
import argparse
import json
from lxml import etree
@@ -9,9 +10,8 @@
import cv2


def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2new):
number2label = {value: key for key, value in label2number.items()}

def cvat2slowfast(path_to_mini_scenes: str, path_to_new_dataset: str,
label2number: dict, old2new: Optional[dict]) -> None:
if not os.path.exists(path_to_new_dataset):
os.makedirs(path_to_new_dataset)

@@ -143,7 +143,7 @@ def cvat2slowfast(path_to_mini_scenes, path_to_new_dataset, label2number, old2ne
f"{path_to_new_dataset}/annotation/data.csv", sep=" ", index=False)


def parse_args():
def parse_args() -> argparse.Namespace:
local_parser = argparse.ArgumentParser()
local_parser.add_argument(
'--miniscene',
@@ -172,7 +172,7 @@ def parse_args():
return local_parser.parse_args()


def main():
def main() -> None:
args = parse_args()

with open(args.classes, mode='r', encoding='utf-8') as file:
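The `cvat2slowfast` signature now types `label2number` as a required dict and `old2new` as `Optional[dict]`. A minimal calling sketch; the label maps are hypothetical placeholders, not the project's real ethogram:

```python
from kabr_tools.cvat2slowfast import cvat2slowfast

# Hypothetical behavior label -> class id mapping.
label2number = {"Graze": 0, "Walk": 1, "Head Up": 2}
# Hypothetical legacy -> current label remapping; presumably
# None skips the remapping step entirely.
old2new = {"Grazing": "Graze"}

cvat2slowfast(
    path_to_mini_scenes="mini_scenes/",
    path_to_new_dataset="slowfast_dataset/",
    label2number=label2number,
    old2new=old2new,
)
```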
9 changes: 6 additions & 3 deletions src/kabr_tools/cvat2ultralytics.py
@@ -1,4 +1,5 @@
import os
from typing import Optional
import argparse
import json
import cv2
@@ -10,7 +11,9 @@
from natsort import natsorted


def cvat2ultralytics(video_path, annotation_path, dataset, skip, label2index=None):
def cvat2ultralytics(video_path: str, annotation_path: str,
dataset: str, skip: int,
label2index: Optional[dict] = None) -> None:
# Create a YOLO dataset structure.
dataset_file = f"""
path: {dataset}
@@ -169,7 +172,7 @@ def cvat2ultralytics(video_path, annotation_path, dataset, skip, label2index=Non
shutil.move(f"{dataset}/labels/train/{file}", f"{dataset}/labels/test/{file}")


def parse_args():
def parse_args() -> argparse.Namespace:
local_parser = argparse.ArgumentParser()
local_parser.add_argument(
'--video',
@@ -204,7 +207,7 @@ def parse_args():
return local_parser.parse_args()


def main():
def main() -> None:
args = parse_args()

if args.label2index:
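`cvat2ultralytics` likewise gains type hints, with `label2index` made explicitly `Optional[dict]` defaulting to `None`. A sketch of a call that leaves it at the default; the paths and skip value are hypothetical:

```python
from kabr_tools.cvat2ultralytics import cvat2ultralytics

cvat2ultralytics(
    video_path="videos/",
    annotation_path="annotations/",
    dataset="yolo_dataset",
    skip=10,            # hypothetical frame-subsampling step
    label2index=None,   # fall back to the tool's default label mapping
)
```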
26 changes: 16 additions & 10 deletions src/kabr_tools/detector2cvat.py
@@ -8,8 +8,7 @@
from kabr_tools.utils.draw import Draw



def detector2cvat(path_to_videos, path_to_save):
def detector2cvat(path_to_videos: str, path_to_save: str, show: bool) -> None:
videos = []

for root, dirs, files in os.walk(path_to_videos):
@@ -77,7 +76,9 @@ def detector2cvat(path_to_videos, path_to_save):

cv2.putText(visualization, f"Frame: {index}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX,
0.8, (255, 255, 255), 3, cv2.LINE_AA)
cv2.imshow("detector2cvat", cv2.resize(visualization, (int(width // 2.5), int(height // 2.5))))
if show:
cv2.imshow("detector2cvat", cv2.resize(
visualization, (int(width // 2.5), int(height // 2.5))))
vw.write(visualization)
key = cv2.waitKey(1)
index += 1
@@ -97,26 +98,31 @@ def detector2cvat(path_to_videos, path_to_save):
print("Something went wrong...")


def parse_args():
def parse_args() -> argparse.Namespace:
local_parser = argparse.ArgumentParser()
local_parser.add_argument(
'--video',
"--video",
type=str,
help='path to folder containing videos',
help="path to folder containing videos",
required=True
)
local_parser.add_argument(
'--save',
"--save",
type=str,
help='path to save output xml & mp4 files',
help="path to save output xml & mp4 files",
required=True
)
local_parser.add_argument(
"--imshow",
action="store_true",
help="flag to display detector's visualization"
)
return local_parser.parse_args()


def main():
def main() -> None:
args = parse_args()
detector2cvat(args.video, args.save)
detector2cvat(args.video, args.save, args.imshow)


if __name__ == "__main__":
101 changes: 58 additions & 43 deletions src/kabr_tools/miniscene2behavior.py
@@ -1,28 +1,34 @@
import sys
import argparse
import torch
from lxml import etree
import pandas as pd
import cv2
import argparse
from tqdm import tqdm
import slowfast.utils.checkpoint as cu
import slowfast.models.build as build
import slowfast.utils.parser as parser
from slowfast.models import build
from slowfast.utils import parser
from slowfast.datasets.utils import get_sequence
from slowfast.visualization.utils import process_cv2_inputs
from slowfast.datasets.cv2_transform import scale
from fvcore.common.config import CfgNode
from torch import Tensor


def get_input_clip(cap, cfg, keyframe_idx):
def get_input_clip(cap: cv2.VideoCapture, cfg: CfgNode, keyframe_idx: int) -> list[Tensor]:
# https://github.com/facebookresearch/SlowFast/blob/bac7b672f40d44166a84e8c51d1a5ba367ace816/slowfast/visualization/ava_demo_precomputed_boxes.py
seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
assert keyframe_idx < total_frames, f"keyframe_idx: {keyframe_idx} " \
f">= total_frames: {total_frames}"
seq = get_sequence(
keyframe_idx,
seq_length // 2,
cfg.DATA.SAMPLING_RATE,
total_frames,
)
# TODO: remove after debugging
print(keyframe_idx, seq[0], seq[-1], total_frames)
clip = []
for frame_idx in seq:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
@@ -32,67 +38,67 @@ def get_input_clip(cap, cfg, keyframe_idx):
frame = scale(cfg.DATA.TEST_CROP_SIZE, frame)
clip.append(frame)
else:
print('Unable to read frame. Duplicating previous frame.')
print("Unable to read frame. Duplicating previous frame.")
clip.append(clip[-1])

clip = process_cv2_inputs(clip, cfg)
return clip
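`get_input_clip` relies on SlowFast's `get_sequence` to pick `NUM_FRAMES * SAMPLING_RATE` candidate indices centered on the keyframe. A self-contained sketch of the expected clamping behavior, not the library's actual code:

```python
def clip_window(center_idx: int, half_len: int, sample_rate: int,
                num_frames: int) -> list[int]:
    # Indices from center_idx - half_len to center_idx + half_len,
    # stepped by sample_rate, each clamped to [0, num_frames - 1] so
    # clips near either end of the video repeat the boundary frame.
    seq = range(center_idx - half_len, center_idx + half_len, sample_rate)
    return [min(max(idx, 0), num_frames - 1) for idx in seq]

# With NUM_FRAMES=16 and SAMPLING_RATE=2, seq_length is 32, half_len 16:
print(clip_window(5, 16, 2, 100))  # leading negatives clamp to frame 0
```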


def parse_args():
def parse_args() -> argparse.Namespace:
local_parser = argparse.ArgumentParser()
local_parser.add_argument(
'--config',
"--config",
type=str,
help='model config.yml filepath',
default='config.yml'
help="model config.yml filepath",
default="config.yml"
)
local_parser.add_argument(
'--checkpoint',
"--checkpoint",
type=str,
help='model checkpoint.pyth filepath',
help="model checkpoint.pyth filepath",
required=True
)
local_parser.add_argument(
'--gpu_num',
"--gpu_num",
type=int,
help='number of gpus',
help="number of gpus",
default=0
)
local_parser.add_argument(
'--miniscene',
"--miniscene",
type=str,
help='miniscene folder containing miniscene\'s tracks.xml & *.mp4',
help="miniscene folder containing miniscene\'s tracks.xml & *.mp4",
required=True
)
local_parser.add_argument(
'--video',
"--video",
type=str,
help='name of video (expect video_tracks.xml from tracks_extractor)',
help="name of video (expect video_tracks.xml from tracks_extractor)",
required=True
)
local_parser.add_argument(
'--output',
"--output",
type=str,
help='filepath for output csv',
default='annotation_data.csv'
help="filepath for output csv",
default="annotation_data.csv"
)

return local_parser.parse_args()


def create_model(config_path, checkpoint_path, gpu_num):
def create_model(config_path: str, checkpoint_path: str, gpu_num: int) -> tuple[CfgNode, torch.nn.Module]:
# load model config
try:
cfg = parser.load_config(parser.parse_args(), config_path)
except FileNotFoundError:
checkpoint = torch.load(
checkpoint_path, map_location=torch.device('cpu'))
with open(config_path, 'w') as file:
file.write(checkpoint['cfg'])
checkpoint_path, map_location=torch.device("cpu"))
with open(config_path, "w") as file:
file.write(checkpoint["cfg"])
cfg = parser.load_config(parser.parse_args(), config_path)
cfg.NUM_GPUS = gpu_num
cfg.OUTPUT_DIR = ''
cfg.OUTPUT_DIR = ""
model = build.build_model(cfg)

# load model checkpoint
@@ -103,34 +109,42 @@ def create_model(config_path, checkpoint_path, gpu_num):
return cfg, model


def annotate_miniscene(cfg, model, miniscene_path, video, output_path):
def annotate_miniscene(cfg: CfgNode, model: torch.nn.Module,
miniscene_path: str, video: str,
output_path: str) -> None:
label_data = []
track_file = f'{miniscene_path}/metadata/{video}_tracks.xml'
track_file = f"{miniscene_path}/metadata/{video}_tracks.xml"
root = etree.parse(track_file).getroot()

# find all tracks
tracks = []
frames = {}
for track in root.iterfind("track"):
track_id = track.attrib["id"]
tracks.append(track_id)
frames[track_id] = []

# find all frames
frames = []
for box in track.iterfind("box"):
frames.append(int(box.attrib['frame']))
# find all frames
for box in track.iterfind("box"):
frames[track_id].append(int(box.attrib["frame"]))

# run model on miniscene
for track in tracks:
video_file = f"{miniscene_path}/{track}.mp4"
cap = cv2.VideoCapture(video_file)
for frame in tqdm(frames, desc=f'{track} frames'):
inputs = get_input_clip(cap, cfg, frame)
print(f'{track=}')
for index, frame in tqdm(enumerate(frames[track]), desc=f'{track} frames'):
try:
inputs = get_input_clip(cap, cfg, index)
except AssertionError as e:
print(e)
break

if cfg.NUM_GPUS:
# transfer the data to the current GPU device.
if isinstance(inputs, (list,)):
for i in range(len(inputs)):
inputs[i] = inputs[i].cuda(non_blocking=True)
for i, input_clip in enumerate(inputs):
inputs[i] = input_clip.cuda(non_blocking=True)
else:
inputs = inputs.cuda(non_blocking=True)

@@ -140,17 +154,18 @@ def annotate_miniscene(cfg, model, miniscene_path, video, output_path):
if cfg.NUM_GPUS:
preds = preds.cpu()

label_data.append({'video': video,
'track': track,
'frame': frame,
'label': torch.argmax(preds).item()})
label_data.append({"video": video,
"track": track,
"frame": frame,
"label": torch.argmax(preds).item()})
if frame % 20 == 0:
pd.DataFrame(label_data).to_csv(
output_path, sep=' ', index=False)
pd.DataFrame(label_data).to_csv(output_path, sep=' ', index=False)
output_path, sep=" ", index=False)
cap.release()
pd.DataFrame(label_data).to_csv(output_path, sep=" ", index=False)


def main():
def main() -> None:
# clear arguments to avoid slowfast parsing issues
args = parse_args()
sys.argv = [sys.argv[0]]
@@ -159,5 +174,5 @@ def main():
args.video, args.output)


if __name__ == '__main__':
if __name__ == "__main__":
main()
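In sum, the PR's core change: box frames from `tracks.xml` are now grouped per track, and `get_input_clip` receives the track-local index from `enumerate` rather than the global frame number, presumably because each mini-scene mp4 is re-indexed from frame 0. A condensed sketch of that grouping, mirroring `annotate_miniscene` above:

```python
from lxml import etree

def frames_per_track(track_file: str) -> dict[str, list[int]]:
    # One list of global frame numbers per track id.
    root = etree.parse(track_file).getroot()
    frames: dict[str, list[int]] = {}
    for track in root.iterfind("track"):
        track_id = track.attrib["id"]
        frames[track_id] = [int(box.attrib["frame"])
                            for box in track.iterfind("box")]
    return frames

# frames[tid][i] is the source-video frame shown at local index i of
# {tid}.mp4, so the model reads the clip at index i while the output
# CSV row still records the original frame number.
```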