diff --git a/scripts/run_on_single_image.py b/scripts/run_on_single_image.py
index 058ccd593e0fd73f9aa677c397fa03bce8b50be3..d3197069205557b14efcaf4aec8342b7439eb8eb 100644
--- a/scripts/run_on_single_image.py
+++ b/scripts/run_on_single_image.py
@@ -16,6 +16,7 @@ from sparse_coding_torch.mobile_model import NetTensorFlowWrapper
 import time
 import csv
 from datetime import datetime
+from yolov4.get_bounding_boxes import YoloModel
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -64,6 +65,8 @@ if __name__ == "__main__":
     checkpoint = torch.load(args.checkpoint, map_location=device)
     predictive_model.load_state_dict(checkpoint['model_state_dict'])
 
+    yolo_model = YoloModel()
+
     transform = torchvision.transforms.Compose(
     [VideoGrayScaler(),
      MinMaxScaler(0, 255),
@@ -85,24 +88,46 @@ if __name__ == "__main__":
         vc = VideoClips([os.path.join(args.input_directory, f)],
                         clip_length_in_frames=5,
                         frame_rate=20,
-                        frames_between_clips=1)
+                        frames_between_clips=5)
 
         clip_predictions = []
         for i in range(vc.num_clips()):
             clip, _, _, _ = vc.get_clip(i)
-            clip = clip.swapaxes(1, 3).swapaxes(0, 1).swapaxes(2, 3).to(torch.float)
-            clip = transform(clip)
+            clip = clip.swapaxes(1, 3).swapaxes(0, 1).swapaxes(2, 3).numpy()
+
+            bounding_boxes = yolo_model.get_bounding_boxes(clip[:, 2, :, :].swapaxes(0, 2).swapaxes(0, 1)).squeeze(0)
+            if bounding_boxes.size == 0:
+                continue
+            for bb in bounding_boxes:
+                center_x = bb[0] * 1920
+                center_y = bb[1] * 1080
+
+                # width = region['relative_coordinates']['width'] * 1920
+                # height = region['relative_coordinates']['height'] * 1080
+                width=400
+                height=400
+
+                lower_y = round(center_y - height / 2)
+                upper_y = round(center_y + height / 2)
+                lower_x = round(center_x - width / 2)
+                upper_x = round(center_x + width / 2)
+
+                trimmed_clip = clip[:, :, lower_y:upper_y, lower_x:upper_x]
+
+                trimmed_clip = torch.tensor(trimmed_clip).to(torch.float)
+
+                trimmed_clip = transform(trimmed_clip)
 
-            with torch.no_grad():
-                clip = clip.unsqueeze(0).to(device)
-                start_sparse_time = time.time()
-                activations = frozen_sparse(clip)
-                end_sparse_time = time.time()
+                with torch.no_grad():
+                    trimmed_clip = trimmed_clip.unsqueeze(0).to(device)
+                    start_sparse_time = time.time()
+                    activations = frozen_sparse(trimmed_clip)
+                    end_sparse_time = time.time()
 
-                # Note that you can get activations here
-                pred, activations = predictive_model(activations)
+                    # Note that you can get activations here
+                    pred, activations = predictive_model(activations)
 
-            clip_predictions.append(torch.nn.Sigmoid()(pred).round().detach().cpu().flatten().to(torch.long))
+                    clip_predictions.append(torch.nn.Sigmoid()(pred).round().detach().cpu().flatten().to(torch.long))
 
         final_pred = torch.mode(torch.tensor(clip_predictions))[0].item()
 
diff --git a/yolov4 b/yolov4
new file mode 160000
index 0000000000000000000000000000000000000000..9f16748aa3f45ff240608da4bd9b1216a29127f5
--- /dev/null
+++ b/yolov4
@@ -0,0 +1 @@
+Subproject commit 9f16748aa3f45ff240608da4bd9b1216a29127f5
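For reference, below is a minimal sketch (not part of the diff) of the crop computation the new per-box loop performs. The diff hard-codes a 1920x1080 frame and does not clamp the window, so a detection near the frame edge can yield a slice that falls partly outside the frame; the helper name, the explicit frame-size parameters, and the clamping here are illustrative additions, not the PR's code.

```python
import numpy as np


def crop_window_from_yolo_center(cx_norm, cy_norm, frame_w, frame_h,
                                 crop_w=400, crop_h=400):
    """Convert a normalized YOLO box center into a fixed-size pixel window.

    Returns (lower_y, upper_y, lower_x, upper_x) suitable for indexing a
    clip laid out as (channels, frames, height, width). Hypothetical helper,
    shown only to make the crop arithmetic in the diff explicit.
    """
    # Scale the normalized center into pixel coordinates.
    center_x = cx_norm * frame_w
    center_y = cy_norm * frame_h

    # Same center-minus-half-size rounding as the diff.
    lower_x = round(center_x - crop_w / 2)
    lower_y = round(center_y - crop_h / 2)

    # Clamp so the window stays inside the frame; a negative lower bound
    # would otherwise wrap around when used as a numpy/torch slice.
    lower_x = min(max(lower_x, 0), frame_w - crop_w)
    lower_y = min(max(lower_y, 0), frame_h - crop_h)

    return lower_y, lower_y + crop_h, lower_x, lower_x + crop_w


# Example: a detection centered near the left edge of a 1920x1080 frame.
clip = np.zeros((3, 5, 1080, 1920), dtype=np.uint8)  # (C, T, H, W)
lower_y, upper_y, lower_x, upper_x = crop_window_from_yolo_center(0.05, 0.5, 1920, 1080)
trimmed = clip[:, :, lower_y:upper_y, lower_x:upper_x]
assert trimmed.shape[-2:] == (400, 400)
```

With clamping, every trimmed clip has the full 400x400 spatial extent the downstream transform and sparse-coding model expect, even for boxes whose centers sit within 200 px of a frame border.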