diff --git a/scripts/run_on_single_image.py b/scripts/run_on_single_image.py
index 058ccd593e0fd73f9aa677c397fa03bce8b50be3..d3197069205557b14efcaf4aec8342b7439eb8eb 100644
--- a/scripts/run_on_single_image.py
+++ b/scripts/run_on_single_image.py
@@ -16,6 +16,7 @@ from sparse_coding_torch.mobile_model import NetTensorFlowWrapper
 import time
 import csv
 from datetime import datetime
+from yolov4.get_bounding_boxes import YoloModel
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -64,6 +65,9 @@ if __name__ == "__main__":
         checkpoint = torch.load(args.checkpoint, map_location=device)
         predictive_model.load_state_dict(checkpoint['model_state_dict'])
         
+    # YOLOv4 detector used to localize the region of interest in each clip
+    yolo_model = YoloModel()
+
     transform = torchvision.transforms.Compose(
     [VideoGrayScaler(),
      MinMaxScaler(0, 255),
@@ -85,24 +88,51 @@
         vc = VideoClips([os.path.join(args.input_directory, f)],
                         clip_length_in_frames=5,
                         frame_rate=20,
-                       frames_between_clips=1)
+                       frames_between_clips=5)
     
         clip_predictions = []
         for i in range(vc.num_clips()):
             clip, _, _, _ = vc.get_clip(i)
-            clip = clip.swapaxes(1, 3).swapaxes(0, 1).swapaxes(2, 3).to(torch.float)
-            clip = transform(clip)
+            clip = clip.swapaxes(1, 3).swapaxes(0, 1).swapaxes(2, 3).numpy()
+            
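+            # Run YOLO on the middle frame of the clip (index 2), rearranged from (C, H, W) to (H, W, C)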
+            bounding_boxes = yolo_model.get_bounding_boxes(clip[:, 2, :, :].swapaxes(0, 2).swapaxes(0, 1)).squeeze(0)
+            if bounding_boxes.size == 0:
+                continue
+            for bb in bounding_boxes:
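+                # YOLO returns normalized box centers; scale them to the frame resolution (assumed to be 1920x1080)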
+                center_x = bb[0] * 1920
+                center_y = bb[1] * 1080
+
+                # Use a fixed 400 x 400 pixel crop around the detection center
+                # rather than the width/height predicted by YOLO.
+                width = 400
+                height = 400
+
+                # Clamp the lower bounds to 0 so negative values do not wrap around when slicing the numpy array
+                lower_y = max(0, round(center_y - height / 2))
+                upper_y = round(center_y + height / 2)
+                lower_x = max(0, round(center_x - width / 2))
+                upper_x = round(center_x + width / 2)
+
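+                # Crop the fixed-size window around the detection from every frame of the clip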
+                trimmed_clip = clip[:, :, lower_y:upper_y, lower_x:upper_x]
+
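+                # Convert the cropped region back to a float tensor and apply the preprocessing transforms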
+                trimmed_clip = torch.tensor(trimmed_clip).to(torch.float)
+
+                trimmed_clip = transform(trimmed_clip)
 
-            with torch.no_grad():
-                clip = clip.unsqueeze(0).to(device)
-                start_sparse_time = time.time()
-                activations = frozen_sparse(clip)
-                end_sparse_time = time.time()
+                with torch.no_grad():
+                    trimmed_clip = trimmed_clip.unsqueeze(0).to(device)
+                    start_sparse_time = time.time()
+                    activations = frozen_sparse(trimmed_clip)
+                    end_sparse_time = time.time()
 
-                # Note that you can get activations here
-                pred, activations = predictive_model(activations)
+                    # Note that you can get activations here
+                    pred, activations = predictive_model(activations)
 
-                clip_predictions.append(torch.nn.Sigmoid()(pred).round().detach().cpu().flatten().to(torch.long))
+                    clip_predictions.append(torch.nn.Sigmoid()(pred).round().detach().cpu().flatten().to(torch.long))
 
                    
         final_pred = torch.mode(torch.tensor(clip_predictions))[0].item()
diff --git a/yolov4 b/yolov4
new file mode 160000
index 0000000000000000000000000000000000000000..9f16748aa3f45ff240608da4bd9b1216a29127f5
--- /dev/null
+++ b/yolov4
@@ -0,0 +1 @@
+Subproject commit 9f16748aa3f45ff240608da4bd9b1216a29127f5