diff --git a/clips_to_test_swift/6.mp4 b/clips_to_test_swift/6.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..ef9814bc63de5bddf3976c7a27119d5673012940
Binary files /dev/null and b/clips_to_test_swift/6.mp4 differ
diff --git a/keras/generate_tflite.py b/keras/generate_tflite.py
index 3dd871a95cc24a0d310393e502829c642600cfd1..7baffc0b6091bc3a9abb928cc1ab32a65196591b 100644
--- a/keras/generate_tflite.py
+++ b/keras/generate_tflite.py
@@ -8,49 +8,56 @@ import torch
 import torch.nn as nn
 from sparse_coding_torch.video_loader import VideoGrayScaler, MinMaxScaler
 from sparse_coding_torch.conv_sparse_model import ConvSparseLayer
+from sparse_coding_torch.small_data_classifier import SmallDataClassifierConv3d
 from keras_model import MobileModel
 
-inputs = keras.Input(shape=(100, 200, 5))
+inputs = keras.Input(shape=(5, 100, 200, 3))
 
-outputs = MobileModel(sparse_checkpoint='../sparse.pt', batch_size=1, in_channels=1, out_channels=64, kernel_size=15, stride=1, lam=0.05, activation_lr=1e-2, max_activation_iter=40, run_2d=True)(inputs)
+outputs = MobileModel(sparse_checkpoint='../sparse.pt', batch_size=1, in_channels=1, out_channels=64, kernel_size=15, stride=2, lam=0.05, activation_lr=1e-1, max_activation_iter=100, run_2d=True)(inputs)
+# outputs = tf.math.add(inputs, 1)
 
 model = keras.Model(inputs=inputs, outputs=outputs)
 
-pytorch_checkpoint = torch.load('../output/final_model_75_iter/model-best_fold_0.pt', map_location='cpu')['model_state_dict']
-conv_weights = [pytorch_checkpoint['module.compress_activations_conv_1.weight'].view(8, 8, 64, 24).numpy(), pytorch_checkpoint['module.compress_activations_conv_1.bias'].numpy()]
+pytorch_checkpoint = torch.load('../stride_2_100_iter.pt', map_location='cpu')['model_state_dict']
+conv_weights = [pytorch_checkpoint['module.compress_activations_conv_1.weight'].squeeze(2).swapaxes(0, 2).swapaxes(1, 3).swapaxes(2, 3).numpy(), pytorch_checkpoint['module.compress_activations_conv_1.bias'].numpy()]
 model.get_layer('mobile_model').classifier.conv.set_weights(conv_weights)
-
-ff_1_weights = [pytorch_checkpoint['module.fc1.weight'].permute(1,0).numpy(), pytorch_checkpoint['module.fc1.bias'].numpy()]
-model.get_layer('mobile_model').classifier.ff_1.set_weights(ff_1_weights)
-ff_2_weights = [pytorch_checkpoint['module.fc2.weight'].permute(1,0).numpy(), pytorch_checkpoint['module.fc2.bias'].numpy()]
-model.get_layer('mobile_model').classifier.ff_2.set_weights(ff_2_weights)
-ff_3_weights = [pytorch_checkpoint['module.fc3.weight'].permute(1,0).numpy(), pytorch_checkpoint['module.fc3.bias'].numpy()]
+# # ff_1_weights = [pytorch_checkpoint['module.fc1.weight'].swapaxes(1,0).numpy(), pytorch_checkpoint['module.fc1.bias'].numpy()]
+# # model.get_layer('mobile_model').classifier.ff_1.set_weights(ff_1_weights)
+# # ff_2_weights = [pytorch_checkpoint['module.fc2.weight'].swapaxes(1,0).numpy(), pytorch_checkpoint['module.fc2.bias'].numpy()]
+# # model.get_layer('mobile_model').classifier.ff_2.set_weights(ff_2_weights)
+ff_3_weights = [pytorch_checkpoint['module.fc3.weight'].swapaxes(1,0).numpy(), pytorch_checkpoint['module.fc3.bias'].numpy()]
 model.get_layer('mobile_model').classifier.ff_3.set_weights(ff_3_weights)
-ff_4_weights = [pytorch_checkpoint['module.fc4.weight'].permute(1,0).numpy(), pytorch_checkpoint['module.fc4.bias'].numpy()]
+ff_4_weights = [pytorch_checkpoint['module.fc4.weight'].swapaxes(1,0).numpy(), pytorch_checkpoint['module.fc4.bias'].numpy()]
 model.get_layer('mobile_model').classifier.ff_4.set_weights(ff_4_weights)
 
 # frozen_sparse = ConvSparseLayer(in_channels=1,
 # out_channels=64,
 # kernel_size=(5, 15, 15),
-# stride=1,
-# padding=(0, 7, 7),
+# stride=2,
+# padding=0,
 # convo_dim=3,
 # rectifier=True,
 # lam=0.05,
-# max_activation_iter=10,
-# activation_lr=1e-2)
-
+# max_activation_iter=100,
+# activation_lr=1e-1)
+#
 # sparse_param = torch.load('../sparse.pt', map_location='cpu')
 # frozen_sparse.load_state_dict(sparse_param['model_state_dict'])
 #
-# # pytorch_filter = frozen_sparse.filters[30, :, 0, :, :].squeeze(0).unsqueeze(2).detach().numpy()
-# # keras_filter = model.get_layer('sparse_code').filter[0,:,:,:,30].numpy()
-# #
-# # cv2.imwrite('pytorch_filter.png', pytorch_filter / np.max(pytorch_filter) * 255.)
-# # cv2.imwrite('keras_filter.png', keras_filter / np.max(keras_filter) * 255.)
-# # raise Exception
+# predictive_model = SmallDataClassifierConv3d()
+# classifier_param = {k.replace('module.', ''): v for k,v in torch.load('../stride_2_100_iter.pt', map_location='cpu')['model_state_dict'].items()}
+# predictive_model.load_state_dict(classifier_param)
 #
+# predictive_model.eval()
+# #
+# # # pytorch_filter = frozen_sparse.filters[30, :, 0, :, :].squeeze(0).unsqueeze(2).detach().numpy()
+# # # keras_filter = model.get_layer('sparse_code').filter[0,:,:,:,30].numpy()
+# # #
+# # # cv2.imwrite('pytorch_filter.png', pytorch_filter / np.max(pytorch_filter) * 255.)
+# # # cv2.imwrite('keras_filter.png', keras_filter / np.max(keras_filter) * 255.)
+# # # raise Exception
+# #
 # img = tv.io.read_video('../clips/No_Sliding/Image_262499828648_clean1050.mp4')[0].permute(3, 0, 1, 2)
 # transform = tv.transforms.Compose(
 # [VideoGrayScaler(),
@@ -59,12 +66,13 @@ model.get_layer('mobile_model').classifier.ff_4.set_weights(ff_4_weights)
 # tv.transforms.CenterCrop((100, 200))
 # ])
 # img = transform(img)
-
+#
 # with torch.no_grad():
-# activations = frozen_sparse(img.unsqueeze(0))
-
+# activations, _ = predictive_model(frozen_sparse(img.unsqueeze(0)).squeeze(2))
+# activations = torch.nn.Sigmoid()(activations)
+#
 # output = model(img.swapaxes(1, 3).swapaxes(1,2).numpy())
-
+#
 # print(activations.size())
 # print(output.shape)
 # print(torch.sum(activations))
@@ -72,7 +80,7 @@ model.get_layer('mobile_model').classifier.ff_4.set_weights(ff_4_weights)
 
 input_name = model.input_names[0]
 index = model.input_names.index(input_name)
-model.inputs[index].set_shape([1, 100, 200, 5])
+model.inputs[index].set_shape([1, 5, 100, 200, 3])
 
 converter = tf.lite.TFLiteConverter.from_keras_model(model)
 # converter.experimental_new_converter = True
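Note on the weight porting in generate_tflite.py above: Keras stores `Dense` kernels as `(in_features, out_features)` where PyTorch's `nn.Linear` uses `(out_features, in_features)`, and Keras `Conv2D` expects `(kH, kW, in_channels, out_channels)` where PyTorch keeps channels first, which is why every checkpoint tensor is transposed/`swapaxes`'d before `set_weights`. A minimal standalone sketch of the same mapping, with made-up layer sizes rather than the repo's:

```python
import tensorflow as tf
import torch
from tensorflow import keras

# Hypothetical PyTorch layers standing in for the checkpoint tensors.
pt_linear = torch.nn.Linear(in_features=20, out_features=1)
pt_conv = torch.nn.Conv2d(in_channels=64, out_channels=24, kernel_size=8, stride=4)

# Matching Keras layers, called once so they build and set_weights can check shapes.
k_dense = keras.layers.Dense(1)
k_dense(tf.zeros((1, 20)))
k_conv = keras.layers.Conv2D(24, kernel_size=8, strides=4)
k_conv(tf.zeros((1, 100, 200, 64)))

# Dense: (out, in) -> (in, out); Conv2D: (out_c, in_c, kH, kW) -> (kH, kW, in_c, out_c).
k_dense.set_weights([pt_linear.weight.detach().numpy().T,
                     pt_linear.bias.detach().numpy()])
k_conv.set_weights([pt_conv.weight.detach().numpy().transpose(2, 3, 1, 0),
                    pt_conv.bias.detach().numpy()])
```

The `compress_activations_conv_1` weight in the hunk above additionally needs the `.squeeze(2)`, presumably because it comes from a 3-D convolution with a singleton depth dimension.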
diff --git a/keras/keras_model.py b/keras/keras_model.py
index 530282d40f25a3b0486fb7d685013349ae0cb296..b90848921963687c75fe949e17c6ef3dfa9d2649 100644
--- a/keras/keras_model.py
+++ b/keras/keras_model.py
@@ -12,7 +12,7 @@ from sparse_coding_torch.conv_sparse_model import ConvSparseLayer
 def load_pytorch_weights(file_path):
     pytorch_checkpoint = torch.load(file_path, map_location='cpu')
     weight_tensor = pytorch_checkpoint['model_state_dict']['filters'].swapaxes(1,3).swapaxes(2,4).swapaxes(0,4).numpy()
-    
+
     return weight_tensor
 
 # @tf.function
@@ -40,7 +40,7 @@ def conv_error(filters, e, stride):
 
     return g
 
-# @tf.function
+@tf.function
 def conv_error_3d(filters, e, stride):
     # e = tf.pad(e, paddings=[[0,0], [0, 0], [7, 7], [7, 7], [0, 0]])
     g = tf.nn.conv3d(e, filters, strides=[1, 1, stride, stride, 1], padding='VALID')
@@ -52,27 +52,27 @@ def normalize_weights(filters, out_channels):
     #print('filters shape', tf.shape(filters))
     norms = tf.norm(tf.reshape(tf.transpose(tf.stack(filters), perm=[4, 0, 1, 2, 3]), (out_channels, -1)), axis=1)
     norms = tf.broadcast_to(tf.math.maximum(norms, 1e-12*tf.ones_like(norms)), filters[0].shape)
-    
+
     adjusted = [f / norms for f in filters]
-    
+
     #raise Exception('Beep')
-    
+
     return adjusted
 
-# @tf.function
+@tf.function
 def normalize_weights_3d(filters, out_channels):
     #for f in filters:
     #    print('filters 3d shape', f.shape)
     norms = tf.norm(tf.reshape(tf.transpose(filters[0], perm=[4, 0, 1, 2, 3]), (out_channels, -1)), axis=1)
     # tf.print("norms", norms.shape, norms)
     norms = tf.broadcast_to(tf.math.maximum(norms, 1e-12*tf.ones_like(norms)), filters[0].shape)
-    
+
     adjusted = [f / norms for f in filters]
     #for i in range(out_channels):
     #    tf.print("after normalization", tf.norm(adjusted[0][:,:,:,0,i]))
     #print()
-    
+
     #raise Exception('Beep')
     return adjusted
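For reference, `normalize_weights_3d` above computes one L2 norm per output channel (by flattening every other axis) and divides each filter by it, clamped at 1e-12 to avoid division by zero. A standalone sketch of that per-filter normalization, with assumed filter shapes rather than the repo's exact tensors:

```python
import tensorflow as tf

def normalize_per_filter(filters, out_channels):
    # filters: (depth, kH, kW, in_channels, out_channels) -> one norm per output channel.
    flat = tf.reshape(tf.transpose(filters, perm=[4, 0, 1, 2, 3]), (out_channels, -1))
    norms = tf.maximum(tf.norm(flat, axis=1), 1e-12)
    # Reshape so the division broadcasts over everything but the channel axis.
    return filters / tf.reshape(norms, (1, 1, 1, 1, out_channels))

filters = tf.random.normal((5, 15, 15, 1, 64))
normalized = normalize_per_filter(filters, 64)
# Each filter now has (at most) unit L2 norm.
print(tf.norm(tf.reshape(tf.transpose(normalized, perm=[4, 0, 1, 2, 3]), (64, -1)), axis=1))
```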
@@ -82,6 +82,7 @@ class SparseCode(keras.layers.Layer):
 
         self.out_channels = out_channels
         self.in_channels = in_channels
+        self.kernel_size = kernel_size
         self.stride = stride
         self.lam = lam
         self.activation_lr = activation_lr
@@ -103,7 +104,7 @@ class SparseCode(keras.layers.Layer):
 
         e = images - recon
         g = -1 * u
-        
+
         if self.run_2d:
             e1, e2, e3, e4, e5 = tf.split(e, 5, axis=3)
             g += conv_error(filters[0], e1, self.stride)
@@ -113,11 +114,11 @@ class SparseCode(keras.layers.Layer):
             g += conv_error(filters[4], e5, self.stride)
         else:
             convd_error = conv_error_3d(filters, e, self.stride)
-            
+
             g = g + convd_error
 
         g = g + activations
-        
+
         m = b1 * m + (1-b1) * g
         v = b2 * v + (1-b2) * tf.math.pow(g, 2)
@@ -125,14 +126,10 @@ class SparseCode(keras.layers.Layer):
 
         mh = m / (1 - tf.math.pow(b1, (1+i)))
         vh = v / (1 - tf.math.pow(b2, (1+i)))
-        
         du = self.activation_lr * mh / (tf.math.sqrt(vh) + eps)
         u += du
 
-# i += 1
-        
-# return images, filters, u, m, v, b1, b2, eps, i
 
         return u, m, v
 
     # @tf.function
@@ -146,16 +143,11 @@ class SparseCode(keras.layers.Layer):
         u = tf.zeros(shape=output_shape)
         m = tf.zeros(shape=output_shape)
        v = tf.zeros(shape=output_shape)
-        
         # tf.print('activations before:', tf.reduce_sum(u))
 
         b1 = tf.constant(0.9, dtype='float32')
         b2 = tf.constant(0.99, dtype='float32')
         eps = tf.constant(1e-8, dtype='float32')
-        
-# print(u)
-        
-        
         # i = tf.constant(0, dtype='float32')
         # c = lambda images, filters, u, m, v, b1, b2, eps, i: tf.less(i, self.max_activation_iter)
         # images, filters, u, m, v, b1, b2, eps, i = tf.while_loop(c, self.do_update, [images, filters, u, m, v, b1, b2, eps, i])
@@ -163,7 +155,7 @@ class SparseCode(keras.layers.Layer):
             u, m, v = self.do_update(images, filters, u, m, v, b1, b2, eps, i)
 
         u = tf.nn.relu(u - self.lam)
-        
+
         # tf.print('activations after:', tf.reduce_sum(u))
         return u
 
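The `do_update` changes above are part of an unrolled, Adam-flavoured gradient step on the activation tensor `u` (the gradient combines reconstruction error and the previous activations), followed in `call` by the shrinkage `relu(u - lam)`. A self-contained sketch of that update rule on dummy tensors, mirroring the sign and bias-correction conventions used here but not the repo's exact graph:

```python
import tensorflow as tf

def adam_activation_step(g, u, m, v, i, lr=1e-1, b1=0.9, b2=0.99, eps=1e-8):
    """One Adam-style step on activations u, given the current 'gradient' g."""
    m = b1 * m + (1 - b1) * g
    v = b2 * v + (1 - b2) * tf.square(g)
    mh = m / (1 - tf.pow(b1, float(i + 1)))  # bias-corrected first moment
    vh = v / (1 - tf.pow(b2, float(i + 1)))  # bias-corrected second moment
    return u + lr * mh / (tf.sqrt(vh) + eps), m, v

u = m = v = tf.zeros((1, 43, 93, 64))            # dummy activation map and Adam state
for i in range(100):                             # max_activation_iter
    g = -u + 0.01 * tf.random.normal(u.shape)    # stand-in for conv_error(...) + activations
    u, m, v = adam_activation_step(g, u, m, v, i)
u = tf.nn.relu(u - 0.05)                         # lam shrinkage, as in SparseCode.call
```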
@@ -182,7 +174,7 @@ class ReconSparse(keras.Model):
         self.image_height = image_height
         self.image_width = image_width
         self.run_2d = run_2d
-        
+
         initializer = tf.keras.initializers.HeNormal()
         if run_2d:
             self.filters_1 = tf.Variable(initial_value=initializer(shape=(kernel_size, kernel_size, in_channels, out_channels)), dtype='float32', trainable=True)
@@ -192,7 +184,7 @@ class ReconSparse(keras.Model):
             self.filters_5 = tf.Variable(initial_value=initializer(shape=(kernel_size, kernel_size, in_channels, out_channels)), dtype='float32', trainable=True)
         else:
             self.filters = tf.Variable(initial_value=initializer(shape=(5, kernel_size, kernel_size, in_channels, out_channels), dtype='float32'), trainable=True)
-        
+
         if run_2d:
             weights = normalize_weights(self.get_weights(), out_channels)
         else:
@@ -213,14 +205,14 @@ class Classifier(keras.layers.Layer):
         super(Classifier, self).__init__()
 
         self.max_pool = keras.layers.MaxPooling2D(pool_size=4, strides=4)
-        self.conv = keras.layers.Conv2D(24, kernel_size=8, strides=4, activation='relu', padding='SAME')
+        self.conv = keras.layers.Conv2D(24, kernel_size=8, strides=4, activation='relu', padding='valid')
 
         self.flatten = keras.layers.Flatten()
 
         self.dropout = keras.layers.Dropout(0.5)
 
-        self.ff_1 = keras.layers.Dense(1000, activation='relu', use_bias=True)
-        self.ff_2 = keras.layers.Dense(100, activation='relu', use_bias=True)
+        # self.ff_1 = keras.layers.Dense(1000, activation='relu', use_bias=True)
+        # self.ff_2 = keras.layers.Dense(100, activation='relu', use_bias=True)
         self.ff_3 = keras.layers.Dense(20, activation='relu', use_bias=True)
         self.ff_4 = keras.layers.Dense(1, activation='sigmoid')
 
@@ -230,10 +222,10 @@ class Classifier(keras.layers.Layer):
         x = self.max_pool(activations)
         x = self.conv(x)
         x = self.flatten(x)
-        x = self.ff_1(x)
-        x = self.dropout(x)
-        x = self.ff_2(x)
-        x = self.dropout(x)
+        # # x = self.ff_1(x)
+        # # x = self.dropout(x)
+        # # x = self.ff_2(x)
+        # # x = self.dropout(x)
         x = self.ff_3(x)
         x = self.dropout(x)
         x = self.ff_4(x)
@@ -245,7 +237,7 @@ class MobileModel(keras.Model):
         super().__init__()
         self.sparse_code = SparseCode(batch_size, in_channels, out_channels, kernel_size, stride, lam, activation_lr, max_activation_iter, run_2d)
         self.classifier = Classifier()
-        
+
         self.out_channels = out_channels
         self.in_channels = in_channels
         self.stride = stride
@@ -254,9 +246,9 @@ class MobileModel(keras.Model):
         self.max_activation_iter = max_activation_iter
         self.batch_size = batch_size
         self.run_2d = run_2d
-        
+
         pytorch_weights = load_pytorch_weights(sparse_checkpoint)
-        
+
         if run_2d:
             weight_list = np.split(pytorch_weights, 5, axis=0)
             self.filters_1 = tf.Variable(initial_value=weight_list[0].squeeze(0), dtype='float32', trainable=False)
@@ -269,11 +261,16 @@ class MobileModel(keras.Model):
 
     @tf.function
     def call(self, images):
+        images = tf.squeeze(tf.image.rgb_to_grayscale(images), axis=-1)
+        images = tf.transpose(images, perm=[0, 2, 3, 1])
+        images = images / 255
+        images = (images - 0.2592) / 0.1251
+
         if self.run_2d:
             activations = self.sparse_code(images, [tf.stop_gradient(self.filters_1), tf.stop_gradient(self.filters_2), tf.stop_gradient(self.filters_3), tf.stop_gradient(self.filters_4), tf.stop_gradient(self.filters_5)])
         else:
             activations = self.sparse_code(images, tf.stop_gradient(self.filters))
-        
+
         pred = self.classifier(activations)
-        
-        return pred
\ No newline at end of file
+
+        return pred
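The preprocessing now baked into `MobileModel.call` (grayscale, frames moved to the channel axis, 0-255 scaling, then the 0.2592/0.1251 normalization) replaces the old PyTorch-side transform, so the exported TFLite model takes raw RGB clips shaped `(1, 5, 100, 200, 3)`. A rough NumPy equivalent of that per-clip transform, useful for parity checks; the helper and the luma weights (standard ITU-R BT.601, which `tf.image.rgb_to_grayscale` approximates) are illustrative, not taken from the repo:

```python
import numpy as np

def preprocess_clip(clip_rgb):
    """clip_rgb: uint8, shape (1, 5, 100, 200, 3) -> float32, shape (1, 100, 200, 5)."""
    clip = clip_rgb.astype(np.float32)
    gray = clip @ np.array([0.299, 0.587, 0.114], dtype=np.float32)  # (1, 5, 100, 200)
    gray = np.transpose(gray, (0, 2, 3, 1))   # the 5 frames become channels
    gray = gray / 255.0
    return (gray - 0.2592) / 0.1251

clip = np.random.randint(0, 256, size=(1, 5, 100, 200, 3), dtype=np.uint8)
print(preprocess_clip(clip).shape)  # (1, 100, 200, 5)
```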
diff --git a/run_tflite.py b/run_tflite.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4e23ecf2e82ea70eb5e2ba204d32d83480d0bcb
--- /dev/null
+++ b/run_tflite.py
@@ -0,0 +1,158 @@
+import torch
+import os
+import time
+import numpy as np
+import torchvision
+from sparse_coding_torch.video_loader import VideoGrayScaler, MinMaxScaler
+from torchvision.datasets.video_utils import VideoClips
+import csv
+from datetime import datetime
+from yolov4.get_bounding_boxes import YoloModel
+import argparse
+import tensorflow as tf
+import scipy.stats
+import cv2
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Process some integers.')
+    parser.add_argument('--fast', action='store_true',
+                        help='optimized for runtime')
+    parser.add_argument('--accurate', action='store_true',
+                        help='optimized for accuracy')
+    parser.add_argument('--verbose', action='store_true',
+                        help='output verbose')
+    args = parser.parse_args()
+    #print(args.accumulate(args.integers))
+    device = 'cpu'
+    batch_size = 1
+
+    interpreter = tf.lite.Interpreter("keras/mobile_output/tf_lite_model.tflite")
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    yolo_model = YoloModel()
+
+    transform = torchvision.transforms.Compose(
+        [VideoGrayScaler(),
+         # MinMaxScaler(0, 255),
+         # torchvision.transforms.Normalize((0.2592,), (0.1251,)),
+         torchvision.transforms.CenterCrop((100, 200))
+         ])
+
+    all_predictions = []
+
+    all_files = list(os.listdir('input_videos'))
+
+    for f in all_files:
+        print('Processing', f)
+        #start_time = time.time()
+
+        clipstride = 15
+        if args.fast:
+            clipstride = 20
+        if args.accurate:
+            clipstride = 10
+
+        vc = VideoClips([os.path.join('input_videos', f)],
+                        clip_length_in_frames=5,
+                        frame_rate=20,
+                        frames_between_clips=clipstride)
+
+        ### START time after loading video ###
+        start_time = time.time()
+        clip_predictions = []
+        i = 0
+        cliplist = []
+        countclips = 0
+        for i in range(vc.num_clips()):
+
+            clip, _, _, _ = vc.get_clip(i)
+            clip = clip.swapaxes(1, 3).swapaxes(0, 1).swapaxes(2, 3).numpy()
+
+            bounding_boxes = yolo_model.get_bounding_boxes(clip[:, 2, :, :].swapaxes(0, 2).swapaxes(0, 1)).squeeze(0)
+            # for bb in bounding_boxes:
+            #     print(bb[1])
+            if bounding_boxes.size == 0:
+                continue
+            #widths = []
+            countclips = countclips + len(bounding_boxes)
+
+            widths = [(bounding_boxes[i][3] - bounding_boxes[i][1]) for i in range(len(bounding_boxes))]
+
+            #for i in range(len(bounding_boxes)):
+            #    widths.append(bounding_boxes[i][3] - bounding_boxes[i][1])
+
+            ind = np.argmax(np.array(widths))
+            #for bb in bounding_boxes:
+            bb = bounding_boxes[ind]
+            center_x = (bb[3] + bb[1]) / 2 * 1920
+            center_y = (bb[2] + bb[0]) / 2 * 1080
+
+            width=400
+            height=400
+
+            lower_y = round(center_y - height / 2)
+            upper_y = round(center_y + height / 2)
+            lower_x = round(center_x - width / 2)
+            upper_x = round(center_x + width / 2)
+
+            trimmed_clip = clip[:, :, lower_y:upper_y, lower_x:upper_x]
+
+            trimmed_clip = torch.tensor(trimmed_clip).to(torch.float)
+
+            trimmed_clip = transform(trimmed_clip)
+
+            # tensor_to_write = trimmed_clip.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)
+            # tensor_to_write[0][0][0][0] = 100
+            # tensor_to_write[0][0][0][1] = 100
+            # tensor_to_write[0][0][0][2] = 100
+            # torchvision.io.write_video('clips_to_test_swift/' + str(countclips) + '.mp4', tensor_to_write, fps=20)
+            # countclips += 1
+            # trimmed_clip.pin_memory()
+            cliplist.append(trimmed_clip)
+
+        if len(cliplist) > 0:
+            with torch.no_grad():
+                for trimmed_clip in cliplist:
+                    interpreter.set_tensor(input_details[0]['index'], trimmed_clip)
+
+                    interpreter.invoke()
+
+                    output_array = np.array(interpreter.get_tensor(output_details[0]['index']))
+
+                    pred = output_array[0][0]
+                    print(pred)
+
+                    clip_predictions.append(pred.round())
+
+            if args.verbose:
+                print(clip_predictions)
+                print("num of clips: ", countclips)
+
+            final_pred = scipy.stats.mode(clip_predictions)[0][0]
+            # if len(clip_predictions) % 2 == 0 and torch.sum(clip_predictions).item() == len(clip_predictions)//2:
+            #     #print("I'm here")
+            #     final_pred = (torch.nn.Sigmoid()(pred)).mean().round().detach().cpu().to(torch.long).item()
+
+            if final_pred == 1:
+                str_pred = 'No Sliding'
+            else:
+                str_pred = 'Sliding'
+
+        else:
+            str_pred = "No Sliding"
+
+        end_time = time.time()
+
+        print(str_pred)
+
+        all_predictions.append({'FileName': f, 'Prediction': str_pred, 'TotalTimeSec': end_time - start_time})
+
+    with open('output_' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv', 'w+', newline='') as csv_out:
+        writer = csv.DictWriter(csv_out, fieldnames=all_predictions[0].keys())
+
+        writer.writeheader()
+        writer.writerows(all_predictions)
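One practical note on the interpreter loop above: `Interpreter.set_tensor` expects a NumPy array whose shape and dtype match `input_details` exactly, so a torch tensor coming out of the transform generally needs a `.numpy()` (and possibly a reshape/cast) first. A minimal smoke test of the exported model with random data, using the same model path as the script:

```python
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter("keras/mobile_output/tf_lite_model.tflite")
interpreter.allocate_tensors()

inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]
print(inp['shape'], inp['dtype'])  # expected clip shape and dtype

dummy = np.random.rand(*inp['shape']).astype(inp['dtype'])
interpreter.set_tensor(inp['index'], dummy)
interpreter.invoke()
print(interpreter.get_tensor(out['index']))  # single sigmoid score per clip
```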
diff --git a/stride_2_100_iter.pt b/stride_2_100_iter.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9c398c787dd27738188c4f56b8a2c938592f60dd
Binary files /dev/null and b/stride_2_100_iter.pt differ