From 69e9d6ccb355dd2ee64766f4655a414533fc123b Mon Sep 17 00:00:00 2001
From: hannandarryl <hannandarryl@gmail.com>
Date: Fri, 9 Dec 2022 15:42:16 +0000
Subject: [PATCH] eval fixes for mlp models

---
 sparse_coding_torch/onsd/classifier_model.py |  53 ++-
 sparse_coding_torch/onsd/load_data.py        |  10 +-
 .../onsd/logistic_regression.py              | 137 +++++--
 sparse_coding_torch/onsd/train_MLP.py        | 346 +++++++++++-------
 .../onsd/train_sparse_model.py               |  42 ++-
 sparse_coding_torch/onsd/video_loader.py     |   8 +-
 6 files changed, 411 insertions(+), 185 deletions(-)

diff --git a/sparse_coding_torch/onsd/classifier_model.py b/sparse_coding_torch/onsd/classifier_model.py
index a1c72fc..70dbe71 100644
--- a/sparse_coding_torch/onsd/classifier_model.py
+++ b/sparse_coding_torch/onsd/classifier_model.py
@@ -61,24 +61,63 @@ class ONSDClassifier(keras.layers.Layer):
 
         return class_pred, width_pred
 
+class ONSDConv(keras.layers.Layer):
+    def __init__(self, do_regression):
+        super(ONSDConv, self).__init__()
+
+#        self.ff_dropout = keras.layers.Dropout(0.1)
+        self.conv_1 = keras.layers.Conv2D(8, kernel_size=(1, 4), strides=1, activation='relu', padding='valid')
+#        self.max_pool = keras.layers.MaxPooling2D(
+        self.conv_2 = keras.layers.Conv2D(8, kernel_size=(1, 4), strides=1, activation='relu', padding='valid')
+
+        self.flatten = keras.layers.Flatten()
+
+#        self.ff_1 = keras.layers.Dense(1000, activation='relu', use_bias=True)
+#        self.ff_2 = keras.layers.Dense(500, activation='relu', use_bias=True)
+        self.ff_2 = keras.layers.Dense(10, activation='relu', use_bias=True)
+#        self.ff_3 = keras.layers.Dense(8, activation='relu', use_bias=True)
+        if do_regression:
+            self.ff_final_1 = keras.layers.Dense(1)
+        else:
+            self.ff_final_1 = keras.layers.Dense(1, activation='sigmoid')
+
+        self.do_dropout = True
+
+#    @tf.function
+    def call(self, activations):
+#        print(activations.shape)
+#        raise Exception
+        x = self.conv_1(activations)
+
+        x = self.flatten(x)
+
+        x = self.ff_2(x)
+#        x = self.ff_dropout(x, self.do_dropout)
+#        x = self.ff_3(x)
+        class_pred = self.ff_final_1(x)
+
+        return class_pred
+
 class ONSDMLP(keras.layers.Layer):
-    def __init__(self):
+    def __init__(self, do_regression):
         super(ONSDMLP, self).__init__()
 
-        self.ff_dropout = keras.layers.Dropout(0.1)
+#        self.ff_dropout = keras.layers.Dropout(0.1)
 
 #        self.ff_1 = keras.layers.Dense(1000, activation='relu', use_bias=True)
 #        self.ff_2 = keras.layers.Dense(500, activation='relu', use_bias=True)
-        self.ff_2 = keras.layers.Dense(16, activation='relu', use_bias=True)
-        self.ff_3 = keras.layers.Dense(8, activation='relu', use_bias=True)
-        self.ff_final_1 = keras.layers.Dense(1)
+        self.ff_2 = keras.layers.Dense(8, activation='relu', use_bias=True)
+#        self.ff_3 = keras.layers.Dense(8, activation='relu', use_bias=True)
+        if do_regression:
+            self.ff_final_1 = keras.layers.Dense(1)
+        else:
+            self.ff_final_1 = keras.layers.Dense(1, activation='sigmoid')
 
         self.do_dropout = True
 
 #    @tf.function
     def call(self, activations):
         x = self.ff_2(activations)
-        x = self.ff_dropout(x, self.do_dropout)
-        x = self.ff_3(x)
+#        x = self.ff_dropout(x, self.do_dropout)
+#        x = self.ff_3(x)
         class_pred = self.ff_final_1(x)
 
         return class_pred
diff --git a/sparse_coding_torch/onsd/load_data.py b/sparse_coding_torch/onsd/load_data.py
index b422d63..451966e 100644
--- a/sparse_coding_torch/onsd/load_data.py
+++ b/sparse_coding_torch/onsd/load_data.py
@@ -9,13 +9,12 @@ from typing import Sequence, Iterator
 import csv
 from sklearn.model_selection import train_test_split, GroupShuffleSplit, LeaveOneGroupOut, LeaveOneOut, StratifiedGroupKFold, StratifiedKFold, KFold, ShuffleSplit
 
-def load_onsd_videos(batch_size, input_size, crop_size, yolo_model=None, mode=None, n_splits=None):
+def load_onsd_videos(batch_size, crop_size, yolo_model=None, mode=None, n_splits=None, do_regression=False):
     video_path = "/shared_data/bamc_onsd_data/revised_extended_onsd_data"
 
     transforms = torchvision.transforms.Compose(
     [torchvision.transforms.Grayscale(1),
-     MinMaxScaler(0, 255),
-     torchvision.transforms.Resize(input_size[:2])
+     MinMaxScaler(0, 255)
     ])
 #    augment_transforms = torchvision.transforms.Compose(
 #    [torchvision.transforms.RandomRotation(45),
@@ -23,7 +22,10 @@ def load_onsd_videos(batch_size, input_size, crop_size, yolo_model=No
 #     torchvision.transforms.RandomAdjustSharpness(0.05)
 #    ])
 
-    dataset = ONSDAllFramesLoader(video_path, crop_size[1], crop_size[0], transform=transforms, yolo_model=yolo_model)
+    if do_regression:
+        dataset = ONSDGoodFramesLoader(video_path, crop_size[1], crop_size[0], transform=transforms, yolo_model=yolo_model)
+    else:
+        dataset = ONSDAllFramesLoader(video_path, crop_size[1], crop_size[0], transform=transforms, yolo_model=yolo_model)
 
     targets = dataset.get_labels()
 
diff --git a/sparse_coding_torch/onsd/logistic_regression.py b/sparse_coding_torch/onsd/logistic_regression.py
index 0dcc679..e04314f 100644
--- a/sparse_coding_torch/onsd/logistic_regression.py
+++ b/sparse_coding_torch/onsd/logistic_regression.py
@@ -10,7 +10,7 @@ import os
 from sparse_coding_torch.onsd.load_data import load_onsd_videos
 from sparse_coding_torch.utils import SubsetWeightedRandomSampler, get_sample_weights
 from sparse_coding_torch.sparse_model import SparseCode, ReconSparse, normalize_weights, normalize_weights_3d
-from sparse_coding_torch.onsd.classifier_model import ONSDClassifier
+from sparse_coding_torch.onsd.classifier_model import ONSDMLP
 from sparse_coding_torch.onsd.video_loader import get_yolo_region_onsd
 import time
 import numpy as np
@@ -30,6 +30,8 @@ from sklearn.neural_network import MLPClassifier
 from sklearn import metrics
 from sklearn.preprocessing import normalize
 
+from scikeras.wrappers import KerasClassifier, KerasRegressor
+
 tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 
 if __name__ == "__main__":
@@ -54,6 +56,8 @@ if __name__ == "__main__":
     parser.add_argument('--scale_factor', type=int, default=2)
     parser.add_argument('--clip_depth', type=int, default=1)
     parser.add_argument('--frames_to_skip', type=int, default=1)
+    parser.add_argument('--flatten', action='store_true')
+    parser.add_argument('--regression', action='store_true')
 
     args = parser.parse_args()
 
@@ -89,10 +93,9 @@ if __name__ == "__main__":
     output = SparseCode(batch_size=args.batch_size, image_height=image_height, image_width=image_width, clip_depth=clip_depth, in_channels=1, out_channels=args.num_kernels, kernel_height=args.kernel_height, kernel_width=args.kernel_width, kernel_depth=args.kernel_depth, stride=args.stride, lam=args.lam, activation_lr=args.activation_lr, max_activation_iter=args.max_activation_iter, run_2d=False)(inputs, filter_inputs)
 
     sparse_model = keras.Model(inputs=(inputs, filter_inputs), outputs=output)
 
-    recon_model = keras.models.load_model(args.sparse_checkpoint)
-    
+    recon_model = keras.models.load_model(args.sparse_checkpoint)
 
-    splits, dataset = load_onsd_videos(args.batch_size, input_size=(image_height, image_width), crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode=args.splits, n_splits=args.n_splits)
+    splits, dataset = load_onsd_videos(args.batch_size, crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode=args.splits, n_splits=args.n_splits, do_regression=args.regression)
     positive_class = 'Positives'
 
 #    difficult_vids = split_difficult_vids(dataset.get_difficult_vids(), args.n_splits)
@@ -134,49 +137,91 @@ if __name__ == "__main__":
 
 #        clf = LogisticRegression(max_iter=1000)
 #        clf = RidgeClassifier(alpha=3.0)
-        clf = MLPClassifier(hidden_layer_sizes=(16,))
+#        clf = MLPClassifier(hidden_layer_sizes=(16,))
+        if args.flatten:
+            classifier_inputs = keras.Input(shape=(args.num_kernels * ((image_height - args.kernel_height) // args.stride + 1)))
+        else:
+            classifier_inputs = keras.Input(shape=(args.num_kernels))
+        classifier_outputs = ONSDMLP(args.regression)(classifier_inputs)
+
+        classifier_model = keras.Model(inputs=classifier_inputs, outputs=classifier_outputs)
+        if args.regression:
+            clf = KerasRegressor(classifier_model, loss='mean_squared_error', optimizer='adam', epochs=200, verbose=False)
+        else:
+            clf = KerasClassifier(classifier_model, loss='binary_crossentropy', optimizer='adam', epochs=200, verbose=False)
 
-        train_filter_activations = [[] for _ in range(args.num_kernels)]
+#        train_filter_activations = [[] for _ in range(args.num_kernels)]
+        train_filter_activations = []
         for images, labels, width in tqdm(train_tf.shuffle(len(train_tf)).batch(batch_size)):
             images = tf.expand_dims(tf.transpose(images, [0, 2, 3, 1]), axis=1)
 
             activations = tf.stop_gradient(sparse_model([images, tf.stop_gradient(tf.expand_dims(recon_model.trainable_weights[0], axis=0))])).numpy()
 
-            for b_idx in range(activations.shape[0]):
-                acts = np.squeeze(activations[b_idx])
+            activations = tf.squeeze(activations, axis=1)
+            activations = tf.squeeze(activations, axis=2)
+            if args.flatten:
+                activations = tf.reshape(activations, (-1, activations.shape[1] * activations.shape[2]))
+            else:
+                activations = tf.math.reduce_sum(activations, axis=1)
+
+            for b_idx, act in enumerate(activations):
+                if args.regression:
+                    train_filter_activations.append((act, width[b_idx]))
+                else:
+                    train_filter_activations.append((act, labels[b_idx]))
+
+#            for b_idx in range(activations.shape[0]):
+#                acts = np.squeeze(activations[b_idx])
 
-                for i in range(args.num_kernels):
-                    acts_for_filter = acts[:, i]
+#                for i in range(args.num_kernels):
+#                    acts_for_filter = acts[:, i]
 
-                    act_sum = np.sum(acts_for_filter)
+#                    act_sum = np.sum(acts_for_filter)
 
-                    train_filter_activations[i].append((act_sum, float(labels[b_idx])))
+#                    train_filter_activations[i].append((act_sum, float(labels[b_idx])))
 
-        test_filter_activations = [[] for _ in range(args.num_kernels)]
+#        test_filter_activations = [[] for _ in range(args.num_kernels)]
+        test_filter_activations = []
         for images, labels, width in tqdm(test_tf.batch(args.batch_size)):
             images = tf.expand_dims(tf.transpose(images, [0, 2, 3, 1]), axis=1)
 
             activations = tf.stop_gradient(sparse_model([images, tf.stop_gradient(tf.expand_dims(recon_model.trainable_weights[0], axis=0))])).numpy()
 
-            for b_idx in range(activations.shape[0]):
-                acts = np.squeeze(activations[b_idx])
+            activations = tf.squeeze(activations, axis=1)
+            activations = tf.squeeze(activations, axis=2)
+            if args.flatten:
+                activations = tf.reshape(activations, (-1, activations.shape[1] * activations.shape[2]))
+            else:
+                activations = tf.math.reduce_sum(activations, axis=1)
+
+            for b_idx, act in enumerate(activations):
+                if args.regression:
+                    test_filter_activations.append((act, width[b_idx]))
+                else:
+                    test_filter_activations.append((act, labels[b_idx]))
+
+#            for b_idx in range(activations.shape[0]):
+#                acts = np.squeeze(activations[b_idx])
 
-                for i in range(args.num_kernels):
-                    acts_for_filter = acts[:, i]
+#                for i in range(args.num_kernels):
+#                    acts_for_filter = acts[:, i]
 
-                    act_sum = np.sum(acts_for_filter)
+#                    act_sum = np.sum(acts_for_filter)
 
-                    test_filter_activations[i].append((act_sum, float(labels[b_idx])))
+#                    test_filter_activations[i].append((act_sum, float(labels[b_idx])))
 
         train_X = []
         train_y = []
 
-        for i in range(len(train_filter_activations[0])):
-            x = np.array([train_filter_activations[j][i][0] for j in range(args.num_kernels)])
-            label = train_filter_activations[0][i][1]
+#        for i in range(len(train_filter_activations[0])):
+#            x = np.array([train_filter_activations[j][i][0] for j in range(args.num_kernels)])
+#            label = train_filter_activations[0][i][1]
 
+#            train_X.append(x)
+#            train_y.append(label)
+        for x, label in train_filter_activations:
             train_X.append(x)
             train_y.append(label)
 
@@ -186,10 +231,14 @@ if __name__ == "__main__":
         test_X = []
         test_y = []
 
-        for i in range(len(test_filter_activations[0])):
-            x = np.array([test_filter_activations[j][i][0] for j in range(args.num_kernels)])
-            label = test_filter_activations[0][i][1]
+#        for i in range(len(test_filter_activations[0])):
+#            x = np.array([test_filter_activations[j][i][0] for j in range(args.num_kernels)])
+#            label = test_filter_activations[0][i][1]
+
+#            test_X.append(x)
+#            test_y.append(label)
 
+        for x, label in test_filter_activations:
             test_X.append(x)
             test_y.append(label)
 
@@ -217,8 +266,21 @@ if __name__ == "__main__":
         test_gt_all = np.concatenate([test_gt_all, test_y])
 
         if args.splits == 'leave_one_out':
-            video_gt = np.array([test_y[0]])
-            video_pred = np.array([np.round(np.average(test_pred))])
+            if args.regression:
+                video_gt = np.average(test_y)
+                if video_gt >= 100:
+                    video_gt = np.array([1])
+                else:
+                    video_gt = np.array([0])
+
+                video_pred = np.array([np.average(test_pred)])
+                if video_pred >= 100:
+                    video_pred = np.array([1])
+                else:
+                    video_pred = np.array([0])
+            else:
+                video_gt = np.array([test_y[0]])
+                video_pred = np.array([np.round(np.average(test_pred))])
 
             if video_pred_all is None:
                 video_pred_all = video_pred
@@ -238,18 +300,29 @@ if __name__ == "__main__":
             frame_pred_all = np.concatenate([frame_pred_all, frame_pred])
             frame_gt_all = np.concatenate([frame_gt_all, frame_gt])
 
-        train_acc = metrics.accuracy_score(train_pred, train_y)
-        test_acc = metrics.accuracy_score(test_pred, test_y)
+        if args.regression:
+            train_acc = metrics.mean_absolute_error(train_pred, train_y)
+            test_acc = metrics.mean_absolute_error(test_pred, test_y)
+        else:
+            train_acc = metrics.accuracy_score(train_pred, train_y)
+            test_acc = metrics.accuracy_score(test_pred, test_y)
 
         print('i_fold={}, train_acc={:.2f}, test_acc={:.2f}'.format(i_fold, train_acc, test_acc))
 
     print('Final Predictions!')
 
-    train_accuracy = metrics.accuracy_score(train_pred_all, train_gt_all)
-    test_accuracy = metrics.accuracy_score(test_pred_all, test_gt_all)
-    frame_accuracy = metrics.accuracy_score(frame_pred_all, frame_gt_all)
+    if args.regression:
+        train_accuracy = metrics.mean_absolute_error(train_pred_all, train_gt_all)
+        test_accuracy = metrics.mean_absolute_error(test_pred_all, test_gt_all)
+        frame_accuracy = metrics.mean_absolute_error(frame_pred_all, frame_gt_all)
+    else:
+        train_accuracy = metrics.accuracy_score(train_pred_all, train_gt_all)
+        test_accuracy = metrics.accuracy_score(test_pred_all, test_gt_all)
+        frame_accuracy = metrics.accuracy_score(frame_pred_all, frame_gt_all)
 
     if args.splits == 'leave_one_out':
+        print(video_pred_all)
+        print(video_gt_all)
         video_accuracy = metrics.accuracy_score(video_pred_all, video_gt_all)
 
         print('train_acc={:.2f}, test_acc={:.2f}, frame_acc={:.2f}, video_acc={:.2f}'.format(train_accuracy, test_accuracy, frame_accuracy, video_accuracy))
diff --git a/sparse_coding_torch/onsd/train_MLP.py b/sparse_coding_torch/onsd/train_MLP.py
index 1ad78c4..afdc46f 100644
--- a/sparse_coding_torch/onsd/train_MLP.py
+++ b/sparse_coding_torch/onsd/train_MLP.py
@@ -10,11 +10,11 @@ import os
 from sparse_coding_torch.onsd.load_data import load_onsd_videos
 from sparse_coding_torch.utils import SubsetWeightedRandomSampler, get_sample_weights
 from sparse_coding_torch.sparse_model import SparseCode, ReconSparse, normalize_weights, normalize_weights_3d
-from sparse_coding_torch.onsd.classifier_model import ONSDMLP
-from sparse_coding_torch.onsd.video_loader import get_yolo_region_onsd
+from sparse_coding_torch.onsd.classifier_model import ONSDMLP, ONSDConv
+from sparse_coding_torch.onsd.video_loader import get_yolo_region_onsd, get_participants
 import time
 import numpy as np
-from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
+from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, mean_absolute_error
 import random
 import pickle
 # from sparse_coding_torch.onsd.train_sparse_model import sparse_loss
@@ -25,10 +25,77 @@ import glob
 import cv2
 import copy
 import matplotlib.pyplot as plt
+import itertools
+import csv
 
 tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+import absl.logging
+absl.logging.set_verbosity(absl.logging.ERROR)
 
-def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sparse_model, recon_model, transform, crop_width, crop_height, max_width):
+def calculate_onsd_scores_measured(input_videos, yolo_model, classifier_model, sparse_model, recon_model, transform, crop_width, crop_height):
+    frame_path = 'sparse_coding_torch/onsd/onsd_good_for_eval'
+
+    all_preds = []
+    all_gt = []
+    fp = []
+    fn = []
+
+    for vid_f in tqdm(input_videos):
+        split_path = vid_f.split('/')
+        frame_path = '/'.join(split_path[:-1])
+        label = split_path[-3]
+        f = [png_file for png_file in os.listdir(frame_path) if png_file.endswith('.png')][0]
+#    for f in tqdm(os.listdir(os.path.join(frame_path, label))):
+#        if not f.endswith('.png'):
+#            continue
+#        print(split_path)
+#        print(frame_path)
+#        print(label)
+#        print(f)
+#        raise Exception
 
+        frame = torch.tensor(cv2.imread(os.path.join(frame_path, f))).swapaxes(2, 1).swapaxes(1, 0)
+
+#        print(frame.size())
+
+        frame = get_yolo_region_onsd(yolo_model, frame, crop_width, crop_height, False)
+        if not frame:
+            continue
+
+#        print(frame)
+
+        frame = frame[0]
+
+#        print(frame)
+
+        frame = transform(frame).to(torch.float32).unsqueeze(3).unsqueeze(1).numpy()
+
+        activations = tf.stop_gradient(sparse_model([frame, tf.stop_gradient(tf.expand_dims(recon_model.trainable_weights[0], axis=0))]))
+
+        activations = tf.squeeze(activations, axis=1)
+        activations = tf.squeeze(activations, axis=2)
+        activations = tf.math.reduce_sum(activations, axis=1)
+
+        pred = classifier_model.predict(activations)
+
+        pred = tf.math.round(pred)
+
+        final_pred = float(pred)
+
+        all_preds.append(final_pred)
+
+        if label == 'Positives':
+            all_gt.append(1.0)
+            if final_pred == 0.0:
+                fn.append(f)
+        elif label == 'Negatives':
+            all_gt.append(0.0)
+            if final_pred == 1.0:
+                fp.append(f)
+
+    return np.array(all_preds), np.array(all_gt), fn, fp
+
+def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sparse_model, recon_model, transform, crop_width, crop_height, max_width, flatten, do_regression, activations_2d, use_valid, valid_vids):
     all_predictions = []
 
     numerical_labels = []
@@ -42,6 +109,9 @@ def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sp
     fp_ids = []
     fn_ids = []
     for v_idx, f in tqdm(enumerate(input_videos)):
+        if use_valid and not get_participants([f])[0] in valid_vids:
+            continue
+
         vc = torchvision.io.read_video(f)[0].permute(3, 0, 1, 2)
 
         all_classes = []
@@ -51,7 +121,9 @@ def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sp
 
         all_yolo = [get_yolo_region_onsd(yolo_model, frame, crop_width, crop_height, False) for frame in all_frames]
 
-        all_yolo = [yolo[0] for yolo in all_yolo if yolo is not None]
+        all_yolo = list(itertools.chain.from_iterable([y for y in all_yolo if y is not None]))
+
+#        all_yolo = [yolo[0] for yolo in all_yolo if yolo is not None]
 
         for i in range(0, len(all_yolo), 32):
             batch = torch.stack(all_yolo[i:i+32])
@@ -62,16 +134,35 @@ def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sp
 
             activations = tf.squeeze(activations, axis=1)
             activations = tf.squeeze(activations, axis=2)
-            activations = tf.math.reduce_sum(activations, axis=1)
+            if flatten:
+                activations = tf.reshape(activations, (-1, activations.shape[1] * activations.shape[2]))
+            elif activations_2d:
+                activations = tf.expand_dims(activations, axis=3)
+            else:
+                activations = tf.math.reduce_sum(activations, axis=1)
 
-            pred = classifier_model(activations)
+            pred = classifier_model.predict(activations)
 
-            pred = tf.math.round(tf.math.sigmoid(pred))
+#            if not do_regression:
+#                pred = tf.math.round(pred)
 
 #            width_pred = tf.math.round(width_pred * max_width)
 
             all_classes.append(pred)
 
-        final_pred = np.round(np.average(np.concatenate(all_classes)))
+        if do_regression:
+            final_pred = np.average(np.concatenate(all_classes))
+#            raise Exception
+#            print(all_classes)
+#            print(final_pred)
+#            print(max_width)
+#            print(100/max_width)
+#            raise Exception
+            if final_pred >= 100:
+                final_pred = np.array([1])
+            else:
+                final_pred = np.array([0])
+        else:
+            final_pred = np.round(np.average(np.concatenate(all_classes)))
 
 #        print(all_widths)
 #        average_width = np.average(np.array(all_widths))
 #        print(average_width)
@@ -92,7 +183,7 @@ def calculate_onsd_scores(input_videos, labels, yolo_model, classifier_model, sp
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--batch_size', default=128, type=int)
+    parser.add_argument('--batch_size', default=200, type=int)
     parser.add_argument('--kernel_width', default=150, type=int)
     parser.add_argument('--kernel_height', default=10, type=int)
     parser.add_argument('--kernel_depth', default=1, type=int)
@@ -100,8 +191,8 @@ if __name__ == "__main__":
     parser.add_argument('--stride', default=1, type=int)
     parser.add_argument('--max_activation_iter', default=300, type=int)
     parser.add_argument('--activation_lr', default=1e-2, type=float)
-    parser.add_argument('--lr', default=5e-2, type=float)
-    parser.add_argument('--epochs', default=15, type=int)
+    parser.add_argument('--lr', default=0.001, type=float)
+    parser.add_argument('--epochs', default=200, type=int)
     parser.add_argument('--lam', default=0.05, type=float)
     parser.add_argument('--output_dir', default='./output', type=str)
    parser.add_argument('--sparse_checkpoint', default=None, type=str)
@@ -118,6 +209,10 @@ if __name__ == "__main__":
     parser.add_argument('--scale_factor', type=int, default=2)
     parser.add_argument('--clip_depth', type=int, default=1)
     parser.add_argument('--frames_to_skip', type=int, default=1)
+    parser.add_argument('--do_regression', action='store_true')
+    parser.add_argument('--flatten', action='store_true')
+    parser.add_argument('--activations_2d', action='store_true')
+    parser.add_argument('--valid_vids', action='store_true')
 
     args = parser.parse_args()
 
@@ -141,6 +236,16 @@ if __name__ == "__main__":
 
     with open(os.path.join(output_dir, 'arguments.txt'), 'w+') as out_f:
         out_f.write(str(args))
+
+    valid_vids = set()
+    with open('sparse_coding_torch/onsd/good_frames_onsd.csv', 'r') as valid_in:
+        reader = csv.DictReader(valid_in)
+        for row in reader:
+            vid = row['video'].strip()
+            good_frames = row['good_frames'].strip()
+
+            if good_frames:
+                valid_vids.add(vid)
 
     yolo_model = YoloModel(args.dataset)
 
@@ -163,28 +268,44 @@ if __name__ == "__main__":
         ])
 
-    splits, dataset = load_onsd_videos(args.batch_size, input_size=(image_height, image_width), crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode=args.splits, n_splits=args.n_splits)
+    splits, dataset = load_onsd_videos(args.batch_size, crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode=args.splits, n_splits=args.n_splits, do_regression=args.do_regression)
     positive_class = 'Positives'
 
+    all_video_labels = [f.split('/')[-3] for f in dataset.get_all_videos()]
+    print('{} videos with positive labels.'.format(len([lbl for lbl in all_video_labels if lbl == 'Positives'])))
+    print('{} videos with negative labels.'.format(len([lbl for lbl in all_video_labels if lbl == 'Negatives'])))
+
 #    difficult_vids = split_difficult_vids(dataset.get_difficult_vids(), args.n_splits)
 
     print('Processing frames...')
     sparse_codes = []
+    total_acts = 0
+    total_non_zero = 0
     frames = dataset.get_frames()
     for i in tqdm(range(0, len(frames), 32)):
         frame = tf.stack(frames[i:i+32])
         frame = tf.expand_dims(data_augmentation(tf.transpose(frame, [0, 2, 3, 1])), axis=1)
 
         activations = tf.stop_gradient(sparse_model([frame, tf.stop_gradient(tf.expand_dims(recon_model.trainable_weights[0], axis=0))])).numpy()
+
+        total_non_zero += float(tf.math.count_nonzero(activations))
+        total_acts += float(tf.math.reduce_prod(tf.shape(activations)))
 
         activations = tf.squeeze(activations, axis=1)
         activations = tf.squeeze(activations, axis=2)
-        activations = tf.math.reduce_sum(activations, axis=1)
+
+        if args.flatten:
+            activations = tf.reshape(activations, (-1, activations.shape[1] * activations.shape[2]))
+        elif args.activations_2d:
+            activations = tf.expand_dims(activations, axis=3)
+        else:
+            activations = tf.math.reduce_sum(activations, axis=1)
 
         for act in activations:
             sparse_codes.append(act)
 
     assert len(sparse_codes) == len(frames)
+    print('Average sparsity is: {}'.format(total_non_zero / total_acts))
 
     video_true = []
     video_pred = []
@@ -214,17 +335,27 @@ if __name__ == "__main__":
 
         train_sparse_codes = [sc for i, sc in enumerate(sparse_codes) if i in train_idx]
         test_sparse_codes = [sc for i, sc in enumerate(sparse_codes) if i in test_idx]
-
-        train_tf = tf.data.Dataset.from_tensor_slices((train_sparse_codes, train_loader.get_labels(), train_loader.get_widths()))
-        test_tf = tf.data.Dataset.from_tensor_slices((test_sparse_codes, test_loader.get_labels(), test_loader.get_widths()))
 
-        print('{} train videos.'.format(len(train_tf)))
-        print('{} positive videos.'.format(len(list(train_tf.filter(lambda features, label, width: label==1)))))
-        print('{} negative videos.'.format(len(list(train_tf.filter(lambda features, label, width: label==0)))))
-        print('-----------------')
-        print('{} test videos.'.format(len(test_tf)))
-        print('{} positive videos.'.format(len(list(test_tf.filter(lambda features, label, width: label==1)))))
-        print('{} negative videos.'.format(len(list(test_tf.filter(lambda features, label, width: label==0)))))
+        if args.do_regression:
+            train_x = tf.stack(train_sparse_codes)
+            test_x = tf.stack(test_sparse_codes)
+
+            train_y = tf.stack(train_loader.get_widths())
+            test_y = tf.stack(test_loader.get_widths())
+        else:
+            train_x = tf.stack(train_sparse_codes)
+            test_x = tf.stack(test_sparse_codes)
+
+            train_y = tf.stack(train_loader.get_labels())
+            test_y = tf.stack(test_loader.get_labels())
+
+#        print('{} train frames.'.format(len(train_x)))
+#        print('{} positive frames.'.format(len(list(train_y.filter(lambda features, label, width: label==1)))))
+#        print('{} negative frames.'.format(len(list(train_y.filter(lambda features, label, width: label==0)))))
+#        print('-----------------')
+#        print('{} test frames.'.format(len(test_tf)))
+#        print('{} positive frames.'.format(len(list(test_tf.filter(lambda features, label, width: label==1)))))
+#        print('{} negative frames.'.format(len(list(test_tf.filter(lambda features, label, width: label==0)))))
 
 
 #        negative_ds = (
@@ -242,110 +373,63 @@ if __name__ == "__main__":
 
         if args.checkpoint:
             classifier_model = keras.models.load_model(args.checkpoint)
         else:
-            classifier_inputs = keras.Input(shape=(args.num_kernels))
-            classifier_outputs = ONSDMLP()(classifier_inputs)
+            if args.flatten:
+                classifier_inputs = keras.Input(shape=(args.num_kernels * ((image_height - args.kernel_height) // args.stride + 1)))
+            elif args.activations_2d:
+                classifier_inputs = keras.Input(shape=(((image_height - args.kernel_height) // args.stride + 1), args.num_kernels, 1))
+            else:
+                classifier_inputs = keras.Input(shape=(args.num_kernels))
+
+            if args.activations_2d:
+                classifier_outputs = ONSDConv(args.do_regression)(classifier_inputs)
+            else:
+                classifier_outputs = ONSDMLP(args.do_regression)(classifier_inputs)
 
             classifier_model = keras.Model(inputs=classifier_inputs, outputs=classifier_outputs)
-
-        prediction_optimizer = keras.optimizers.Adam(learning_rate=args.lr)
-
-        best_so_far = float('inf')
-
-        class_criterion = keras.losses.BinaryCrossentropy(from_logits=True, reduction=keras.losses.Reduction.SUM)
-#        width_criterion = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.SUM)
-
-        train_losses = []
-        test_losses = []
-
-        train_accuracies = []
-        test_accuracies = []
+
+        if not args.do_regression:
+            criterion = keras.losses.BinaryCrossentropy()
+        else:
+            criterion = keras.losses.MeanSquaredError()
+
+
+        classifier_model.compile(optimizer=keras.optimizers.Adam(learning_rate=args.lr), loss=criterion)
 
-#        train_mse = []
-#        test_mse = []
 
         if args.train:
-            for epoch in range(args.epochs):
-                epoch_loss = 0
-                t1 = time.perf_counter()
-
-#                for images, labels, width in tqdm(balanced_ds.shuffle(len(train_tf)).batch(args.batch_size)):
-#                for images, labels, width in tqdm(balanced_ds.take(len(train_tf)).shuffle(len(train_tf)).batch(args.batch_size)):
-                classifier_model.do_dropout = True
-                for activations, labels, width in train_tf.shuffle(len(train_tf)).batch(args.batch_size):
-                    with tf.GradientTape() as tape:
-                        class_pred = classifier_model(activations)
-                        class_loss = class_criterion(labels, class_pred)
-#                        width_loss = width_criterion(width, width_pred * width_mask)
-                        loss = class_loss
-
-                    epoch_loss += loss * activations.shape[0]
-
-                    gradients = tape.gradient(loss, classifier_model.trainable_weights)
-
-                    prediction_optimizer.apply_gradients(zip(gradients, classifier_model.trainable_weights))
-
-                t2 = time.perf_counter()
-
-                if epoch_loss < best_so_far:
-                    print("found better model")
-                    # Save model parameters
-                    classifier_model.save(os.path.join(output_dir, "best_classifier_{}.pt".format(i_fold)))
-#                    recon_model.save(os.path.join(output_dir, "best_sparse_model_{}.pt".format(i_fold)))
-#                    pickle.dump(prediction_optimizer.get_weights(), open(os.path.join(output_dir, 'optimizer_{}.pt'.format(i_fold)), 'wb+'))
-                    best_so_far = epoch_loss
-
-            classifier_model = keras.models.load_model(os.path.join(output_dir, "best_classifier_{}.pt".format(i_fold)))
-
-        y_true_train = None
-        y_pred_train = None
-
-        y_true_test = None
-        y_pred_test = None
-
-        classifier_model.do_dropout = False
-        for activations, labels, width in train_tf.batch(args.batch_size):
-            pred = classifier_model(activations)
-
-            if y_true_train is None:
-                y_true_train = labels
-                y_pred_train = tf.math.round(tf.math.sigmoid(pred))
-            else:
-                y_true_train = tf.concat((y_true_train, labels), axis=0)
-                y_pred_train = tf.concat((y_pred_train, tf.math.round(tf.math.sigmoid(pred))), axis=0)
-
-        for activations, labels, width in test_tf.batch(args.batch_size):
-            pred = classifier_model(activations)
-
-            if y_true_test is None:
-                y_true_test = labels
-                y_pred_test = tf.math.round(tf.math.sigmoid(pred))
-            else:
-                y_true_test = tf.concat((y_true_test, labels), axis=0)
-                y_pred_test = tf.concat((y_pred_test, tf.math.round(tf.math.sigmoid(pred))), axis=0)
+            classifier_model.fit(train_x, train_y, batch_size=args.batch_size, epochs=args.epochs, verbose=False)
 
-        t2 = time.perf_counter()
-
-        y_true_test = tf.cast(y_true_test, tf.int32)
-        y_pred_test = tf.cast(y_pred_test, tf.int32)
-
-        y_true_train = tf.cast(y_true_train, tf.int32)
-        y_pred_train = tf.cast(y_pred_train, tf.int32)
+        y_true_train = train_y
+        if args.do_regression:
+            y_pred_train = classifier_model.predict(train_x)
+        else:
+            y_pred_train = np.round(classifier_model.predict(train_x))
 
         train_frame_true.append(y_true_train)
         train_frame_pred.append(y_pred_train)
 
+        y_true_test = test_y
+        if args.do_regression:
+            y_pred_test = classifier_model.predict(test_x)
+        else:
+            y_pred_test = np.round(classifier_model.predict(test_x))
+
         test_frame_true.append(y_true_test)
         test_frame_pred.append(y_pred_test)
 
-        f1 = f1_score(y_true_test, y_pred_test, average='macro')
-        accuracy = accuracy_score(y_true_test, y_pred_test)
+        t2 = time.perf_counter()
 
-        train_accuracy = accuracy_score(y_true_train, y_pred_train)
+        if args.do_regression:
+            f1 = 0.0
+            accuracy = mean_absolute_error(y_true_test, y_pred_test)
+            train_accuracy = mean_absolute_error(y_true_train, y_pred_train)
+        else:
+            f1 = f1_score(y_true_test, y_pred_test, average='macro')
+            accuracy = accuracy_score(y_true_test, y_pred_test)
 
-#        test_mae = keras.losses.MeanAbsoluteError()(width_gt, width_p)
-        test_mae = 0.0
+            train_accuracy = accuracy_score(y_true_train, y_pred_train)
 
-        train_accuracies.append(train_accuracy)
-        test_accuracies.append(accuracy)
+#        train_accuracies.append(train_accuracy)
+#        test_accuracies.append(accuracy)
 
         pred_dict = {}
         gt_dict = {}
@@ -362,27 +446,33 @@ if __name__ == "__main__":
 
         test_labels = [vid_f.split('/')[-3] for vid_f in test_videos]
 
         classifier_model.do_dropout = False
-        y_pred, y_true, fn, fp = calculate_onsd_scores(test_videos, test_labels, yolo_model, classifier_model, sparse_model, recon_model, transform, image_width, image_height, 0)
+        max_width = 0
+        if args.do_regression:
+            max_width = dataset.max_width
+        y_pred, y_true, fn, fp = calculate_onsd_scores(test_videos, test_labels, yolo_model, classifier_model, sparse_model, recon_model, transform, crop_width, crop_height, max_width, args.flatten, args.do_regression, args.activations_2d, args.valid_vids, valid_vids)
+#        y_pred, y_true, fn, fp = calculate_onsd_scores_measured(test_videos, yolo_model, classifier_model, sparse_model, recon_model, transform, image_width, image_height)
 
         t2 = time.perf_counter()
 
         print('i_fold={}, time={:.2f}'.format(i_fold, t2-t1))
+
+        if np.size(y_pred):
 
-        y_true = tf.cast(y_true, tf.int32)
-        y_pred = tf.cast(y_pred, tf.int32)
+            y_true = tf.cast(y_true, tf.int32)
+            y_pred = tf.cast(y_pred, tf.int32)
 
-        f1 = f1_score(y_true, y_pred, average='macro')
-        accuracy = accuracy_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred, average='macro')
+            vid_accuracy = accuracy_score(y_true, y_pred)
 
-        video_fn.extend(fn)
-        video_fp.extend(fp)
+            video_fn.extend(fn)
+            video_fp.extend(fp)
 
-        video_true.extend(y_true)
-        video_pred.extend(y_pred)
+            video_true.extend(y_true)
+            video_pred.extend(y_pred)
 
-        print("Test f1={:.2f}, vid_acc={:.2f}".format(f1, accuracy))
+            print("Test f1={:.2f}, vid acc={:.2f}, train acc={:.2f}, test acc={:.2f}".format(f1, vid_accuracy, train_accuracy, accuracy))
 
-        print(confusion_matrix(y_true, y_pred))
+            print(confusion_matrix(y_true, y_pred))
 
 #        plt.clf()
 #        plt.figure()
@@ -418,8 +508,12 @@ if __name__ == "__main__":
     test_frame_true = np.concatenate(test_frame_true)
     test_frame_pred = np.concatenate(test_frame_pred)
 
-    train_frame_acc = accuracy_score(train_frame_true, train_frame_pred)
-    test_frame_acc = accuracy_score(test_frame_true, test_frame_pred)
+    if args.do_regression:
+        train_frame_acc = mean_absolute_error(train_frame_true, train_frame_pred)
+        test_frame_acc = mean_absolute_error(test_frame_true, test_frame_pred)
+    else:
+        train_frame_acc = accuracy_score(train_frame_true, train_frame_pred)
+        test_frame_acc = accuracy_score(test_frame_true, test_frame_pred)
 
     print("Final video accuracy={:.2f}, video f1={:.2f}, frame train accuracy={:.2f}, frame test accuracy={:.2f}".format(final_acc, final_f1, train_frame_acc, test_frame_acc))
     print(final_conf)
diff --git a/sparse_coding_torch/onsd/train_sparse_model.py b/sparse_coding_torch/onsd/train_sparse_model.py
index b3452ab..68f8bd7 100644
--- a/sparse_coding_torch/onsd/train_sparse_model.py
+++ b/sparse_coding_torch/onsd/train_sparse_model.py
@@ -16,6 +16,8 @@ from sparse_coding_torch.utils import plot_filters
 from yolov4.get_bounding_boxes import YoloModel
 import copy
 
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
 def sparse_loss(images, recon, activations, batch_size, lam, stride):
     loss = 0.5 * (1/batch_size) * tf.math.reduce_sum(tf.math.pow(images - recon, 2))
     loss += lam * tf.reduce_mean(tf.math.reduce_sum(tf.math.abs(tf.reshape(activations, (batch_size, -1))), axis=1))
     return loss
@@ -24,24 +26,25 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--batch_size', default=32, type=int)
-    parser.add_argument('--kernel_width', default=150, type=int)
-    parser.add_argument('--kernel_height', default=10, type=int)
+    parser.add_argument('--kernel_width', default=60, type=int)
+    parser.add_argument('--kernel_height', default=30, type=int)
     parser.add_argument('--kernel_depth', default=1, type=int)
-    parser.add_argument('--num_kernels', default=10, type=int)
+    parser.add_argument('--num_kernels', default=16, type=int)
     parser.add_argument('--stride', default=1, type=int)
     parser.add_argument('--max_activation_iter', default=300, type=int)
     parser.add_argument('--activation_lr', default=1e-2, type=float)
     parser.add_argument('--lr', default=0.003, type=float)
-    parser.add_argument('--epochs', default=150, type=int)
-    parser.add_argument('--lam', default=0.05, type=float)
+    parser.add_argument('--epochs', default=200, type=int)
+    parser.add_argument('--lam', default=0.1, type=float)
     parser.add_argument('--output_dir', default='./output', type=str)
     parser.add_argument('--seed', default=42, type=int)
     parser.add_argument('--run_2d', action='store_true')
     parser.add_argument('--save_filters', action='store_true')
     parser.add_argument('--optimizer', default='sgd', type=str)
-    parser.add_argument('--crop_height', type=int, default=100)
+    parser.add_argument('--crop_height', type=int, default=30)
     parser.add_argument('--crop_width', type=int, default=300)
-    parser.add_argument('--scale_factor', type=int, default=2)
+    parser.add_argument('--image_height', type=int, default=30)
+    parser.add_argument('--image_width', type=int, default=250)
     parser.add_argument('--clip_depth', type=int, default=1)
     parser.add_argument('--frames_to_skip', type=int, default=1)
 
@@ -55,8 +58,8 @@ if __name__ == "__main__":
 
     crop_height = args.crop_height
     crop_width = args.crop_width
-    image_height = int(crop_height / args.scale_factor)
-    image_width = int(crop_width / args.scale_factor)
+    image_height = args.image_height
+    image_width = args.image_width
     clip_depth = args.clip_depth
 
     yolo_model = YoloModel('onsd')
@@ -71,7 +74,7 @@ if __name__ == "__main__":
         out_f.write(str(args))
 
 #    splits, dataset = load_onsd_videos(args.batch_size, input_size=(image_height, image_width, clip_depth), mode='all_train')
-    splits, dataset = load_onsd_videos(args.batch_size, input_size=(image_height, image_width), crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode='all_train', n_splits=1)
+    splits, dataset = load_onsd_videos(args.batch_size, crop_size=(crop_height, crop_width), yolo_model=yolo_model, mode='all_train', n_splits=1)
     train_idx, test_idx = list(splits)[0]
 
     train_loader = copy.deepcopy(dataset)
@@ -108,6 +111,15 @@ if __name__ == "__main__":
         filter_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
     else:
         filter_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
+
+    crop_amount = (crop_width - image_width)
+    assert crop_amount % 2 == 0
+    crop_amount = crop_amount // 2
+
+    data_augmentation = keras.Sequential([
+        keras.layers.RandomTranslation(0, 0.08),
+        keras.layers.Cropping2D((0, crop_amount))
+    ])
 
     loss_log = []
     best_so_far = float('inf')
@@ -118,12 +130,16 @@ if __name__ == "__main__":
         epoch_start = time.perf_counter()
 
         num_iters = 0
+
+        average_activations = []
 
         for images, labels, width in tqdm(train_tf.shuffle(len(train_tf)).batch(args.batch_size)):
-            images = tf.expand_dims(tf.transpose(images, [0, 2, 3, 1]), axis=1)
+            images = tf.expand_dims(data_augmentation(tf.transpose(images, [0, 2, 3, 1])), axis=1)
 
             activations = tf.stop_gradient(sparse_model([images, tf.stop_gradient(tf.expand_dims(recon_model.trainable_weights[0], axis=0))]))
 
+            average_activations.append(float(tf.math.count_nonzero(activations)) / float(tf.math.reduce_prod(tf.shape(activations))))
+
             with tf.GradientTape() as tape:
                 recon = recon_model(activations)
                 loss = sparse_loss(images, recon, activations, images.shape[0], args.lam, args.stride)
@@ -159,7 +175,9 @@ if __name__ == "__main__":
             best_so_far = epoch_loss
 
         loss_log.append(epoch_loss)
-        print('epoch={}, epoch_loss={:.2f}, time={:.2f}'.format(epoch, epoch_loss, epoch_end - epoch_start))
+
+        sparsity = np.average(np.array(average_activations))
+        print('epoch={}, epoch_loss={:.2f}, time={:.2f}, average sparsity={:.2f}'.format(epoch, epoch_loss, epoch_end - epoch_start, sparsity))
 
     plt.plot(loss_log)
 
diff --git a/sparse_coding_torch/onsd/video_loader.py b/sparse_coding_torch/onsd/video_loader.py
index 877fe29..74d5d97 100644
--- a/sparse_coding_torch/onsd/video_loader.py
+++ b/sparse_coding_torch/onsd/video_loader.py
@@ -183,7 +183,7 @@ class ONSDGoodFramesLoader:
 
             for start_range, end_range in ranges:
                 for j in range(start_range, end_range, 5):
-                    if j == vc.size(1):
+                    if j >= vc.size(1):
                         break
                     frame = vc[:, j, :, :]
@@ -192,7 +192,7 @@ class ONSDGoodFramesLoader:
                     width_key = txt_label + '/' + width_key
                     width_key = width_key + '/' + str(j) + '.png'
                     if width_key not in onsd_widths:
-                        width = 0
+                        continue
                     else:
                         width = onsd_widths[width_key]
@@ -213,7 +213,7 @@ class ONSDGoodFramesLoader:
             elif label == 'Negatives':
                 label = np.array(0.0)
 
-            width = np.round(width / 30)
+#            width = np.round(width / 30)
 
             for frm in all_frames:
                 self.clips.append((label, frm.numpy(), self.videos[vid_idx][2], width))
@@ -264,7 +264,7 @@ class ONSDGoodFramesLoader:
         return [frame for _, frame, _, _ in self.clips]
 
     def get_widths(self):
-        return [width / self.max_width for _, _, _, width in self.clips]
+        return [width for _, _, _, width in self.clips]
 
     def __next__(self):
         if self.count < len(self.clips):
-- 
GitLab
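
Reviewer's note: the sketch below is not part of the patch. It is a minimal, self-contained illustration of the classifier wiring these eval fixes move to: per-frame sparse-code activations are squeezed and sum-pooled into one feature vector per frame, then fed to a small Keras MLP wrapped in scikeras, with a sigmoid head for classification or a linear head for width regression. Names such as make_mlp and the synthetic tensor shapes are illustrative assumptions, not repo code.

    # Sketch only: synthetic activations stand in for SparseCode output.
    import numpy as np
    import tensorflow as tf
    from tensorflow import keras
    from scikeras.wrappers import KerasClassifier, KerasRegressor

    num_kernels = 16  # matches the new --num_kernels default

    def make_mlp(do_regression):
        # Mirrors ONSDMLP: one small hidden layer; linear output for width
        # regression, sigmoid output for the positive/negative classifier.
        inputs = keras.Input(shape=(num_kernels,))
        x = keras.layers.Dense(8, activation='relu')(inputs)
        if do_regression:
            outputs = keras.layers.Dense(1)(x)
        else:
            outputs = keras.layers.Dense(1, activation='sigmoid')(x)
        return keras.Model(inputs=inputs, outputs=outputs)

    # Assumed activation shape (batch, 1, h, 1, num_kernels): squeezing the
    # singleton axes and summing over the remaining spatial axis yields one
    # num_kernels-dimensional vector per frame, as the eval scripts do.
    acts = tf.random.uniform((32, 1, 5, 1, num_kernels))
    acts = tf.squeeze(acts, axis=1)
    acts = tf.squeeze(acts, axis=2)
    feats = tf.math.reduce_sum(acts, axis=1).numpy()

    labels = np.random.randint(0, 2, size=(32,))          # Positives/Negatives
    widths = np.random.uniform(50.0, 150.0, size=(32,))   # unnormalized widths

    clf = KerasClassifier(make_mlp(False), loss='binary_crossentropy',
                          optimizer='adam', epochs=5, verbose=False)
    clf.fit(feats, labels)

    reg = KerasRegressor(make_mlp(True), loss='mean_squared_error',
                         optimizer='adam', epochs=5, verbose=False)
    reg.fit(feats, widths)

    print(clf.predict(feats).shape, reg.predict(feats).shape)

Because get_widths() now returns raw, unnormalized widths, the video-level regression call averages per-frame width predictions and thresholds at 100, which is why the scripts compare against 100 rather than 100 / max_width.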