From 054bdaca2e524cedac6a0f0b9f2a71917f1533c8 Mon Sep 17 00:00:00 2001
From: Philip Monaco <philmonaco34@gmail.com>
Date: Thu, 3 Mar 2022 10:43:49 -0500
Subject: [PATCH] Add additional processing for model creation

---
 Project_Notebook.ipynb | 473 +++++++++++++++++++++++++++++++++++++----
 data_processing.py | 6 +-
 2 files changed, 431 insertions(+), 48 deletions(-)

diff --git a/Project_Notebook.ipynb b/Project_Notebook.ipynb
index 3fc1bd6..8a90228 100644
--- a/Project_Notebook.ipynb
+++ b/Project_Notebook.ipynb
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
 "id": "d7e56e0e-7eec-429d-940b-c3337db4b4dc",
 "metadata": {},
 "outputs": [],
@@ -17,6 +17,7 @@
 "from tqdm import tqdm\n",
 "from data_processing import load_sort_data, transform\n",
 "from EDA import find_mean_img, eigenimages, plot_pca\n",
+ "from tensorflow.keras.preprocessing import image_dataset_from_directory\n",
 "import matplotlib.pyplot as plt\n",
 "%matplotlib inline\n"
 ]
@@ -90,7 +91,41 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
+ "id": "52d44e91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_meta = pd.read_csv('./data/ISIC2018_Task3_Training_GroundTruth.csv')\n",
+ "test_meta = pd.read_csv('./data/ISIC2018_Task3_Validation_GroundTruth.csv')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b25db3d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Process metadata and decision labels\n",
+ "lab = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']\n",
+ "\n",
+ "# the original metadata is one-hot encoded, so reverse it into a single label column\n",
+ "train_labels = train_meta[lab].idxmax(axis=1)\n",
+ "test_labels = test_meta[lab].idxmax(axis=1)\n",
+ "\n",
+ "# combine the reversed one-hot labels with the image file names\n",
+ "ptrainmeta = pd.concat([train_meta['image'], train_labels], keys=['image_id', 'dx'], axis=1)\n",
+ "ptestmeta = pd.concat([test_meta['image'], test_labels], keys=['image_id', 'dx'], axis=1)\n",
+ "\n",
+ "# save .csv files that contain the decision labels and image names\n",
+ "ptrainmeta.to_csv('./data/train_metadata.csv')\n",
+ "ptestmeta.to_csv('./data/test_metadata.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
 "id": "b8c4f292",
 "metadata": {},
 "outputs": [
@@ -98,35 +133,60 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "100%|██████████| 1099/1099 [00:22<00:00, 48.04it/s]\n",
- "100%|██████████| 6705/6705 [02:20<00:00, 47.65it/s]\n",
- "100%|██████████| 115/115 [00:02<00:00, 38.72it/s]\n",
- "100%|██████████| 1113/1113 [00:23<00:00, 48.19it/s]\n",
- "100%|██████████| 142/142 [00:03<00:00, 46.04it/s]\n",
- "100%|██████████| 514/514 [00:09<00:00, 51.89it/s]\n",
- "100%|██████████| 327/327 [00:06<00:00, 54.49it/s]\n",
- "100%|██████████| 7/7 [03:32<00:00, 30.35s/it]\n"
+ "100%|██████████| 6705/6705 [01:09<00:00, 97.14it/s]\n",
+ "100%|██████████| 1113/1113 [00:12<00:00, 92.28it/s]\n",
+ "100%|██████████| 1099/1099 [00:10<00:00, 100.47it/s]\n",
+ "100%|██████████| 115/115 [00:01<00:00, 99.05it/s]\n",
+ "100%|██████████| 327/327 [00:02<00:00, 110.73it/s]\n",
+ "100%|██████████| 514/514 [00:04<00:00, 112.84it/s]\n",
+ "100%|██████████| 142/142 [00:01<00:00, 106.61it/s]\n",
+ "100%|██████████| 7/7 [01:46<00:00, 15.18s/it]\n"
 ]
 }
 ],
 "source": [
 "# function takes 3 parameters: metadata filename, the folder of the raw images, and the desired name of the destination directory. 
\n", - "metadata, dest_dir = load_sort_data('HAM10000_metadata', 'ISIC2018_Task3_Training_Input', 'Training_Images_')" + "metadata, dest_dir = load_sort_data('train_metadata.csv', 'ISIC2018_Task3_Training_Input', 'training/')" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, + "id": "87405f6d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 123/123 [00:01<00:00, 86.44it/s]\n", + "100%|██████████| 15/15 [00:00<00:00, 94.34it/s]\n", + "100%|██████████| 22/22 [00:00<00:00, 92.05it/s]\n", + "100%|██████████| 21/21 [00:00<00:00, 114.75it/s]\n", + "100%|██████████| 8/8 [00:00<00:00, 142.86it/s]\n", + "100%|██████████| 1/1 [00:00<00:00, 100.00it/s]\n", + "100%|██████████| 3/3 [00:00<00:00, 85.72it/s]\n", + "100%|██████████| 7/7 [00:02<00:00, 3.11it/s]\n" + ] + } + ], + "source": [ + "metadata, dest_dir = load_sort_data('test_metadata.csv', 'ISIC2018_Task3_Validation_Input', 'test/')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "id": "7e9702c3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'h:\\\\School\\\\Winter 2022\\\\DS Projects\\\\2018\\\\hvm-image-clf/data/Training_Images_'" + "'h:\\\\School\\\\Winter 2022\\\\DS Projects\\\\2018\\\\hvm-image-clf/data/training/Training_Images_'" ] }, - "execution_count": 3, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -148,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 26, "id": "0ba9148a", "metadata": {}, "outputs": [ @@ -188,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 27, "id": "e6d378d5", "metadata": {}, "outputs": [ @@ -220,7 +280,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "<ipython-input-5-f268d13f0828>:5: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n", + "<ipython-input-27-f268d13f0828>:5: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n", " metadata.drop('dx',1).isna().groupby(\n" ] } @@ -240,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "id": "91aa284b", "metadata": {}, "outputs": [ @@ -286,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 29, "id": "05398a91", "metadata": {}, "outputs": [ @@ -294,7 +354,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 327/327 [00:02<00:00, 139.03it/s]\n" + "100%|██████████| 327/327 [00:03<00:00, 92.87it/s] \n" ] } ], @@ -305,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 21, "id": "e8642d8d", "metadata": {}, "outputs": [ @@ -313,7 +373,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 514/514 [00:06<00:00, 76.53it/s] \n" + "100%|██████████| 15/15 [00:00<00:00, 93.75it/s]\n" ] } ], @@ -323,7 +383,34 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 22, + "id": "2060e363", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[179., 185., 183., ..., 197., 192., 190.],\n", + " [163., 167., 168., ..., 135., 128., 119.],\n", + " [170., 168., 167., ..., 179., 178., 182.],\n", + " ...,\n", + " [198., 199., 195., ..., 175., 176., 176.],\n", + " [153., 150., 153., ..., 123., 115., 106.],\n", + " [179., 182., 184., ..., 178., 178., 180.]], dtype=float32)" + ] + }, + "execution_count": 22, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bcc_images" + ] + }, + { + "cell_type": "code", + "execution_count": 31, "id": "5312b5de", "metadata": {}, "outputs": [ @@ -331,7 +418,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1099/1099 [00:21<00:00, 51.53it/s]\n" + "100%|██████████| 1099/1099 [00:24<00:00, 45.15it/s]\n" ] } ], @@ -341,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 32, "id": "49338970", "metadata": {}, "outputs": [ @@ -349,7 +436,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 115/115 [00:01<00:00, 111.11it/s]\n" + "100%|██████████| 115/115 [00:01<00:00, 107.58it/s]\n" ] } ], @@ -359,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 33, "id": "784d69cd", "metadata": {}, "outputs": [ @@ -367,7 +454,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1113/1113 [00:21<00:00, 50.65it/s]\n" + "100%|██████████| 1113/1113 [00:24<00:00, 46.00it/s]\n" ] } ], @@ -377,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 34, "id": "6cd167a7", "metadata": {}, "outputs": [ @@ -385,7 +472,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 6705/6705 [08:49<00:00, 12.65it/s]\n" + "100%|██████████| 6705/6705 [09:36<00:00, 11.63it/s]\n" ] } ], @@ -396,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 35, "id": "4de5cec3", "metadata": {}, "outputs": [ @@ -404,7 +491,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 142/142 [00:01<00:00, 103.95it/s]\n" + "100%|██████████| 142/142 [00:01<00:00, 92.27it/s]\n" ] } ], @@ -414,23 +501,23 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "id": "d92158fa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[168., 164., 163., ..., 141., 140., 139.],\n", - " [139., 145., 146., ..., 160., 160., 160.],\n", - " [171., 174., 172., ..., 184., 188., 189.],\n", + "array([[171., 170., 166., ..., 163., 170., 171.],\n", + " [158., 163., 163., ..., 189., 187., 174.],\n", + " [173., 177., 176., ..., 184., 185., 183.],\n", " ...,\n", - " [104., 101., 98., ..., 34., 31., 29.],\n", - " [ 84., 84., 88., ..., 10., 8., 9.],\n", - " [157., 157., 154., ..., 156., 157., 155.]], dtype=float32)" + " [113., 112., 111., ..., 99., 104., 103.],\n", + " [118., 120., 128., ..., 162., 148., 149.],\n", + " [157., 153., 153., ..., 172., 175., 175.]], dtype=float32)" ] }, - "execution_count": 37, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1182,17 +1269,313 @@ "id": "3157d03a", "metadata": {}, "source": [ - "# 4. Data Processing for Model Ingestion" + "# 5. Model Creation" ] }, { - "cell_type": "markdown", - "id": "64adf033", + "cell_type": "code", + "execution_count": 5, + "id": "5f0f70da", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 10015 files belonging to 7 classes.\n" + ] + } + ], "source": [ - "# 5. 
Model Creation" + "#make test set\n", + "seed = 12345\n", + "training_data = image_dataset_from_directory(\n", + " directory = './data/training/',\n", + " labels = 'inferred',\n", + " label_mode = 'categorical',\n", + " class_names = lab,\n", + " batch_size = 32,\n", + " image_size = (600,450),\n", + " shuffle = True,\n", + " seed = seed,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b8213cb9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 193 files belonging to 7 classes.\n" + ] + } + ], + "source": [ + "test_data = image_dataset_from_directory(\n", + " directory = './data/test/',\n", + " labels = 'inferred',\n", + " label_mode = 'categorical',\n", + " class_names = lab,\n", + " batch_size = 32,\n", + " image_size = (600,450),\n", + " shuffle = True,\n", + " seed = seed,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f1be00cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<BatchDataset element_spec=(TensorSpec(shape=(None, 600, 450, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 7), dtype=tf.float32, name=None))>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f8524747", + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras import datasets,layers, models" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "25f8bceb", + "metadata": {}, + "outputs": [], + "source": [ + "model = models.Sequential()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "17a95bfb", + "metadata": {}, + "outputs": [], + "source": [ + "model.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=(600,450,3)))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2c9a3491", + "metadata": {}, + "outputs": [], + "source": [ + "model.add(layers.AveragePooling2D(2,2))\n", + "model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", + "model.add(layers.AveragePooling2D((2, 2)))\n", + "model.add(layers.Conv2D(64, (3, 3), activation='relu'))" ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d668b5de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " conv2d (Conv2D) (None, 598, 448, 32) 896 \n", + " \n", + " average_pooling2d (AverageP (None, 299, 224, 32) 0 \n", + " ooling2D) \n", + " \n", + " conv2d_1 (Conv2D) (None, 297, 222, 64) 18496 \n", + " \n", + " average_pooling2d_1 (Averag (None, 148, 111, 64) 0 \n", + " ePooling2D) \n", + " \n", + " conv2d_2 (Conv2D) (None, 146, 109, 64) 36928 \n", + " \n", + "=================================================================\n", + "Total params: 56,320\n", + "Trainable params: 56,320\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9fef7a20", + "metadata": {}, + "outputs": [], + "source": [ + "model.add(layers.Flatten())\n", + "model.add(layers.Dense(64, activation='relu'))\n", + "model.add(layers.Dense(10))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "id": "b2c2de4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " conv2d (Conv2D) (None, 598, 448, 32) 896 \n", + " \n", + " average_pooling2d (AverageP (None, 299, 224, 32) 0 \n", + " ooling2D) \n", + " \n", + " conv2d_1 (Conv2D) (None, 297, 222, 64) 18496 \n", + " \n", + " average_pooling2d_1 (Averag (None, 148, 111, 64) 0 \n", + " ePooling2D) \n", + " \n", + " conv2d_2 (Conv2D) (None, 146, 109, 64) 36928 \n", + " \n", + " flatten (Flatten) (None, 1018496) 0 \n", + " \n", + " dense (Dense) (None, 64) 65183808 \n", + " \n", + " dense_1 (Dense) (None, 10) 650 \n", + " \n", + "=================================================================\n", + "Total params: 65,240,778\n", + "Trainable params: 65,240,778\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "25c98f80", + "metadata": {}, + "outputs": [], + "source": [ + "model.compile(optimizer='adam', \n", + " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " metrics=['accuracy'])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "acd6773b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " conv2d (Conv2D) (None, 598, 448, 32) 896 \n", + " \n", + " average_pooling2d (AverageP (None, 299, 224, 32) 0 \n", + " ooling2D) \n", + " \n", + " conv2d_1 (Conv2D) (None, 297, 222, 64) 18496 \n", + " \n", + " average_pooling2d_1 (Averag (None, 148, 111, 64) 0 \n", + " ePooling2D) \n", + " \n", + " conv2d_2 (Conv2D) (None, 146, 109, 64) 36928 \n", + " \n", + " flatten (Flatten) (None, 1018496) 0 \n", + " \n", + " dense (Dense) (None, 64) 65183808 \n", + " \n", + " dense_1 (Dense) (None, 10) 650 \n", + " \n", + "=================================================================\n", + "Total params: 65,240,778\n", + "Trainable params: 65,240,778\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ffe382a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n" + ] + } + ], + "source": [ + "history = model.fit(training_data, epochs=10, validation_data = (test_data))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fc861d9", + "metadata": {}, + "outputs": [], + "source": [ + "history" + ] + }, + { + "cell_type": "markdown", + "id": "64adf033", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "id": "c3114115", @@ -1229,7 +1612,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.9.0" } }, "nbformat": 4, diff --git a/data_processing.py b/data_processing.py index 37f6849..cd831cd 100644 --- a/data_processing.py +++ b/data_processing.py @@ -9,19 +9,19 @@ 
import numpy as np
 from tqdm import tqdm
 
-def load_sort_data(meta_filename = str, image_folder = str, output_folder = str):
+def load_sort_data(meta_filename = str, image_folder = str, sub_dir = str):
     """[summary]
 
     Args:
         meta_filename ([type], optional): [description]. Defaults to str.
         image_folder ([type], optional): [description]. Defaults to str.
-        output_folder ([type], optional): [description]. Defaults to str.
+        sub_dir (str, optional): Parent directory of the output folders.
 
     Returns:
         [type]: [description]
     """
     data_dir = os.getcwd() + "/data/"
-    dest_dir = data_dir + output_folder
+    dest_dir = data_dir + sub_dir
     metadata = pd.read_csv(data_dir + '/' + meta_filename)
     labels = metadata['dx'].unique()
     label_images = []
-- 
GitLab
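
For reference, the end-to-end flow this patch assembles (class-sorted image folders read with image_dataset_from_directory, one-hot labels via label_mode='categorical', and a small CNN over 600x450 RGB inputs) condenses to the sketch below. This is a minimal illustrative sketch mirroring the notebook cells above, not the committed code; the paths, class list, and hyperparameters are taken from the notebook.

    import tensorflow as tf
    from tensorflow.keras import layers, models
    from tensorflow.keras.preprocessing import image_dataset_from_directory

    lab = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']  # the 7 lesion classes

    # directories produced by load_sort_data(); one sub-folder per class label
    training_data = image_dataset_from_directory(
        './data/training/', labels='inferred', label_mode='categorical',
        class_names=lab, batch_size=32, image_size=(600, 450),
        shuffle=True, seed=12345)
    test_data = image_dataset_from_directory(
        './data/test/', labels='inferred', label_mode='categorical',
        class_names=lab, batch_size=32, image_size=(600, 450),
        shuffle=True, seed=12345)

    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(600, 450, 3)),
        layers.AveragePooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.AveragePooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(len(lab)),  # raw logits, one per class
    ])

    # one-hot labels pair with CategoricalCrossentropy; from_logits=True because
    # the final Dense layer applies no softmax
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = model.fit(training_data, epochs=10, validation_data=test_data)

Note that image_size is (height, width), so (600, 450) matches the Conv2D input_shape of (600, 450, 3), and with from_logits=True the softmax is folded into the loss computation, which is numerically more stable than adding a softmax activation on the final layer.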