Commit 2db31def authored by pjm363 (Philip Monaco)

Merge branch '10-add-faster-processing-of-transformation-of-dataset' into 'main'

Resolve "Add Faster Processing of Transformation of Dataset"

Closes #10

See merge request !5
parents f5a1060e 58eaffa3
EDA.py 0 → 100644
from sklearn.decomposition import PCA
from math import ceil
import numpy as np
import matplotlib.pyplot as plt


def find_mean_img(full_mat, title):
    """Compute and display the average image of a class.

    Args:
        full_mat (np.ndarray): Matrix with one flattened grayscale image per row.
        title (str): Class name used in the plot title.

    Returns:
        np.ndarray: The mean image, reshaped to 300x225.
    """
    # calculate the average over all images
    mean_img = np.mean(full_mat, axis=0)
    # reshape the flat vector back into a matrix
    mean_img = mean_img.reshape((300, 225))
    plt.imshow(mean_img, vmin=0, vmax=255, cmap='Greys_r')
    plt.title(f'Average {title}')
    plt.axis('off')
    plt.show()
    return mean_img


def eigenimages(full_mat, title, n_comp=0.7, size=(300, 225)):
    """Fit a PCA that retains a given fraction of the variance in a class.

    Args:
        full_mat (np.ndarray): Matrix with one flattened grayscale image per row.
        title (str): Class name (kept for a consistent interface).
        n_comp (float, optional): Fraction of variance to retain. Defaults to 0.7.
        size (tuple, optional): Image dimensions. Defaults to (300, 225).

    Returns:
        sklearn.decomposition.PCA: The fitted PCA model.
    """
    # fit PCA with enough components to explain n_comp of the variance
    pca = PCA(n_components=n_comp, whiten=True)
    pca.fit(full_mat)
    print('Number of PC: ', pca.n_components_)
    return pca


def plot_pca(pca, size=(300, 225)):
    """Plot the eigenimages (principal components) in a near-square grid.

    Args:
        pca (sklearn.decomposition.PCA): A fitted PCA model.
        size (tuple, optional): Image dimensions. Defaults to (300, 225).
    """
    n = pca.n_components_
    fig = plt.figure(figsize=(8, 8))
    # choose a grid just large enough to hold n panels
    r = int(n ** .5)
    c = ceil(n / r)
    for i in range(n):
        ax = fig.add_subplot(r, c, i + 1, xticks=[], yticks=[])
        ax.imshow(pca.components_[i].reshape(size), cmap='Greys_r')
    plt.axis('off')
    plt.show()
\ No newline at end of file
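
The three helpers above chain together: a matrix of flattened images feeds find_mean_img and eigenimages, and the fitted model feeds plot_pca. A minimal sketch of that flow, using random pixel data as a stand-in since no real image matrix is assumed here:

import numpy as np
from EDA import find_mean_img, eigenimages, plot_pca

# stand-in for a real image matrix: 50 flattened 300x225 grayscale images
full_mat = np.random.randint(0, 256, size=(50, 300 * 225)).astype(np.float32)

mean_img = find_mean_img(full_mat, 'example class')  # display the average image
pca = eigenimages(full_mat, 'example class')         # keep 70% of the variance
plot_pca(pca)                                        # grid of eigenimages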
Old version (removed):

import os
import cv2  # vision task package opencv-python
import pandas as pd
import glob
import numpy as np


def load_transform_images(folder):
    """Load every .jpg in a data subfolder as a grayscale image.

    Args:
        folder (str): Subfolder of ./data/ to read from.

    Returns:
        list: Grayscale images as numpy arrays.
    """
    images = [cv2.imread(file, flags=cv2.IMREAD_GRAYSCALE)
              for file in glob.glob("./data/" + folder + "/*.jpg")]
    return images


def transform(data):
    df = pd.DataFrame()
    for i, img in enumerate(data):
        # scale pixel values from [0, 255] to [-1, 1]
        scale = (img.astype(np.float32) - 127.5) / 127.5
        scale = scale.reshape(1, -1)
        df = df.append(pd.Series(scale[0]), ignore_index=True)
    return df

New version (added):

import os
import shutil
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import numpy as np


def load_sort_data(meta_filename, image_folder, output_folder):
    """Sort the raw images into one folder per diagnosis label.

    Args:
        meta_filename (str): Name of the metadata CSV inside ./data/.
        image_folder (str): Subfolder of ./data/ holding the source .jpg files.
        output_folder (str): Subfolder of ./data/ that receives one directory per label.

    Returns:
        tuple: The metadata DataFrame and the destination directory path.
    """
    data_dir = os.getcwd() + "/data/"
    dest_dir = data_dir + output_folder
    metadata = pd.read_csv(data_dir + '/' + meta_filename)
    labels = metadata['dx'].unique()
    label_images = []
    for i in labels:
        # recreate the label directory from scratch
        if os.path.exists(dest_dir + str(i) + '/'):
            shutil.rmtree(dest_dir + str(i) + '/')
        os.mkdir(dest_dir + str(i) + '/')
        # collect the ids of every image carrying this label
        sample = metadata[metadata['dx'] == i]['image_id']
        label_images.extend(sample)
        for id in label_images:
            shutil.copyfile((data_dir + image_folder + '/' + id + '.jpg'),
                            (dest_dir + i + '/' + id + '.jpg'))
        label_images = []
    return metadata, dest_dir


def transform(path, size=(300, 225)):
    """Load every .jpg under path and stack them as flattened grayscale rows.

    Args:
        path (str): Directory containing the images.
        size (tuple, optional): Target (height, width). Defaults to (300, 225).

    Returns:
        np.ndarray: Matrix with one flattened grayscale image per row.
    """
    # create a list of image filenames
    img_list = [fn for fn in os.listdir(path) if fn.endswith('.jpg')]
    # iterate over each .jpg
    for fn in img_list:
        fp = path + '/' + fn
        current_image = image.load_img(fp, target_size=size,
                                       color_mode='grayscale')
        # convert the image to a matrix
        img_ts = image.img_to_array(current_image)
        # flatten that matrix into a 1D vector
        img_ts = [img_ts.ravel()]
        try:
            # stack onto the rows collected so far
            full_mat = np.concatenate((full_mat, img_ts))
        except UnboundLocalError:
            # first image: start the matrix
            full_mat = img_ts
    return full_mat

# def batch_data(data):
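
For context, a sketch of how the new helpers compose with EDA.py, assuming the metadata CSV provides the dx and image_id columns the code reads; the module name wrangle and all file and folder names below are assumed examples, not taken from this repository:

from wrangle import load_sort_data, transform  # 'wrangle' is an assumed module name
from EDA import find_mean_img, eigenimages, plot_pca

# copy each image into ./data/sorted/<label>/ keyed on the 'dx' column
metadata, dest_dir = load_sort_data('metadata.csv', 'images', 'sorted/')

# flatten every image for one label (e.g. 'mel') and explore the class
full_mat = transform(dest_dir + 'mel')
find_mean_img(full_mat, 'mel')
plot_pca(eigenimages(full_mat, 'mel'))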
Source diff could not be displayed: it is too large.