Skip to content
Snippets Groups Projects
Commit 497a8fa7 authored by Philip Monaco's avatar Philip Monaco
Browse files

Fix EDA Errors

parent 392ed00e
No related branches found
No related tags found
1 merge request!8Fix EDA Errors
...@@ -2,60 +2,74 @@ from sklearn.decomposition import PCA ...@@ -2,60 +2,74 @@ from sklearn.decomposition import PCA
from math import ceil from math import ceil
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from tqdm import tqdm
def find_mean_img(full_mat, title): def find_mean_img(full_mat):
"""[summary] """Calculates and plots the mean of each pixel in an image matrix.
Args: Args:
full_mat ([type]): [description] full_mat (np.ndarray): Vectorized array of the image matrix.
title ([type]): [description] title (String): Name of the title for the plot.
Returns: Returns:
[type]: [description] matplotlib.plt: A plot of the mean pixels for each disease category.
""" """
cols = 4
rows = len(full_mat)//cols + 1
fig = plt.figure(figsize = (12,6))
for i, mat in zip(range(0,len(full_mat)),full_mat):
# calculate the average # calculate the average
mean_img = np.mean(full_mat, axis = 0) mean_img = np.mean(full_mat[mat], axis = 0)
# reshape it back to a matrix # reshape it back to a matrix
mean_img = mean_img.reshape((300,225)) mean_img = mean_img.reshape((200, 150))
plt.imshow(mean_img, vmin=0, vmax=255, cmap='Greys_r') ax = fig.add_subplot(rows, cols,i+1)
plt.title(f'Average {title}') ax.imshow(mean_img, vmin=0, vmax=255, cmap='Greys_r')
ax.set_title('Average ' + mat)
plt.axis('off') plt.axis('off')
plt.show()
return mean_img
def eigenimages(full_mat, title, n_comp = 0.7, size = (300,225)): plt.tight_layout()
"""[summary]
Args: def plot_pca(pca, title, size = (200, 150)):
full_mat ([type]): [description] """Plots each decomposed PCA image and labels the amount of variability for each image.
title ([type]): [description]
n_comp (float, optional): [description]. Defaults to 0.7.
size (tuple, optional): [description]. Defaults to (300,225).
Returns:
[type]: [description]
"""
# fit PCA to describe n_comp * variability in the class
pca = PCA(n_components = n_comp, whiten = True)
pca.fit(full_mat)
print('Number of PC: ', pca.n_components_)
return pca
def plot_pca(pca, size = (300,225)):
"""[summary]
Args: Args:
pca ([type]): [description] pca (sklearn PCA object): A fitted PCA object.
size (tuple, optional): [description]. Defaults to (300,225). title (String): Title of the plot.
size (tuple, optional): Shape of the image matrix. Defaults to (200, 150).
""" """
# plot eigen images in a grid # plot eigen images in a grid
n = pca.n_components_ n = pca.n_components_
print('Number of PC in ' + title + ':', n)
fig = plt.figure(figsize=(8, 8)) fig = plt.figure(figsize=(8, 8))
fig.suptitle('PCA Components of ' + title)
r = int(n**.5) r = int(n**.5)
c = ceil(n/ r) c = ceil(n/ r)
for i in range(n): for i in range(n):
ax = fig.add_subplot(r, c, i + 1, xticks = [], yticks = []) ax = fig.add_subplot(r, c, i + 1)
ax.imshow(pca.components_[i].reshape(size), ax.imshow(pca.components_[i].reshape(size),
cmap='Greys_r') cmap='Greys_r')
ax.set_title("Variance " + "{0:.2f}%".format(pca.explained_variance_ratio_[i] * 100) )
plt.axis('off') plt.axis('off')
plt.tight_layout()
plt.show() plt.show()
def eigenimages(full_mat,n_comp = 0.7, size = (200, 150)):
"""Creates and fits a PCA estimator from sklearn.
Args:
full_mat (np.ndarray): A vectorized array of images.
n_comp (float, optional): Percentage of desired variability. Defaults to 0.7.
size (tuple, optional): Shape of the image matrix. Defaults to (200, 150).
Returns:
sklearn PCA object: Fitted PCA model.
"""
# fit PCA to describe n_comp * variability in the class
pca = PCA(n_components = n_comp, whiten = True)
pca.fit(full_mat)
return pca
Source diff could not be displayed: it is too large. Options to address this: view the blob.
...@@ -4,6 +4,7 @@ import pandas as pd ...@@ -4,6 +4,7 @@ import pandas as pd
import tensorflow as tf import tensorflow as tf
from tensorflow.keras.preprocessing import image from tensorflow.keras.preprocessing import image
import numpy as np import numpy as np
from tqdm import tqdm
def load_sort_data(meta_filename = str, image_folder = str, output_folder = str): def load_sort_data(meta_filename = str, image_folder = str, output_folder = str):
...@@ -23,13 +24,13 @@ def load_sort_data(meta_filename = str, image_folder = str, output_folder = str) ...@@ -23,13 +24,13 @@ def load_sort_data(meta_filename = str, image_folder = str, output_folder = str)
labels = metadata['dx'].unique() labels = metadata['dx'].unique()
label_images = [] label_images = []
for i in labels: for i in tqdm(labels):
if os.path.exists(dest_dir + str(i) + '/'): if os.path.exists(dest_dir + str(i) + '/'):
shutil.rmtree(dest_dir + str(i) + '/') shutil.rmtree(dest_dir + str(i) + '/')
os.mkdir(dest_dir + str(i) + '/') os.mkdir(dest_dir + str(i) + '/')
sample = metadata[metadata['dx'] == i]['image_id'] sample = metadata[metadata['dx'] == i]['image_id']
label_images.extend(sample) label_images.extend(sample)
for id in label_images: for id in tqdm(label_images):
shutil.copyfile((data_dir + image_folder + '/' + id + '.jpg'), (dest_dir + i + '/' + id + '.jpg')) shutil.copyfile((data_dir + image_folder + '/' + id + '.jpg'), (dest_dir + i + '/' + id + '.jpg'))
label_images = [] label_images = []
...@@ -39,7 +40,7 @@ def transform(path, size = (300, 225)): ...@@ -39,7 +40,7 @@ def transform(path, size = (300, 225)):
# create a list of images # create a list of images
img_list = [fn for fn in os.listdir(path) if fn.endswith('.jpg')] img_list = [fn for fn in os.listdir(path) if fn.endswith('.jpg')]
#iterating over each .jpg #iterating over each .jpg
for fn in img_list: for fn in tqdm(img_list):
fp = path + '/' + fn fp = path + '/' + fn
current_image = image.load_img(fp, target_size = size, current_image = image.load_img(fp, target_size = size,
color_mode = 'grayscale') color_mode = 'grayscale')
......
...@@ -2,3 +2,4 @@ numpy>=1.21.5 ...@@ -2,3 +2,4 @@ numpy>=1.21.5
pandas>=1.3.5 pandas>=1.3.5
tensorflow>=2.8.0 tensorflow>=2.8.0
matplotlib>=3.3.2 matplotlib>=3.3.2
tqdm>=4.0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment