Commit 2db31def authored by pjm363 (Philip Monaco)

Merge branch '10-add-faster-processing-of-transformation-of-dataset' into 'main'

Resolve "Add Faster Processing of Transformation of Dataset"

Closes #10

See merge request !5
parents f5a1060e 58eaffa3
EDA.py 0 → 100644
from sklearn.decomposition import PCA
from math import ceil
import numpy as np
import matplotlib.pyplot as plt


def find_mean_img(full_mat, title):
    """Compute and display the average image of a class.

    Args:
        full_mat (np.ndarray): Matrix with one flattened grayscale image per row.
        title (str): Class name used in the plot title.

    Returns:
        np.ndarray: The mean image, reshaped to 300x225.
    """
    # calculate the average over all images
    mean_img = np.mean(full_mat, axis=0)
    # reshape the flat vector back into a matrix
    mean_img = mean_img.reshape((300, 225))
    plt.imshow(mean_img, vmin=0, vmax=255, cmap='Greys_r')
    plt.title(f'Average {title}')
    plt.axis('off')
    plt.show()
    return mean_img


def eigenimages(full_mat, title, n_comp=0.7, size=(300, 225)):
    """Fit a PCA that retains a given fraction of the variance in a class.

    Args:
        full_mat (np.ndarray): Matrix with one flattened grayscale image per row.
        title (str): Class name (kept for a consistent interface).
        n_comp (float, optional): Fraction of variance to retain. Defaults to 0.7.
        size (tuple, optional): Image dimensions. Defaults to (300, 225).

    Returns:
        sklearn.decomposition.PCA: The fitted PCA model.
    """
    # fit PCA with enough components to explain n_comp of the variance
    pca = PCA(n_components=n_comp, whiten=True)
    pca.fit(full_mat)
    print('Number of PC: ', pca.n_components_)
    return pca


def plot_pca(pca, size=(300, 225)):
    """Plot the eigenimages (principal components) in a near-square grid.

    Args:
        pca (sklearn.decomposition.PCA): A fitted PCA model.
        size (tuple, optional): Image dimensions. Defaults to (300, 225).
    """
    n = pca.n_components_
    fig = plt.figure(figsize=(8, 8))
    # choose a grid just large enough to hold n panels
    r = int(n ** .5)
    c = ceil(n / r)
    for i in range(n):
        ax = fig.add_subplot(r, c, i + 1, xticks=[], yticks=[])
        ax.imshow(pca.components_[i].reshape(size), cmap='Greys_r')
    plt.axis('off')
    plt.show()
\ No newline at end of file
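
The three helpers above chain together: a matrix of flattened images feeds find_mean_img and eigenimages, and the fitted model feeds plot_pca. A minimal sketch of that flow, using random pixel data as a stand-in since no real image matrix is assumed here:

import numpy as np
from EDA import find_mean_img, eigenimages, plot_pca

# stand-in for a real image matrix: 50 flattened 300x225 grayscale images
full_mat = np.random.randint(0, 256, size=(50, 300 * 225)).astype(np.float32)

mean_img = find_mean_img(full_mat, 'example class')  # display the average image
pca = eigenimages(full_mat, 'example class')         # keep 70% of the variance
plot_pca(pca)                                        # grid of eigenimages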
Old version (removed):

import os
import cv2  # vision task package opencv-python
import pandas as pd
import glob
import numpy as np


def load_transform_images(folder):
    """Load every .jpg in a data subfolder as a grayscale image.

    Args:
        folder (str): Subfolder of ./data/ to read from.

    Returns:
        list: Grayscale images as numpy arrays.
    """
    images = [cv2.imread(file, flags=cv2.IMREAD_GRAYSCALE)
              for file in glob.glob("./data/" + folder + "/*.jpg")]
    return images


def transform(data):
    df = pd.DataFrame()
    for i, img in enumerate(data):
        # scale pixel values from [0, 255] to [-1, 1]
        scale = (img.astype(np.float32) - 127.5) / 127.5
        scale = scale.reshape(1, -1)
        df = df.append(pd.Series(scale[0]), ignore_index=True)
    return df

New version (added):

import os
import shutil
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import numpy as np


def load_sort_data(meta_filename, image_folder, output_folder):
    """Sort the raw images into one folder per diagnosis label.

    Args:
        meta_filename (str): Name of the metadata CSV inside ./data/.
        image_folder (str): Subfolder of ./data/ holding the source .jpg files.
        output_folder (str): Subfolder of ./data/ that receives one directory per label.

    Returns:
        tuple: The metadata DataFrame and the destination directory path.
    """
    data_dir = os.getcwd() + "/data/"
    dest_dir = data_dir + output_folder
    metadata = pd.read_csv(data_dir + '/' + meta_filename)
    labels = metadata['dx'].unique()
    label_images = []
    for i in labels:
        # recreate the label directory from scratch
        if os.path.exists(dest_dir + str(i) + '/'):
            shutil.rmtree(dest_dir + str(i) + '/')
        os.mkdir(dest_dir + str(i) + '/')
        # collect the ids of every image carrying this label
        sample = metadata[metadata['dx'] == i]['image_id']
        label_images.extend(sample)
        for id in label_images:
            shutil.copyfile((data_dir + image_folder + '/' + id + '.jpg'),
                            (dest_dir + i + '/' + id + '.jpg'))
        label_images = []
    return metadata, dest_dir


def transform(path, size=(300, 225)):
    """Load every .jpg under path and stack them as flattened grayscale rows.

    Args:
        path (str): Directory containing the images.
        size (tuple, optional): Target (height, width). Defaults to (300, 225).

    Returns:
        np.ndarray: Matrix with one flattened grayscale image per row.
    """
    # create a list of image filenames
    img_list = [fn for fn in os.listdir(path) if fn.endswith('.jpg')]
    # iterate over each .jpg
    for fn in img_list:
        fp = path + '/' + fn
        current_image = image.load_img(fp, target_size=size,
                                       color_mode='grayscale')
        # convert the image to a matrix
        img_ts = image.img_to_array(current_image)
        # flatten that matrix into a 1D vector
        img_ts = [img_ts.ravel()]
        try:
            # stack onto the rows collected so far
            full_mat = np.concatenate((full_mat, img_ts))
        except UnboundLocalError:
            # first image: start the matrix
            full_mat = img_ts
    return full_mat

# def batch_data(data):
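
For context, a sketch of how the new helpers compose with EDA.py, assuming the metadata CSV provides the dx and image_id columns the code reads; the module name wrangle and all file and folder names below are assumed examples, not taken from this repository:

from wrangle import load_sort_data, transform  # 'wrangle' is an assumed module name
from EDA import find_mean_img, eigenimages, plot_pca

# copy each image into ./data/sorted/<label>/ keyed on the 'dx' column
metadata, dest_dir = load_sort_data('metadata.csv', 'images', 'sorted/')

# flatten every image for one label (e.g. 'mel') and explore the class
full_mat = transform(dest_dir + 'mel')
find_mean_img(full_mat, 'mel')
plot_pca(eigenimages(full_mat, 'mel'))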
Source diff could not be displayed: it is too large.