Skip to content
Snippets Groups Projects
Commit 58eaffa3 authored by Philip Monaco
Browse files

Add faster processing of images and some image EDA

parent f5a1060e
Branches
No related tags found
1 merge request: !5 Resolve "Add Faster Processing of Transformation of Dataset"
EDA.py 0 → 100644
from sklearn.decomposition import PCA
from math import ceil
import numpy as np
import matplotlib.pyplot as plt
def find_mean_img(full_mat, title, size = (300, 225)):
    """Compute, display and return the pixel-wise mean image of a class.

    Args:
        full_mat (array-like): Stack of flattened images; the mean is taken
            along axis 0, so each row is presumably one image — confirm
            against the caller.
        title (str): Label interpolated into the plot title
            (``'Average <title>'``).
        size (tuple, optional): Shape the averaged vector is reshaped to
            before plotting. Must multiply out to the length of one row of
            ``full_mat``. Defaults to (300, 225).

    Returns:
        numpy.ndarray: The mean image reshaped to ``size``.
    """
    # Average over the image axis, then fold the flat vector back into 2-D.
    mean_img = np.mean(full_mat, axis=0).reshape(size)
    # Fixed vmin/vmax keep the grey scale comparable between classes.
    plt.imshow(mean_img, vmin=0, vmax=255, cmap='Greys_r')
    plt.title(f'Average {title}')
    plt.axis('off')
    plt.show()
    return mean_img
def eigenimages(full_mat, title, n_comp = 0.7, size = (300,225)):
    """Fit a whitened PCA model to a stack of flattened images.

    Args:
        full_mat (array-like): Data handed to ``PCA.fit``; presumably one
            flattened image per row — confirm against the caller.
        title (str): Accepted for interface compatibility; not used by the
            body.
        n_comp (float, optional): Forwarded as ``n_components`` to ``PCA``.
            Defaults to 0.7.
        size (tuple, optional): Accepted for interface compatibility; not
            used by the body. Defaults to (300,225).

    Returns:
        sklearn.decomposition.PCA: The fitted model.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    model = PCA(n_components=n_comp, whiten=True).fit(full_mat)
    print('Number of PC: ', model.n_components_)
    return model
def plot_pca(pca, size = (300,225)):
    """Show the principal components of a fitted PCA as a grid of images.

    Args:
        pca: Fitted model exposing ``n_components_`` and ``components_``.
        size (tuple, optional): Shape each component vector is reshaped to
            before drawing. Defaults to (300,225).
    """
    total = pca.n_components_
    figure = plt.figure(figsize=(8, 8))
    # Near-square layout: floor(sqrt(total)) rows, enough columns to fit all.
    rows = int(total ** .5)
    cols = ceil(total / rows)
    for position in range(1, total + 1):
        axes = figure.add_subplot(rows, cols, position, xticks=[], yticks=[])
        eigenimage = pca.components_[position - 1].reshape(size)
        axes.imshow(eigenimage, cmap='Greys_r')
    plt.axis('off')
    plt.show()
\ No newline at end of file
import os
import cv2 #vision task package opencv-python
import shutil
import pandas as pd
import glob
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import numpy as np
def load_transform_images(folder):
    """Read every JPEG in ``./data/<folder>`` as a grayscale image.

    Args:
        folder (str): Sub-directory of ``./data`` (relative to the current
            working directory) that is searched for ``*.jpg`` files.

    Returns:
        list: One entry per matched file, as returned by ``cv2.imread`` with
        ``cv2.IMREAD_GRAYSCALE``, in the order ``glob.glob`` yields them.
    """
    pattern = "./data/" + folder + "/*.jpg"
    return [cv2.imread(file, flags=cv2.IMREAD_GRAYSCALE)
            for file in glob.glob(pattern)]


def load_sort_data(meta_filename: str, image_folder: str, output_folder: str):
    """Copy images into one sub-folder per ``dx`` label listed in the metadata.

    For every distinct value in the ``dx`` column, the matching label folder
    is removed if it already exists, recreated, and filled with copies of the
    ``<image_id>.jpg`` files for that label.

    Args:
        meta_filename (str): CSV file inside ``<cwd>/data/`` providing the
            ``dx`` and ``image_id`` columns.
        image_folder (str): Folder inside ``<cwd>/data/`` holding the source
            ``.jpg`` files.
        output_folder (str): Prefix, relative to ``<cwd>/data/``, onto which
            each label name is appended directly. NOTE(review): no separator
            is inserted, so this presumably should end with ``/`` — confirm.

    Returns:
        tuple: ``(metadata, dest_dir)`` — the loaded DataFrame and the
        destination prefix string.
    """
    data_dir = os.getcwd() + "/data/"
    dest_dir = data_dir + output_folder
    metadata = pd.read_csv(data_dir + '/' + meta_filename)
    for label in metadata['dx'].unique():
        label_dir = dest_dir + str(label) + '/'
        # Start from a clean folder so stale copies from earlier runs vanish.
        if os.path.exists(label_dir):
            shutil.rmtree(label_dir)
        os.makedirs(label_dir)
        for image_id in metadata[metadata['dx'] == label]['image_id']:
            shutil.copyfile(data_dir + image_folder + '/' + image_id + '.jpg',
                            label_dir + image_id + '.jpg')
    return metadata, dest_dir


# NOTE(review): another function named ``transform`` (taking ``path`` and
# ``size``) appears further down; if both live in the same module the later
# definition replaces this one at import time — consider renaming one.
def transform(data):
    """Rescale images and flatten each one into a row of a DataFrame.

    Every image is converted to ``float32`` and mapped through
    ``(pixel - 127.5) / 127.5``, which sends pixel values 0..255 to -1..1,
    then flattened to one dimension.

    Args:
        data (iterable): Images as NumPy arrays, e.g. the list produced by
            ``load_transform_images``.

    Returns:
        pandas.DataFrame: One row per image, in input order, with a 0..n-1
        index; empty when ``data`` is empty.
    """
    # Collect all rows and build the frame once: appending to a DataFrame per
    # image copies the whole frame each time, and ``DataFrame.append`` is no
    # longer available in pandas 2.x.
    rows = [
        pd.Series(((img.astype(np.float32) - 127.5) / 127.5).reshape(-1))
        for img in data
    ]
    return pd.DataFrame(rows)
def transform(path, size = (300, 225)):
    """Load every ``.jpg`` in a directory as grayscale and stack them row-wise.

    Args:
        path (str): Directory whose direct entries ending in ``.jpg`` are
            loaded.
        size (tuple, optional): Passed as ``target_size`` to
            ``image.load_img``. Defaults to (300, 225).

    Returns:
        numpy.ndarray: 2-D array with one flattened image per row, in
        ``os.listdir`` order. When the directory holds no ``.jpg`` file an
        empty array with zero rows is returned instead of raising.
    """
    rows = []
    for fn in os.listdir(path):
        if not fn.endswith('.jpg'):
            continue
        current_image = image.load_img(path + '/' + fn, target_size=size,
                                       color_mode='grayscale')
        # Convert the image to an array and flatten it to a 1-D vector.
        rows.append(image.img_to_array(current_image).ravel())
    if not rows:
        # Assumes one channel per pixel (grayscale) and float32 output from
        # img_to_array, so a row would hold size[0] * size[1] values —
        # TODO confirm.
        return np.empty((0, size[0] * size[1]), dtype=np.float32)
    # Stack once at the end: concatenating inside the loop re-copies every
    # previously loaded image on each iteration.
    return np.vstack(rows)
# def batch_data(data):
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment