Skip to content
Snippets Groups Projects
Commit 8607c26f authored by Philip Monaco's avatar Philip Monaco
Browse files

Class formation of synthetic datagen

parent e23aa855
Branches
No related tags found
No related merge requests found
Pipeline #1587 passed
Showing
with 294 additions and 0 deletions
import numpy as np
import math
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from subprocess import call
# from utils.data_processing.synthetic_generator import synthetic_dataset
from utils.data_processing.synthetic import SyntheticData
# from utils.data_processing.callbacks import update_samples_or_dataset
# from utils.algorithms.loader import load_algorithm
from bokeh.io import curdoc, show, output_notebook
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
# Seed NumPy's global RNG so every launch starts from the same state.
np.random.seed(0)

# Draw the initial dataset using SyntheticData's default arguments.
data = SyntheticData()
x, y = data.generator()

# Repeat the six-colour palette 20 times (120 entries) so that a class label
# can be used directly as an index into it.
spectral = np.tile(Spectral6, 20)
colors = [spectral[label] for label in y]

# Only the first two feature columns are plotted.
source = ColumnDataSource(
    data={"x": x[:, 0], "y": x[:, 1], "colors": colors}
)

b = figure(title="Some title", width=400, height=400, min_border=0)
glyph = Scatter(x="x", y="y", size=5, fill_color="colors")
b.add_glyph(source, glyph)
# Classifier names offered by the algorithm selector.
clf_algorithms = [
    'Decision Tree',
]

# Dataset names offered by the dataset selector.  Only names that the data
# generator has an active branch for are listed: offering an unsupported name
# makes the generator return None, and the update callback then fails when it
# unpacks the result.  Add "Multilabel Classification" back once the generator
# supports it.
datasets_names = [
    "Make Classification",
    "Blobs",
]

algorithm_select = Select(
    value='Decision Tree',
    title='Select Algorithm:',
    width=200,
    options=clf_algorithms,
)

dataset_select = Select(
    value='Make Classification',
    title='Select Dataset',
    width=200,
    options=datasets_names,
)

samples_slider = Slider(
    title="Number of samples",
    value=1500.0,
    start=200.0,
    end=3000.0,
    step=100,
    width=400,
)

classes_slider = Slider(
    title="Number of Classes",
    value=3,
    start=2,
    end=20,
    step=1,
    width=400,
)

features_slider = Slider(
    title="Number of Features",
    value=3,
    start=2,
    end=1000,
    step=1,
    width=400,
)

# This slider's value is forwarded as the number of informative *features*
# (n_informative), so the title says "Features" rather than "Classes".
inf_slider = Slider(
    title='Informative Features',
    value=3,
    start=2,
    end=100,
    step=1,
    width=400,
)
def _reconcile_feature_counts(n_classes, n_features, n_inf):
    """Return ``(n_features, n_inf)`` adjusted to be mutually consistent.

    The dataset is generated with one cluster per class, so ``2 ** n_inf``
    must be at least ``n_classes``; ``n_inf`` is raised to the smallest value
    that satisfies this.  ``n_features`` is then raised, never lowered, so
    that it is at least ``n_inf``.
    """
    if n_classes > 2 ** n_inf:
        n_inf = math.ceil(math.log2(n_classes))
    return max(n_features, n_inf), n_inf


def update_samples_or_dataset(attrname, old, new):
    """Regenerate the synthetic dataset from the current widget values.

    Registered as an ``on_change`` callback; ``attrname``, ``old`` and ``new``
    are accepted to match that signature but are not used.  The widget values
    are read from the module-level controls, reconciled so the generator's
    constraints hold, written back to the sliders when they had to change,
    and the plot's data source is replaced with the new samples.

    Side effects: rebinds the module-level ``x`` and ``y`` and assigns
    ``source.data``.
    """
    global x, y

    dataset = dataset_select.value
    n_samples = int(samples_slider.value)
    n_classes = int(classes_slider.value)
    requested_features = int(features_slider.value)
    requested_inf = int(inf_slider.value)

    n_features, n_inf = _reconcile_feature_counts(
        n_classes, requested_features, requested_inf
    )

    # Push corrected values back to the UI, but only when they differ.
    # NOTE(review): these sliders are registered on 'value', so a write here
    # presumably re-enters this callback once per changed slider - confirm.
    if n_inf != requested_inf:
        inf_slider.update(value=n_inf)
    if n_features != requested_features:
        features_slider.update(value=n_features)

    data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf)
    x, y = data.generator()

    # Only the first two feature columns are plotted; labels index the palette.
    colors = [spectral[i] for i in y]
    source.data = dict(colors=colors, x=x[:, 0], y=x[:, 1])
# Regenerate the data whenever a control changes.  Each entry pairs a widget
# with the property whose change should trigger the update.
_listeners = (
    (dataset_select, 'value'),
    (samples_slider, 'value_throttled'),
    (classes_slider, 'value_throttled'),
    (features_slider, 'value'),
    (inf_slider, 'value'),
)
for _widget, _event in _listeners:
    _widget.on_change(_event, update_samples_or_dataset)

# set up layout
selects = row(dataset_select, width=420)
inputs = column(
    selects,
    samples_slider,
    classes_slider,
    inf_slider,
    features_slider,
)

# add to document
_doc = curdoc()
_doc.add_root(row(inputs, b))
_doc.title = "Decision Tree"
# Default visual attributes, grouped by model type.
# NOTE(review): presumably loaded as the Bokeh theme for this app - confirm.
attrs:
    # Plot size and a faint light-grey background.
    Figure:
        width: 400
        height: 400
        background_fill_color: 'lightgrey'
        background_fill_alpha: 0.2
    # A null line colour leaves the grid lines undrawn.
    Grid:
        grid_line_color: null
    # Font size for plot titles.
    Title:
        text_font_size: '13px'
\ No newline at end of file
File added
File added
from sklearn import tree
from sklearn.preprocessing import StandardScaler
def load_algorithm(algorithm):
    """Return a new, unfitted estimator for the given algorithm name.

    Args:
        algorithm: Display name of the algorithm.  ``'Decision Tree'`` is the
            only supported name.

    Returns:
        A freshly constructed ``tree.DecisionTreeClassifier``.

    Raises:
        ValueError: If ``algorithm`` is not a supported name.
    """
    # The previous version compared against 'MiniBatchKMeans' (a leftover
    # clustering name) and never returned the model it built.
    if algorithm == 'Decision Tree':
        return tree.DecisionTreeClassifier()
    raise ValueError(f"Unsupported algorithm: {algorithm!r}")
File added
File added
File added
File added
import numpy as np
import math
from utils.data_processing.synthetic import synthetic_dataset
from bokeh.io import curdoc, show, output_notebook
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
# Six-colour palette repeated 20 times (120 entries), indexed by class label.
spectral = np.tile(Spectral6, 20)

# Clusters generated per class; used by the informative-feature check below.
n_clusters_p_class = 1
def _reconcile_feature_counts(n_classes, n_features, n_inf,
                              n_clusters_per_class=1):
    """Return ``(n_features, n_inf)`` adjusted to be mutually consistent.

    ``2 ** n_inf`` must be at least ``n_classes * n_clusters_per_class``;
    ``n_inf`` is raised to the smallest value that satisfies this.
    ``n_features`` is then raised, never lowered, so it is at least ``n_inf``.
    """
    required_clusters = n_classes * n_clusters_per_class
    if required_clusters > 2 ** n_inf:
        # Use the same product the guard tests, not n_classes alone.
        n_inf = math.ceil(math.log2(required_clusters))
    return max(n_features, n_inf), n_inf


def update_samples_or_dataset(attrname, old, new):
    """Regenerate the synthetic dataset from the current widget values.

    Shaped as an ``on_change`` callback; ``attrname``, ``old`` and ``new`` are
    accepted to match that signature but are not used.

    NOTE(review): ``dataset_select``, ``samples_slider``, ``classes_slider``,
    ``features_slider``, ``inf_slider`` and ``source`` are read as module
    globals but are not defined in the visible part of this module - confirm
    they are provided before this callback runs.  Also confirm that
    ``synthetic_dataset`` is exported by the module it is imported from.

    Side effects: rebinds the module-level ``x`` and ``y`` and assigns
    ``source.data``.
    """
    global x, y

    dataset = dataset_select.value
    n_samples = int(samples_slider.value)
    n_classes = int(classes_slider.value)
    requested_features = int(features_slider.value)
    requested_inf = int(inf_slider.value)

    n_features, n_inf = _reconcile_feature_counts(
        n_classes, requested_features, requested_inf, n_clusters_p_class
    )

    # Push corrected values back to the UI only when they actually changed.
    if n_inf != requested_inf:
        inf_slider.update(value=n_inf)
    if n_features != requested_features:
        features_slider.update(value=n_features)

    x, y = synthetic_dataset(dataset, n_samples, n_inf, n_features, n_classes)

    # Only the first two feature columns are plotted; labels index the palette.
    colors = [spectral[i] for i in y]
    source.data = dict(colors=colors, x=x[:, 0], y=x[:, 1])
\ No newline at end of file
import numpy as np
from sklearn import datasets
class SyntheticData:
    """Settings for, and generation of, a synthetic dataset.

    Args:
        dataset: Name of the dataset to generate.  Supported names are
            ``'Make Classification'``, ``'Blobs'`` and ``'No Structure'``.
        n_samples: Number of samples to generate.
        n_features: Total number of features (used by
            ``'Make Classification'`` only).
        n_classes: Number of classes (used by ``'Make Classification'`` only).
        n_inf: Number of informative features (used by
            ``'Make Classification'`` only).
    """

    def __init__(self,
                 dataset='Make Classification',
                 n_samples=1500,
                 n_features=4,
                 n_classes=3,
                 n_inf=2):
        self.dataset = dataset
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_classes = n_classes
        self.n_inf = n_inf

    def generator(self):
        """Generate the configured dataset and return it as ``(X, y)``.

        Returns:
            A two-element tuple.  For ``'No Structure'`` the first element is
            an ``(n_samples, 2)`` array of uniform random values and the
            second is ``None``; for the other datasets the tuple is whatever
            the corresponding ``sklearn.datasets`` function returns.

        Raises:
            ValueError: If ``self.dataset`` is not a supported name.
                Previously this case fell through and returned ``None``,
                which only failed later when callers unpacked the result.
        """
        if self.dataset == 'Blobs':
            # Blobs uses only the sample count; the other settings are ignored.
            return datasets.make_blobs(n_samples=self.n_samples,
                                       random_state=8)
        if self.dataset == 'Make Classification':
            return datasets.make_classification(
                n_samples=self.n_samples,
                n_features=self.n_features,
                n_informative=self.n_inf,
                n_redundant=0,
                n_clusters_per_class=1,
                n_classes=self.n_classes,
                random_state=8,
            )
        if self.dataset == "No Structure":
            # Unlabelled uniform noise: there are no class labels to return.
            return np.random.rand(self.n_samples, 2), None
        raise ValueError(f"Unsupported dataset: {self.dataset!r}")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment