"""Bokeh application: interactive synthetic-dataset explorer for a
decision-tree demo.

Draws a scatter plot of a generated classification dataset and exposes
Select/Slider widgets that regenerate the data on change.  Run under a
Bokeh server (``bokeh serve``); ``curdoc()`` wires the layout in at the
bottom of the module.
"""
import math

import numpy as np

from utils.data_processing.synthetic import SyntheticData

# NOTE(review): unused leftovers from the clustering template
# (sklearn cluster / kneighbors_graph / train_test_split / export_graphviz /
# subprocess.call) were dropped; nothing in this module referenced them.
from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Scatter, Select, Slider
from bokeh.palettes import Spectral6
from bokeh.plotting import figure

np.random.seed(0)

# BUG FIX: the original built the first dataset with SyntheticData's
# defaults (n_features=4, n_inf=2), which did not match the sliders'
# initial values (3/3) defined below; pass them explicitly so the first
# render agrees with the widget state.
data = SyntheticData('Make Classification',
                     n_samples=1500,
                     n_features=3,
                     n_classes=3,
                     n_inf=3)
x, y = data.generator()

# Six base colours tiled 20x -> 120 entries, enough for the classes
# slider's maximum of 20 classes.
spectral = np.hstack([Spectral6] * 20)
colors = [spectral[i] for i in y]

source = ColumnDataSource(dict(x=x[:, 0], y=x[:, 1], colors=colors))

b = figure(title="Some title", width=400, height=400, min_border=0)
glyph = Scatter(x="x", y="y", size=5, fill_color="colors")
b.add_glyph(source, glyph)

clf_algorithms = [
    'Decision Tree',
]

# BUG FIX: 'Multilabel Classification' was offered here, but
# SyntheticData.generator() has no branch for it (that branch is commented
# out), so selecting it returned None and crashed on tuple unpacking.
# Only list datasets the generator actually supports.
datasets_names = [
    "Make Classification",
    "Blobs",
]

algorithm_select = Select(value='Decision Tree',
                          title='Select Algorithm:',
                          width=200,
                          options=clf_algorithms)

dataset_select = Select(value='Make Classification',
                        title='Select Dataset',
                        width=200,
                        options=datasets_names)

# Slider bounds are integer quantities; the original used floats
# (1500.0/200.0/3000.0) and re-cast with int() in the callback.
samples_slider = Slider(title="Number of samples",
                        value=1500,
                        start=200,
                        end=3000,
                        step=100,
                        width=400)

classes_slider = Slider(title="Number of Classes",
                        value=3,
                        start=2,
                        end=20,
                        step=1,
                        width=400)

features_slider = Slider(title="Number of Features",
                         value=3,
                         start=2,
                         end=1000,
                         step=1,
                         width=400)

inf_slider = Slider(title='Informative Classes',
                    value=3,
                    start=2,
                    end=100,
                    step=1,
                    width=400)


def update_samples_or_dataset(attrname, old, new):
    """Regenerate the dataset from the current widget values and push the
    new points/colours into ``source`` (Bokeh redraws the plot).

    The signature follows Bokeh's ``on_change`` callback contract
    (attribute name, old value, new value); the actual inputs are read
    from the widgets directly.
    """
    global x, y

    dataset = dataset_select.value
    n_samples = int(samples_slider.value)
    n_classes = int(classes_slider.value)
    n_features = int(features_slider.value)
    n_inf = int(inf_slider.value)

    # make_classification requires n_informative <= n_features; clamp and
    # reflect the clamp back into the widget.
    if n_inf > n_features:
        n_features = n_inf
        features_slider.update(value=n_inf)

    # make_classification requires n_classes * n_clusters_per_class
    # <= 2**n_informative (n_clusters_per_class is 1 here); raise n_inf to
    # the smallest exponent that satisfies it.
    if n_classes > 2 ** n_inf:
        n_inf = math.ceil(math.log2(n_classes))
        n_features = n_inf

        inf_slider.update(value=n_inf)
        features_slider.update(value=n_features)

    data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf)
    x, y = data.generator()
    colors = [spectral[i] for i in y]

    source.data = dict(colors=colors, x=x[:, 0], y=x[:, 1])


dataset_select.on_change('value', update_samples_or_dataset)
# 'value_throttled' on the expensive sliders: only regenerate when the user
# releases the handle, not on every drag tick.
samples_slider.on_change('value_throttled', update_samples_or_dataset)
classes_slider.on_change('value_throttled', update_samples_or_dataset)
features_slider.on_change('value', update_samples_or_dataset)
inf_slider.on_change('value', update_samples_or_dataset)

# Layout: widgets in a column on the left, plot on the right.
selects = row(dataset_select, width=420)
inputs = column(selects, samples_slider, classes_slider, inf_slider,
                features_slider)

# add to document
curdoc().add_root(row(inputs, b))
curdoc().title = "Decision Tree"
from sklearn import tree


def load_algorithm(algorithm):
    """Build and return an unfitted estimator for *algorithm*.

    Parameters
    ----------
    algorithm : str
        Algorithm name as shown in the UI select widget; only
        'Decision Tree' is currently supported.

    Returns
    -------
    An unfitted scikit-learn estimator, or ``None`` when *algorithm* is
    not recognised (preserves the original's implicit-None fallthrough
    for any existing callers).
    """
    # BUG FIX: the original tested for 'MiniBatchKMeans' (leftover from the
    # clustering example this file was adapted from) even though the UI only
    # offers 'Decision Tree', and it never returned the model it built --
    # the function always returned None.
    if algorithm == 'Decision Tree':
        return tree.DecisionTreeClassifier()

    # Unknown algorithm: keep returning None rather than raising, to stay
    # backward-compatible.  NOTE(review): consider ValueError here once the
    # call sites are settled -- confirm with the app wiring.
    return None
"""Bokeh callback for regenerating the synthetic dataset when a widget changes.

NOTE(review): this module reads the widget objects (``dataset_select``,
``samples_slider``, ``classes_slider``, ``features_slider``, ``inf_slider``)
and the ColumnDataSource ``source`` as module globals that are never defined
or imported here -- the commented-out parameters in the original suggest the
author intended to inject them (e.g. via ``functools.partial``).  The
importing app must place them in this module's namespace before wiring the
callback, or the callback will raise NameError.  TODO: confirm intended wiring.
"""
import math

import numpy as np

# BUG FIX: the original did
#     from utils.data_processing.synthetic import synthetic_dataset
# but that name does not exist -- synthetic.py defines the SyntheticData
# class -- so merely importing this module raised ImportError.
from utils.data_processing.synthetic import SyntheticData

from bokeh.palettes import Spectral6

# Six base colours tiled 20x -> 120 entries, enough for up to 20 classes.
spectral = np.hstack([Spectral6] * 20)
# Mirrors the n_clusters_per_class=1 hard-coded in the generator.
n_clusters_p_class = 1


def update_samples_or_dataset(attrname, old, new):
    """Regenerate the dataset from the current widget values and push the
    new points/colours into ``source``.

    The signature follows Bokeh's ``on_change`` contract (attribute name,
    old value, new value); actual inputs are read from the widget globals.
    """
    global x, y

    dataset = dataset_select.value
    n_samples = int(samples_slider.value)
    n_classes = int(classes_slider.value)
    n_features = int(features_slider.value)
    n_inf = int(inf_slider.value)

    # make_classification requires n_informative <= n_features; clamp and
    # reflect the clamp back into the widget.
    if n_inf > n_features:
        n_features = n_inf
        features_slider.update(value=n_inf)

    # make_classification requires n_classes * n_clusters_per_class
    # <= 2**n_informative; raise n_inf to the smallest exponent that
    # satisfies it.
    if n_classes * n_clusters_p_class > 2 ** n_inf:
        n_inf = math.ceil(math.log2(n_classes))
        n_features = n_inf

        inf_slider.update(value=n_inf)
        features_slider.update(value=n_features)

    # BUG FIX: argument order now matches SyntheticData.__init__
    # (dataset, n_samples, n_features, n_classes, n_inf); the old call to
    # synthetic_dataset passed (dataset, n_samples, n_inf, n_features,
    # n_classes), swapping the informative/feature counts.
    data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf)
    x, y = data.generator()
    colors = [spectral[i] for i in y]

    source.data = dict(colors=colors, x=x[:, 0], y=x[:, 1])
# factor=0.5, + # noise=0.05) + + # elif dataset == 'Noisy Moons': + # return datasets.make_moons(n_samples=n_samples, + # noise=0.05) + + # elif dataset == 'Multilabel Classification': + # return datasets.make_multilabel_classification(n_samples=n_samples, + # n_features=n_features, + # n_classes=n_classes, + # random_state=8) + + elif self.dataset == "No Structure": + return np.random.rand(self.n_samples, 2), None + + \ No newline at end of file