# synthetic.py
"""
Synthetic data generation class, with callbacks and visualizations
"""
import numpy as np
from sklearn import datasets
from bokeh.models import Select, Slider, Row, Column
from bokeh.io import curdoc
from bokeh.layouts import column, row
import math
from bokeh.palettes import Spectral6
import config as config
from data_vis import vis_synthetic
class SyntheticData:
    """Parameter holder and factory for one synthetic dataset.

    Wraps the sample generators of ``sklearn.datasets`` (plus a uniform-noise
    option built with NumPy) behind a single ``generator()`` call that is
    selected by the ``dataset`` name.

    Args:
        dataset: Name of the data shape. One of 'Blobs',
            'Make Classification', 'Noisy Circles', 'Noisy Moons',
            'Multilabel Classification' or 'No Structure'.
        n_samples: Number of points to generate.
        n_features: Number of feature columns. Only used by 'Blobs',
            'Make Classification' and 'Multilabel Classification'.
        n_classes: Number of classes (blob centers for 'Blobs'). Only used
            by the same three datasets as ``n_features``.
        n_inf: Number of informative features. Only used by
            'Make Classification'.
    """

    def __init__(self,
                 dataset='Make Classification',
                 n_samples=1500,
                 n_features=4,
                 n_classes=3,
                 n_inf=2):
        self.dataset = dataset
        self.n_samples = n_samples
        self.n_features = n_features
        self.n_classes = n_classes
        self.n_inf = n_inf

    def generator(self):
        """Create the dataset described by this object's parameters.

        'Blobs', 'Make Classification' and 'Multilabel Classification' are
        seeded with ``random_state=8``. 'Noisy Circles', 'Noisy Moons' and
        'No Structure' are not seeded, so they differ from call to call.

        Returns:
            tuple: ``(X, y)``.
                X: ndarray of generated samples. It has ``n_features``
                    columns for 'Blobs', 'Make Classification' and
                    'Multilabel Classification', and 2 columns otherwise.
                y: ndarray of integer class labels, one per sample. For
                    'Multilabel Classification' it is the 2-D label
                    indicator matrix returned by sklearn, and for
                    'No Structure' it is ``None``.

        Raises:
            ValueError: If ``self.dataset`` is not one of the known names.
        """
        if self.dataset == 'Blobs':
            # sliders: samples, classes, features
            return datasets.make_blobs(
                n_samples=self.n_samples,
                centers=self.n_classes,
                n_features=self.n_features,
                random_state=8,
            )

        if self.dataset == 'Make Classification':
            # sliders: samples, features, informative features, classes
            return datasets.make_classification(
                n_samples=self.n_samples,
                n_features=self.n_features,
                n_informative=self.n_inf,
                n_redundant=0,
                n_clusters_per_class=1,
                n_classes=self.n_classes,
                random_state=8,
            )

        if self.dataset == 'Noisy Circles':
            # sliders: samples
            return datasets.make_circles(
                n_samples=self.n_samples,
                factor=0.5,
                noise=0.05,
            )

        if self.dataset == 'Noisy Moons':
            # sliders: samples
            return datasets.make_moons(
                n_samples=self.n_samples,
                noise=0.05,
            )

        if self.dataset == 'Multilabel Classification':
            # sliders: samples, features, classes
            return datasets.make_multilabel_classification(
                n_samples=self.n_samples,
                n_features=self.n_features,
                n_classes=self.n_classes,
                random_state=8,
            )

        if self.dataset == "No Structure":
            # Uniform noise in the unit square; there are no labels.
            return np.random.rand(self.n_samples, 2), None

        # Fail loudly instead of returning None, which callers would hit as
        # an unrelated-looking unpacking error.
        raise ValueError('Unknown dataset: {!r}'.format(self.dataset))
def _reconcile_classification_params(n_features, n_inf, n_classes):
    """Return ``(n_features, n_inf)`` adjusted to satisfy make_classification.

    Two constraints are enforced, only ever by raising a value:
    ``n_inf <= n_features`` and ``n_classes <= 2**n_inf`` (one cluster per
    class is used by the generator).
    """
    if n_classes > 2 ** n_inf:
        n_inf = math.ceil(math.log2(n_classes))
    # Grow the feature count to fit the informative features, but never
    # shrink a larger value the user picked.
    n_features = max(n_features, n_inf)
    return n_features, n_inf


def update_samples_or_dataset(attrname, old, new):
    """Regenerate the data and refresh the plot source from the widgets.

    Bokeh ``on_change`` callback. It reads the current dataset name and
    slider values from ``config``, builds a ``SyntheticData`` object,
    stores the result in ``config.x`` / ``config.y`` and replaces
    ``config.source.data`` with the first two feature columns and one color
    per sample. Dataset names other than the five handled here (including
    'No Structure') leave everything untouched.

    For 'Make Classification' the informative/features sliders are pushed
    up when the requested combination is not valid for the generator.

    Args:
        attrname: Name of the changed property (unused).
        old: Previous property value (unused).
        new: New property value (unused).
    """
    dataset = config.dataset_select.value

    if dataset in ('Noisy Circles', 'Noisy Moons'):
        n_samples = int(config.samples_slider.value)
        data = SyntheticData(dataset, n_samples)
    elif dataset in ('Blobs', 'Multilabel Classification'):
        n_samples = int(config.samples_slider.value)
        n_classes = int(config.classes_slider.value)
        n_features = int(config.features_slider.value)
        data = SyntheticData(dataset, n_samples, n_features, n_classes)
    elif dataset == 'Make Classification':
        n_samples = int(config.samples_slider.value)
        n_classes = int(config.classes_slider.value)
        requested_features = int(config.features_slider.value)
        requested_inf = int(config.inf_slider.value)

        n_features, n_inf = _reconcile_classification_params(
            requested_features, requested_inf, n_classes)

        # Reflect any correction back in the UI so the sliders always show
        # the values that were actually used.
        if n_inf != requested_inf:
            config.inf_slider.update(value=n_inf)
        if n_features != requested_features:
            config.features_slider.update(value=n_features)

        data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf)
    else:
        return

    config.x, config.y = data.generator()
    # NOTE(review): for 'Multilabel Classification' sklearn returns a 2-D
    # indicator matrix as y, so each `i` below is a row rather than a single
    # label - confirm config.spectral supports that kind of indexing.
    colors = [config.spectral[i] for i in config.y]
    config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
def update_layout(attrname, old, new):
    """Rebuild the page so only the sliders used by the dataset are shown.

    Bokeh ``on_change`` callback. Depending on ``config.dataset_select.value``
    it assembles the control column (dataset selector, samples slider and
    any dataset-specific sliders), asks ``vis_synthetic()`` for the plot,
    clears the current document and adds a single row holding both.
    Unrecognised dataset names leave the document as it is.

    Args:
        attrname: Name of the changed property (unused).
        old: Previous property value (unused).
        new: New property value (unused).
    """
    chosen = config.dataset_select.value

    # Sliders shown in addition to the selector and the samples slider.
    if chosen in ('Blobs', 'Multilabel Classification'):
        extra_sliders = (config.classes_slider, config.features_slider)
    elif chosen == 'Make Classification':
        extra_sliders = (config.classes_slider,
                         config.features_slider,
                         config.inf_slider)
    elif chosen in ('Noisy Circles', 'Noisy Moons'):
        extra_sliders = ()
    else:
        return

    controls = Column(config.selects, config.samples_slider, *extra_sliders)
    plot = vis_synthetic()
    curdoc().clear()
    curdoc().add_root(Row(controls, plot))
# Wire the widgets to their callbacks.
# Changing the dataset first regenerates the data, then rebuilds the layout;
# both callbacks are registered in that order.
config.dataset_select.on_change('value', update_samples_or_dataset, update_layout)

# These two sliders regenerate the data on every 'value' change.
config.features_slider.on_change('value', update_samples_or_dataset)
config.inf_slider.on_change('value', update_samples_or_dataset)

# These two sliders regenerate the data on 'value_throttled' events only.
config.samples_slider.on_change('value_throttled', update_samples_or_dataset)
config.classes_slider.on_change('value_throttled', update_samples_or_dataset)