Skip to content
Snippets Groups Projects
Select Git revision
  • ef21755e425adc1edcea2bd53b8eb7c0b9884991
  • main default
  • logRegressionVisuals
  • Dashboard
  • explainerAnnotation
  • DT-Class-Design
  • WIP-4-ModelCreation-class
  • Data_Ingestion_Class_Creation
  • 3-dummy-issue
  • 5-feature
10 results

synthetic.py

Blame
  • user avatar
    Philip Monaco authored
    ef21755e
    History
    synthetic.py 7.94 KiB
    """
    Synthetic data generation class, with callbacks and visualizations
    """
    
    import numpy as np
    from sklearn import datasets
    from bokeh.models import Select, Slider, Row, Column
    from bokeh.io import curdoc
    from bokeh.layouts import column, row
    import math
    from bokeh.palettes import Spectral6
    import config as config
    from data_vis import vis_synthetic
    
    class SyntheticData:
        """Parameter holder for one synthetic dataset.

        The instance only stores the settings; the arrays are produced on
        demand by :meth:`generator`.

        Args:
            dataset: Name of the dataset to build; one of ``DATASETS``.
            n_samples: Number of points to generate.
            n_features: Number of feature columns. Only used by 'Blobs',
                'Make Classification' and 'Multilabel Classification'.
            n_classes: Number of classes (blob centers for 'Blobs'). Only used
                by the same three datasets.
            n_inf: Number of informative features. Only used by
                'Make Classification'.
        """

        # Every dataset name that generator() knows how to build.
        DATASETS = (
            'Blobs',
            'Make Classification',
            'Noisy Circles',
            'Noisy Moons',
            'Multilabel Classification',
            'No Structure',
        )

        def __init__(self, 
                     dataset='Make Classification', 
                     n_samples=1500, 
                     n_features=4, 
                     n_classes=3, 
                     n_inf=2):
            self.dataset = dataset
            self.n_samples = n_samples
            self.n_features = n_features
            self.n_classes = n_classes
            self.n_inf = n_inf
        
        
        def generator(self):
            """Build the dataset described by this instance.

            Takes advantage of the synthetic data generators provided by
            sklearn for five of the six shapes; 'No Structure' is plain
            uniform noise from numpy.

            'Blobs', 'Make Classification' and 'Multilabel Classification' are
            called with ``random_state=8``. The remaining shapes are not given
            a seed here.

            Returns:
                A tuple ``(X, y)``.
                X: the generated samples. For 'No Structure' this is an
                    ndarray of shape (n_samples, 2); for the sklearn-backed
                    shapes it is whatever the corresponding sklearn generator
                    returns (n_features columns are requested where the
                    generator accepts that argument).
                y: the labels returned by the sklearn generator, or ``None``
                    for 'No Structure'.

            Raises:
                ValueError: if ``self.dataset`` is not one of ``DATASETS``.
            """
            name = self.dataset

            if name == 'Blobs':
                #sliders: samples, classes, features
                return datasets.make_blobs(n_samples=self.n_samples,
                                           centers=self.n_classes,
                                           n_features=self.n_features,
                                           random_state=8
                                           )

            if name == 'Make Classification':
                #sliders: samples, features, informative features, classes
                return datasets.make_classification(n_samples=self.n_samples,
                                                    n_features=self.n_features, 
                                                    n_informative=self.n_inf, 
                                                    n_redundant=0,
                                                    n_clusters_per_class=1, 
                                                    n_classes=self.n_classes,
                                                    random_state=8
                                                    )

            if name == 'Noisy Circles':
                #sliders: samples
                return datasets.make_circles(n_samples=self.n_samples,
                                            factor=0.5,
                                            noise=0.05
                                            )

            if name == 'Noisy Moons':
                #sliders: samples
                return datasets.make_moons(n_samples=self.n_samples,
                                           noise=0.05
                                           )

            if name == 'Multilabel Classification':
                #sliders: samples, features, classes
                return datasets.make_multilabel_classification(n_samples=self.n_samples, 
                                                               n_features=self.n_features,
                                                               n_classes=self.n_classes,
                                                               random_state=8
                                                               )

            if name == "No Structure":
                return np.random.rand(self.n_samples, 2), None

            # Previously an unknown name fell through and returned None, which
            # only surfaced later as an unpacking error in the caller.
            raise ValueError(
                'Unknown dataset {!r}; expected one of: {}'.format(
                    name, ', '.join(self.DATASETS)
                )
            )
            
    def update_samples_or_dataset(attrname, old, new):
        """Regenerate the synthetic data and push it to the plot's data source.

        Reads the selected dataset and the slider values that apply to it from
        ``config``, builds a ``SyntheticData`` from them, stores the generated
        arrays in ``config.x`` / ``config.y`` and replaces
        ``config.source.data`` with the first two feature columns plus one
        colour per sample. Any dataset name other than the five handled below
        leaves everything untouched.

        Args:
            attrname: Callback argument supplied by ``on_change``; not read here.
            old: Callback argument supplied by ``on_change``; not read here.
            new: Callback argument supplied by ``on_change``; not read here.
        """
        chosen = config.dataset_select.value

        if chosen in ('Noisy Circles', 'Noisy Moons'):
            # These two shapes are driven by the sample count alone.
            synth = SyntheticData(chosen, int(config.samples_slider.value))

        elif chosen in ('Blobs', 'Multilabel Classification'):
            sample_count = int(config.samples_slider.value)
            class_count = int(config.classes_slider.value)
            feature_count = int(config.features_slider.value)
            synth = SyntheticData(chosen, sample_count, feature_count, class_count)

        elif chosen == 'Make Classification':
            sample_count = int(config.samples_slider.value)
            class_count = int(config.classes_slider.value)
            feature_count = int(config.features_slider.value)
            inf_count = int(config.inf_slider.value)

            # NOTE(review): the slider updates below presumably re-trigger this
            # callback through the sliders' own 'value' listeners - confirm.

            # There cannot be more informative features than features.
            if inf_count > feature_count:
                feature_count = inf_count
                config.features_slider.update(value=inf_count)

            # Raise the informative-feature count until 2**n_inf covers the
            # requested number of classes; the feature count is set to match.
            if class_count > 2**inf_count:
                inf_count = math.ceil(math.log2(class_count))
                feature_count = inf_count
                config.inf_slider.update(value=inf_count)
                config.features_slider.update(value=feature_count)

            synth = SyntheticData(chosen, sample_count, feature_count, class_count, inf_count)

        else:
            # Not one of the datasets this callback knows how to draw.
            return

        config.x, config.y = synth.generator()

        # NOTE(review): for 'Multilabel Classification' scikit-learn documents
        # y as a 2-D indicator matrix, so each `label` would be a whole row
        # rather than a single int - confirm config.spectral copes with that.
        palette = [config.spectral[label] for label in config.y]

        # Only the first two feature columns are plotted.
        config.source.data = dict(colors=palette, x=config.x[:, 0], y=config.x[:, 1])
        
    
    def update_layout(attrname, old, new):
        """Rebuild the document so only the relevant sliders are shown.

        Picks the widgets that apply to the currently selected dataset, asks
        ``vis_synthetic()`` for the plot, clears the current document and adds
        a single root laid out as ``Row(Column(widgets...), plot)``. Dataset
        names not listed below leave the document untouched.

        Args:
            attrname: Callback argument supplied by ``on_change``; not read here.
            old: Callback argument supplied by ``on_change``; not read here.
            new: Callback argument supplied by ``on_change``; not read here.
        """
        chosen = config.dataset_select.value

        if chosen in ('Blobs', 'Multilabel Classification'):
            widgets = [config.selects,
                       config.samples_slider,
                       config.classes_slider,
                       config.features_slider]
        elif chosen == 'Make Classification':
            widgets = [config.selects,
                       config.samples_slider,
                       config.classes_slider,
                       config.features_slider,
                       config.inf_slider]
        elif chosen in ('Noisy Circles', 'Noisy Moons'):
            widgets = [config.selects, config.samples_slider]
        else:
            # No layout is defined for this dataset.
            return

        controls = Column(*widgets)
        plot = vis_synthetic()

        # Replace whatever the document currently shows with the new layout.
        curdoc().clear()
        curdoc().add_root(Row(controls, plot))
    
    # Sliders: regenerate the data when a slider changes. The samples and
    # classes sliders listen on 'value_throttled' (per the Bokeh docs, updated
    # when the drag ends rather than on every step); the other two on 'value'.
    config.samples_slider.on_change('value_throttled', update_samples_or_dataset)
    config.classes_slider.on_change('value_throttled', update_samples_or_dataset)
    config.features_slider.on_change('value', update_samples_or_dataset)
    config.inf_slider.on_change('value', update_samples_or_dataset)

    # Dataset selector: register the data callback first, then the layout one.
    config.dataset_select.on_change('value', update_samples_or_dataset, update_layout)