Skip to content
Snippets Groups Projects
Commit 546a0cfb authored by Philip Monaco's avatar Philip Monaco
Browse files

update modelling.py

parent ed34e7f7
Branches
No related tags found
No related merge requests found
Pipeline #1764 passed
import numpy as np
import pandas as pd
from sklearn.tree import export_graphviz
from subprocess import call
# import pandas as pd
from synthetic import *
from data_vis import *
#from data_vis import decisionTreemodel
import config as config
from decisionTreeVisuals import *
# from modelling import *
from bokeh.io import curdoc
from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter, Row, Column
from bokeh.models import ColumnDataSource, Row
from bokeh.palettes import Spectral6
import pandas_bokeh
pandas_bokeh.output_notebook()
pd.set_option('plotting.backend', 'pandas_bokeh')
# Create Bokeh-Table with DataFrame:
from bokeh.models.widgets import DataTable, TableColumn
from bokeh.models import ColumnDataSource
np.random.seed(0)
data = SyntheticData()
config.x, config.y = data.generator()
config.spectral = np.hstack([Spectral6] * 20)
colors = [config.spectral[i] for i in config.y]
config.source = ColumnDataSource(dict(x=config.x[:,0], y=config.x[:,1], colors=colors))
b = vis_synthetic()
#decisionTreemodel(config.x, config.y)
'''
df = pd.read_csv("/Users/abdullahshah/documents/Spring Term 2021-2022/ci493/NewProject/why-senior-project/examples/decision_tree/Iris.csv")
# data split to features matrix and target vector
# df stands for the dataframe
# feature matrix
X = df.iloc[:,0:-1]
# target vector
Y = df.iloc[:,-1:]
'''
#s = int(config.index_slider.value)
test = decisionTreemodel(config.x, config.y, 9)
#text_output2 = Paragraph(text=test, width=200, height=100)
data = SyntheticData(shuffle=False)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
#instantiate model here.
#show(p_bar)
#plots = layout([p_bar])
from bokeh.io import show
from bokeh.plotting import figure
#fit model using fit method on the model class
config.spectral = np.hstack([Spectral6] * 20)
#fig = figure(x_range=students, plot_height=250, title='Feature Importance Scores',
# toolbar_location=None, tools="")
# plot a line graph
#fig.vbar( x = students, top=mark, width=0.9)
from bokeh.models import ColumnDataSource, Slider, CustomJS
# Adding callback code
#callback = CustomJS(args=dict(source=data.generator(), val=config.index_slider))
#p = figure(x_range = test[0], plot_height = 400, title = "Feature Importance Scores")
#p.vbar(x = test[0], top = test[1], width = 0.5, color = "#fc8d59")
callback = CustomJS(args=dict(source=data.generator()), code="""
s = int(config.index_slider.value)
test = decisionTreemodel(config.x, config.y, s)
source.trigger('change');
""")
#slider = Slider(start=0.1, end=4, value=1, step=.1, title="power", callback=callback)
#show(p)
#tupleTest = decisionTreemodel()
# add to document
#from bokeh.models.widgets import TextInput
#Create the text input widget
#text_input_widget = TextInput(title="Type your text here", value = "")
#Output the text input widget
#output_file("text_input_widget.html")
#show(widgetbox(text_input_widget))
#tupleTest = decisionTreemodel()
colors = [config.spectral[i] for i in config.y_train]
curdoc().add_root(Row(config.inputs, b, test)) #, text_input_widget, stext_output2))
#curdoc().title = "Decision Tree"
config.source = ColumnDataSource(dict(x=config.X_train[:,0], y=config.X_train[:,1], colors=colors))
#print(config.x)
#print(config.y)
#print(test[0])
print(test[1])
print(test)
'''
b = vis_synthetic()
p_bar = test.plot_bokeh.bar(
y = 'FeatureNumbers',
x = 'Features',
ylabel="Feature Importance Score",
title="Feature Importance Chart",
alpha=0.6)
p_bar.xaxis.major_label_orientation = np.pi / 4
'''
\ No newline at end of file
curdoc().add_root(Row(config.inputs, b)) #, text_input_widget, stext_output2))
curdoc().title = "Decision Tree"
\ No newline at end of file
File deleted
File deleted
File deleted
......@@ -5,8 +5,12 @@ from bokeh.models import Select, Slider, Row, Column, Dropdown, Paragraph
from bokeh.layouts import column, row
#from src.data_vis import *t
x = 0
y = 0
# x = 0
# y = 0
X_train = 0
X_test = 0
y_train = 0
y_test = 0
spectral = 0
source = 0
......@@ -19,6 +23,27 @@ datasets_names = [
"Noisy Circles"
]
# Display names offered by the `models_select` dropdown. Models.generate_algorithm
# compares against these exact strings, so keep the two in sync.
model_names = [
"Logistic Regression",
"Decision Tree"
]
# Penalty options offered by the `normalization_select` dropdown.
regularization_names = [
"none",
"l2",
"l1",
"elasticnet" # l1 and l2 penalties combined
]
# Solver options offered by the `solver_select` dropdown. The strings are passed
# on as the `solver` argument of LogisticRegression, so they must match the
# scikit-learn spelling exactly. Each comment lists the penalties that solver
# supports.
solver_names = [
    "newton-cg",  # l2 or none
    "lbfgs",      # l2 or none
    "liblinear",  # l1 or l2
    "sag",        # l2 or none
    "saga",       # elasticnet, l1, l2, or none
]
#synthetic data sliders and dropdowns
dataset_select = Select(value='Make Classification',
title='Select Dataset',
width=200,
......@@ -52,34 +77,44 @@ inf_slider = Slider(title='Informative Classes',
step=1,
width=400)
data_split_slider = Slider(title="Validation Size",
value=.2,
start=0.1,
end=.8,
step=.1,
width=400)
# modelling sliders and dropdowns
models_select = Select(value='Logistic Regression',
title='Select Model',
width=200,
options=model_names)
normalization_select = Select(value='none',
title='Select Regularization',
width=200,
options=regularization_names)
solver_select = Select(value = "newton-cg",
title='Select Solver',
width=200,
options=solver_names)
# new slider
index_slider = Slider(title="Data Record",
value=9.0,
start=0,
end=3000.0,
step=1,
width=400)
#tupleTest = decisionTreemodel()
width=200)
myMessage = 'You have entered nothing yet: (none)'
text_output = Paragraph(text=myMessage, width=200, height=100)
selects = Row(dataset_select, width=420)
inputs = Column(selects, samples_slider, classes_slider, inf_slider, features_slider, index_slider)
#menu = [("Item 1", "item_1"), ("Item 2", "item_2"), None, ("Item 3", "item_3")]
#clf_algorithms = [
# 'Decision Tree'
#]
#algorithm_select = Dropdown(label="Dropdown button", button_type="warning", menu=menu)
inputs = Column(selects,
samples_slider,
classes_slider,
inf_slider,
features_slider,
data_split_slider)
\ No newline at end of file
"""
This is a docstring for data_vis.
"""
from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter, Row, Column
from bokeh.models import Scatter
from bokeh.plotting import figure
import config as config
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
# import pandas as pd
# import numpy as np
# import math
# import matplotlib.pyplot as plt
# import seaborn as sns
# sklearn ML libraries/modules
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# bokeh libraries/modules
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
#from bokeh.models.Column import widgetbox
from bokeh.models.widgets import Div
from bokeh.models.widgets import Paragraph
from bokeh.models.widgets import PreText
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# lime modules
from lime import submodular_pick
import lime
from lime.lime_tabular import LimeTabularExplainer
from lime import submodular_pick
# from lime import submodular_pick
# import lime
# from lime.lime_tabular import LimeTabularExplainer
# from lime import submodular_pick
def vis_synthetic():
"""This is a docstring for datavis
......
......@@ -3,24 +3,26 @@ This is a docstrings for decisionTreeVisuals
"""
import config as config
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
# import numpy as np
# import math
# import matplotlib.pyplot as plt
# import seaborn as sns
# sklearn ML libraries/modules
from sklearn import preprocessing
# from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from synthetic import update_samples_or_dataset
# bokeh libraries/modules
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
# from bokeh.io import output_file, show
# from bokeh.layouts import widgetbox
#from bokeh.models.Column import widgetbox
from bokeh.models.widgets import Div
from bokeh.models.widgets import Paragraph
from bokeh.models.widgets import PreText
# from bokeh.models.widgets import Div
# from bokeh.models.widgets import Paragraph
# from bokeh.models.widgets import PreText
# lime modules
from lime import submodular_pick
......@@ -31,6 +33,49 @@ from lime import submodular_pick
import config as config
from bokeh.plotting import figure
class Models:
    """Build the estimator selected in the modelling widgets.

    Args:
        model: Display name of the algorithm, either "Logistic Regression"
            or "Decision Tree".
        penalty: Regularization passed to LogisticRegression as ``penalty``.
        solver: Optimizer passed to LogisticRegression as ``solver``.
    """

    def __init__(self,
                 model='Logistic Regression',
                 penalty='none',
                 solver='newton-cg'):
        # Plain assignments: a trailing comma here would wrap the value in a
        # 1-tuple and make the string comparisons in generate_algorithm fail.
        self.model = model
        self.penalty = penalty
        self.solver = solver

    def generate_algorithm(self):
        """Return a new, unfitted estimator for ``self.model``.

        Returns:
            A LogisticRegression configured with ``self.penalty`` and
            ``self.solver``, a default DecisionTreeClassifier, or None when
            ``self.model`` is not one of the recognised names.
        """
        if self.model == "Logistic Regression":
            return LogisticRegression(penalty=self.penalty,
                                      solver=self.solver)
        if self.model == "Decision Tree":
            return DecisionTreeClassifier()
        # Unknown model name: keep the implicit "no estimator" result explicit.
        return None
# def fit_to_model(self):
#here in the fit method you'll need to use the config.X_train, y_test
def model_callback(attrname, old, new):
    """Bokeh ``on_change`` callback for the modelling selectors.

    Reads the current solver from ``config.solver_select`` and works out the
    penalty that solver requires. The three arguments follow the Bokeh
    ``on_change`` callback signature and are not used by the body.

    Args:
        attrname: Name of the property that changed.
        old: Previous value of the property.
        new: New value of the property.
    """
    # Controls parameters when the solver updates; add more branches as needed.
    # These three solvers are grouped because they share the same penalty.
    if config.solver_select.value in ("newton-cg", "sag", "lbfgs"):
        # NOTE(review): this only binds a local name and has no effect yet.
        # It presumably should set the penalty ("l2") on the model or the
        # regularization selector - TODO confirm and wire it up.
        solver = "l2"
#control statements for selecting the appropriate dataset needs to:
# 1. update the dataset ( this will automatically update the layout)
#have to figure out how to update the dataset from here.
#probably need to update the selector to only show binary datasets. So make 2 separate dataset selectors: binary and multiclass
def decisionTreemodel(X, Y, indexValue):
"""This is a docstring for decisionTreeVisuals
......@@ -48,6 +93,7 @@ def decisionTreemodel(X, Y, indexValue):
#Y = df.iloc[:,-1:]
# splitting data into training and test sets
#not needed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=7)
# instantiating model
......@@ -118,21 +164,5 @@ def decisionTreemodel(X, Y, indexValue):
#return dfBokehChart
return p
#df = pd.read_csv("src/Iris.csv")
#decisionTreemodel()
# put the below in config
'''
clf_algorithms = [
'Decision Tree'
]
algorithm_select = Select(value = 'Decision Tree',
title='Select Algorithm:',
width=200,
options=clf_algorithms
)
'''
\ No newline at end of file
# add more call backs for each selector
config.models_select.on_change('value', model_callback)
......@@ -4,6 +4,7 @@ Synthetic data generation class, with callbacks and visualizations
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from bokeh.models import Select, Slider, Row, Column
from bokeh.io import curdoc
from bokeh.layouts import column, row
......@@ -11,7 +12,7 @@ import math
from bokeh.palettes import Spectral6
import config as config
from data_vis import vis_synthetic
from decisionTreeVisuals import *
# from modelling import *
class SyntheticData:
"""Class for creating a synthetic data object with default parameters.
......@@ -21,12 +22,16 @@ class SyntheticData:
n_samples=1500,
n_features=4,
n_classes=3,
n_inf=2):
n_inf=2,
test_size=.2,
shuffle=True):
self.dataset = dataset
self.n_samples = n_samples
self.n_features = n_features
self.n_classes = n_classes
self.n_inf = n_inf
self.test_size = test_size
self.shuffle=shuffle
def generator(self):
......@@ -41,15 +46,23 @@ class SyntheticData:
"""
if self.dataset == 'Blobs':
#sliders: samples, classes, features
return datasets.make_blobs(n_samples=self.n_samples,
x, y = datasets.make_blobs(n_samples=self.n_samples,
centers=self.n_classes,
n_features=self.n_features,
random_state=8
)
X_train, X_test, y_train, y_test = train_test_split(x,
y,
test_size=self.test_size,
shuffle=self.shuffle
)
return X_train, X_test, y_train, y_test
elif self.dataset == 'Make Classification':
#sliders: samples, features, informative features, classes
return datasets.make_classification(n_samples=self.n_samples,
x, y = datasets.make_classification(n_samples=self.n_samples,
n_features=self.n_features,
n_informative=self.n_inf,
n_redundant=0,
......@@ -57,28 +70,56 @@ class SyntheticData:
n_classes=self.n_classes,
random_state=8
)
print("Test size in generator",self.test_size)
X_train, X_test, y_train, y_test = train_test_split(x,
y,
test_size=self.test_size,
shuffle=self.shuffle
)
return X_train, X_test, y_train, y_test
elif self.dataset == 'Noisy Circles':
#sliders: samples
return datasets.make_circles(n_samples=self.n_samples,
x, y = datasets.make_circles(n_samples=self.n_samples,
factor=0.5,
noise=0.05
)
X_train, X_test, y_train, y_test = train_test_split(x,
y,
test_size=self.test_size,
shuffle=self.shuffle
)
return X_train, X_test, y_train, y_test
elif self.dataset == 'Noisy Moons':
#sliders: samples
return datasets.make_moons(n_samples=self.n_samples,
x, y = datasets.make_moons(n_samples=self.n_samples,
noise=0.05
)
X_train, X_test, y_train, y_test = train_test_split(x,
y,
test_size=self.test_size,
shuffle=self.shuffle
)
return X_train, X_test, y_train, y_test
elif self.dataset == 'Multilabel Classification':
#sliders: samples, features, classes
return datasets.make_multilabel_classification(n_samples=self.n_samples,
x, y = datasets.make_multilabel_classification(n_samples=self.n_samples,
n_features=self.n_features,
n_classes=self.n_classes,
random_state=8
)
X_train, X_test, y_train, y_test = train_test_split(x,
y,
test_size=self.test_size,
shuffle=self.shuffle
)
return X_train, X_test, y_train, y_test
elif self.dataset == "No Structure":
return np.random.rand(self.n_samples, 2), None
......@@ -96,12 +137,13 @@ def update_samples_or_dataset(attrname, old, new):
n_samples = int(config.samples_slider.value)
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
test_size = config.data_split_slider.value
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
colors = [config.spectral[i] for i in config.y]
data = SyntheticData(dataset, n_samples, n_features, n_classes, test_size=test_size)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
colors = [config.spectral[i] for i in config.y_train]
config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
config.source.data = dict(colors=colors, x=config.X_train[:, 0], y=config.X_train[:, 1])
elif config.dataset_select.value == 'Make Classification':
dataset = config.dataset_select.value
......@@ -109,7 +151,8 @@ def update_samples_or_dataset(attrname, old, new):
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
n_inf = int(config.inf_slider.value)
test_size = config.data_split_slider.value
print("test size in callback", test_size)
if n_inf > n_features:
n_features = n_inf
config.features_slider.update(value=n_inf)
......@@ -120,44 +163,46 @@ def update_samples_or_dataset(attrname, old, new):
config.inf_slider.update(value=n_inf)
config.features_slider.update(value=n_features)
data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf)
config.x, config.y = data.generator()
colors = [config.spectral[i] for i in config.y]
data = SyntheticData(dataset, n_samples, n_features, n_classes, n_inf, test_size=test_size)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
colors = [config.spectral[i] for i in config.y_train]
config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
config.source.data = dict(colors=colors, x=config.X_train[:, 0], y=config.X_train[:, 1])
elif config.dataset_select.value == 'Noisy Circles':
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
test_size = config.data_split_slider.value
data = SyntheticData(dataset, n_samples)
config.x, config.y = data.generator()
colors = [config.spectral[i] for i in config.y]
data = SyntheticData(dataset, n_samples, test_size=test_size)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
colors = [config.spectral[i] for i in config.y_train]
config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
config.source.data = dict(colors=colors, x=config.X_train[:, 0], y=config.X_train[:, 1])
elif config.dataset_select.value == 'Noisy Moons':
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
test_size = config.data_split_slider.value
data = SyntheticData(dataset, n_samples)
config.x, config.y = data.generator()
colors = [config.spectral[i] for i in config.y]
data = SyntheticData(dataset, n_samples, test_size=test_size)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
colors = [config.spectral[i] for i in config.y_train]
config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
config.source.data = dict(colors=colors, x=config.X_train[:, 0], y=config.X_train[:, 1])
elif config.dataset_select.value == 'Multilabel Classification':
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
n_features = int(config.features_slider.value)
n_classes = int(config.classes_slider.value)
test_size = config.data_split_slider.value
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
colors = [config.spectral[i] for i in config.y]
config.source.data = dict(colors=colors, x=config.x[:, 0], y=config.x[:, 1])
data = SyntheticData(dataset, n_samples, n_features, n_classes, test_size=test_size)
config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
colors = [config.spectral[i] for i in config.y_train]
config.source.data = dict(colors=colors, x=config.X_train[:, 0], y=config.X_train[:, 1])
def update_layout(attrname, old, new):
"""Callback function that updates the sliders layout as datasets change.
......@@ -168,73 +213,49 @@ def update_layout(attrname, old, new):
new (_type_): _description_
"""
if config.dataset_select.value == 'Blobs' or config.dataset_select.value == 'Multilabel Classification':
inputs = Column(config.selects, config.samples_slider, config.classes_slider, config.features_slider, config.index_slider)
s = int(config.index_slider.value)
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
inputs = Column(config.selects, config.samples_slider, config.classes_slider, config.features_slider, config.data_split_slider)
b = vis_synthetic()
test = decisionTreemodel(config.x, config.y, s)
curdoc().clear()
curdoc().add_root(Row(inputs, b, test))
curdoc().add_root(Row(inputs, b))
elif config.dataset_select.value == 'Make Classification':
inputs = Column(config.selects, config.samples_slider, config.classes_slider, config.features_slider, config.inf_slider, config.index_slider)
s = int(config.index_slider.value)
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
inputs = Column(config.selects, config.samples_slider, config.classes_slider, config.features_slider, config.inf_slider, config.data_split_slider)
b = vis_synthetic()
test = decisionTreemodel(config.x, config.y, s)
curdoc().clear()
curdoc().add_root(Row(inputs, b, test))
curdoc().add_root(Row(inputs, b))
elif config.dataset_select.value == 'Noisy Circles' or config.dataset_select.value == 'Noisy Moons':
s = int(config.index_slider.value)
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
inputs = Column(config.selects, config.samples_slider, config.index_slider)
inputs = Column(config.selects, config.samples_slider, config.data_split_slider)
b = vis_synthetic()
test = decisionTreemodel(config.x, config.y, s)
curdoc().clear()
curdoc().add_root(Row(inputs,b, test))
def visualUpdate(attrname, old, new):
s = int(config.index_slider.value)
dataset = config.dataset_select.value
n_samples = int(config.samples_slider.value)
n_classes = int(config.classes_slider.value)
n_features = int(config.features_slider.value)
data = SyntheticData(dataset, n_samples, n_features, n_classes)
config.x, config.y = data.generator()
inputs = Column(config.index_slider)
test = decisionTreemodel(config.x, config.y, s)
#p = figure(x_range = test[0], plot_height = 400, title = "Feature Importance Scores")
#p.vbar(x = test[0], top = test[1], width = 0.5, color = "#fc8d59")
curdoc().clear()
curdoc().add_root(Row(inputs,test))
curdoc().add_root(Row(inputs,b))
# def visualUpdate(attrname, old, new):
# s = int(config.index_slider.value)
# dataset = config.dataset_select.value
# n_samples = int(config.samples_slider.value)
# n_classes = int(config.classes_slider.value)
# n_features = int(config.features_slider.value)
# data = SyntheticData(dataset, n_samples, n_features, n_classes)
# config.X_train, config.X_test, config.y_train, config.y_test = data.generator()
# inputs = Column(config.index_slider)
# # test = decisionTreemodel(config.x, config.y, s)
# #p = figure(x_range = test[0], plot_height = 400, title = "Feature Importance Scores")
# #p.vbar(x = test[0], top = test[1], width = 0.5, color = "#fc8d59")
# curdoc().clear()
# curdoc().add_root(Row(inputs))
#data generator callbacks
config.dataset_select.on_change('value', update_samples_or_dataset)
config.samples_slider.on_change('value_throttled', update_samples_or_dataset)
config.classes_slider.on_change('value_throttled', update_samples_or_dataset)
config.features_slider.on_change('value', update_samples_or_dataset)
config.inf_slider.on_change('value', update_samples_or_dataset)
# new
config.index_slider.on_change('value', update_layout)
#config.dataset_select.on_change('value', visualUpdate)
#validation splits callbacks
config.data_split_slider.on_change('value_throttled', update_samples_or_dataset)
config.dataset_select.on_change('value', update_layout)
\ No newline at end of file
#print(len(datasets))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment