Skip to content
Snippets Groups Projects
Commit 9752b034 authored by Songheng Li's avatar Songheng Li
Browse files

Merge branch 'logRegressionVisuals' of https://gitlab.cci.drexel.edu/pjm363/why-senior-project

parents 4031465a 7c636828
Branches
No related tags found
No related merge requests found
# import numpy as np
# import config as config
# from sklearn.tree import export_graphviz
# from subprocess import call
# from synthetic import *
# from data_vis import *
# from bokeh.io import curdoc
# from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter, Row, Column
# from bokeh.palettes import Spectral6
# #seeding random to 0
# np.random.seed(0)
# #getting data from synthetic generator
# data = SyntheticData()
# config.x, config.y = data.generator()
# config.spectral = np.hstack([Spectral6] * 20)
# colors = [config.spectral[i] for i in config.y]
# config.source = ColumnDataSource(dict(x=config.x[:,0], y=config.x[:,1], colors=colors))
# b = vis_synthetic()
# clf_algorithms = [
# 'Log Regression'
# ]
# algorithm_select = Select(value = 'Log Regression',
# title='Select Algorithm:',
# width=200,
# options=clf_algorithms
# )
# # add to document
# curdoc().add_root(Row(config.inputs, b))
# curdoc().title = "Log Regression"
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
from lime import submodular_pick
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from bokeh.io import output_file, show
from bokeh.layouts import widgetbox
#from bokeh.models.Column import widgetbox
from bokeh.models.widgets import Div
from bokeh.models.widgets import Paragraph
from bokeh.models.widgets import PreText
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.models import Label
import lime
from lime.lime_tabular import LimeTabularExplainer
from lime import submodular_pick
import holoviews as hv
# Load the data set and drop the first CSV column (every remaining column
# is either a feature or the target).
df = pd.read_csv("Iris.csv")
df = df.iloc[:, 1:]
# feature matrix: every column except the last one
# (explicit form: df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
X = df.iloc[:, 0:-1]
# target vector: the last column, selected as a 1-D Series (explicit form:
# df['Species']).  Slicing with `-1:` would give a one-column DataFrame,
# which makes scikit-learn estimators emit a DataConversionWarning in fit().
Y = df.iloc[:, -1]
# 70 % / 30 % train/test split; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=7)
# Upper bound on solver iterations for the logistic-regression fit.
LOGREG_MAXITER = 8192
model_logreg = LogisticRegression(verbose=1, max_iter=LOGREG_MAXITER)
# fit model on training set
model_logreg.fit(X_train, Y_train)
# class labels learnt by the model; reused as the class names given to LIME
class_names = model_logreg.classes_
# accuracy computed two equivalent ways (the values are not stored or printed)
accuracy_score(Y_test, model_logreg.predict(X_test))
model_logreg.score(X_test, Y_test)
# take the test row at position 3 (the fourth row) as a 1 x n_features array
# and let the model predict it
ex_specie = X_test.iloc[3].to_numpy().reshape(1, -1)
model_logreg.predict(ex_specie)
# lime is a library
# lime_tabular is a module that contains functions that explain classifiers which use tabular data (matrices).
# LimeTabularExplainer is a class that explains predictions of tabular (matrix) data.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=list(X_train.columns),
    class_names=class_names,
    discretize_continuous=True,
)
print(explainer)
# explain_instance generates the explanation for a single prediction.
# The row is handed over as a plain 1-D numpy array (the input type LIME
# documents) rather than a pandas Series, so LIME's internal positional
# indexing never goes through label-based Series lookups.
# The explanation is computed once only: every call draws a fresh random
# neighbourhood, so a second, discarded call would just waste time.
instance_row = X_test.iloc[3].to_numpy()
exp = explainer.explain_instance(
    instance_row,
    model_logreg.predict_proba,
    num_features=4,
    top_labels=1,
)
# The explanation was requested with top_labels=1, so it only holds weights
# for the predicted class.  as_list() defaults to label=1 and raises KeyError
# whenever the predicted class index is not 1, so ask for the label that is
# actually available.
top_label = list(exp.available_labels())[0]
tupleTest = exp.as_list(label=top_label)
# converting (rule, weight) tuples to lists
NewList = [list(x) for x in tupleTest]
# separating the rule text ("comparisons") from the feature scores
Labels1 = [item[0] for item in NewList]
# separating the feature scores from the rule text
featureNums = [item[1] for item in NewList]
# 1-based bar positions, one per explained feature, used as x-axis ticks in the chart
count = len(featureNums)
newList = list(range(1, count + 1))
# list to determine feature count
#Labels1
# Bar chart of the LIME feature-importance scores for the explained record:
# one bar per rule, labelled with the rule text and tilted for readability.
plt.bar(newList, featureNums, align='center')
plt.xticks(ticks=newList, labels=Labels1, rotation=60, ha='right')
plt.title("Feature Importance graph")
plt.show()
#y_positiond
#NewList
# Stringify every [rule, score] pair so both parts can be tokenised.
doubleStrList = [list(map(str, pair)) for pair in NewList]
# Break each rule text into whitespace-separated tokens (numbers, operators, names).
doubleStrListFinal = []
for text_pair in doubleStrList:
    doubleStrListFinal.append(text_pair[0].split())
# output of doubleStrList
print(doubleStrList)
# output of doubleStrListFinal when split
print(doubleStrListFinal)
# Tokenise the score part of each pair the same way (yields one token per score).
doubleStriListFeature = [text_pair[1].split() for text_pair in doubleStrList]
# Re-join rule tokens and score token into one flat token list per feature.
finalList = []
for rule_tokens, score_tokens in zip(doubleStrListFinal, doubleStriListFeature):
    finalList.append(rule_tokens + score_tokens)
# number of explained features
lengthofList = len(finalList)
# Training features and training target side by side in one frame.
result = pd.concat([X_train, Y_train], axis=1)
resultColumns = result.columns.tolist()
# One histogram figure per feature, coloured by the target (last) column.
# X-axis is the feature value, Y-axis is the number of training rows.
for feature_name in X_train.columns:
    grid = sns.FacetGrid(result, hue=resultColumns[-1], height=4)
    grid.map(sns.histplot, feature_name)
    grid.add_legend()
# Wording for an operator written as "<feature> OP <bound>".
_FEATURE_FIRST_WORDING = {
    "<": "less than",
    "<=": "less than and equal to",
    ">": "greater than",
    ">=": "greater than and equal to",
}
# Wording for an operator written as "<bound> OP <feature>".  The sentence is
# phrased from the feature's point of view, so the comparison is flipped.
_BOUND_FIRST_WORDING = {
    "<": "greater than",
    "<=": "greater than and equal to",
    ">": "less than",
    ">=": "less than and equal to",
}


def _describe_condition(tokens):
    """Turn one tokenised LIME rule into an English clause.

    ``tokens`` is the whitespace-split rule followed by the feature score as
    its last element, e.g. ``['4.10', '<', 'PetalLengthCm', '<=', '5.10',
    '0.23']`` or ``['PetalLengthCm', '<=', '1.60', '0.31']``.  Both the
    two-sided and the one-sided rule shapes are handled; the score token is
    ignored.  A rule with no recognised comparison operator is returned
    unchanged (tokens re-joined with spaces).
    """
    condition = tokens[:-1]
    operator_slots = [
        pos for pos, tok in enumerate(condition) if tok in _FEATURE_FIRST_WORDING
    ]
    if len(operator_slots) == 2:
        # "<lower> OP <feature> OP <upper>"
        low_slot, high_slot = operator_slots
        lower_bound = " ".join(condition[:low_slot])
        feature = " ".join(condition[low_slot + 1:high_slot])
        upper_bound = " ".join(condition[high_slot + 1:])
        return (feature + " is " + _BOUND_FIRST_WORDING[condition[low_slot]] + " " + lower_bound
                + " and "
                + feature + " is " + _FEATURE_FIRST_WORDING[condition[high_slot]] + " " + upper_bound)
    if len(operator_slots) == 1:
        # "<feature> OP <bound>"
        slot = operator_slots[0]
        feature = " ".join(condition[:slot])
        bound = " ".join(condition[slot + 1:])
        return feature + " is " + _FEATURE_FIRST_WORDING[condition[slot]] + " " + bound
    return " ".join(condition)


# One explanatory sentence per explained feature.  The predicted species is
# the same for every sentence, so it is looked up once before the loop.
# Example: "... if PetalLengthCm is greater than 4.10 and PetalLengthCm is
# less than and equal to 5.10, then species is <predicted class>"
predicted_species = str(model_logreg.predict(ex_specie)[0])
for j in finalList:
    print("After viewing the distributions in the histograms, if "
          + _describe_condition(j)
          + ", then species is " + predicted_species)
    print()
#print("Due to the value of "+ second, "being" , first, zero, "and\n", second, "is",
# third, fourth + ", the feature importance score\n is: " + fifth)
#ptest = Paragraph(text="""Due to the value of """"+ second, """"being""" + first+ zero+ """and\n"""+ second+ """is"""+
# third+ fourth + """the feature importance score\n is: """ + fifth, width=200, height=100)
#ExpList.append(ptest)
# check if feature importance is positive, then talk about that feature
# example output: the 'feature name' is less than 5.1 and greater than 2.3 with feature importance score of 0.3454
# why does range of values of length determine it to be a specific class?
# ex: '4.10', '<', 'PetalLengthCm', '<=', '5.10
# analyze petal length across all classes
# histograms that describe the distribution across classes
# get distribution of training set
# 12 histograms
# example output: the 'feature name' is less than 5.1 and greater than 2.3 with feature importance score of 0.3454
# Class probabilities that the explanation recorded for the explained row.
predictProList = exp.predict_proba
# ... as a plain Python list of floats
semiProbList = predictProList.tolist()
# ... and as strings, for building the printed sentences
finalProbList = list(map(str, semiProbList))
# class names in the same order as the probabilities
classList = exp.class_names
# [probability-as-text, class name] pairs
probaClassList = [[prob_text, class_label] for prob_text, class_label in zip(finalProbList, classList)]
# 1-based bar positions, one per class, used as x-axis ticks in the chart
newProList = list(range(1, len(classList) + 1))
counts = len(newProList)
# Print one sentence per class stating its predicted probability,
# each followed by a blank line.
for prob_text, class_label in probaClassList:
    print("The probability of the class being", class_label, "is:", prob_text)
    print()
# Bar chart of the prediction probability of each class for the explained record.
# Start a fresh figure first: pyplot draws on the *current* axes, and when
# this file runs as a plain script the histogram figures created earlier are
# still open, so the bars would otherwise land on top of the last histogram.
plt.figure()
plt.bar(newProList, semiProbList, align='center')
plt.xticks(newProList, classList)
plt.xticks(rotation=60, ha='right')
plt.title("Class Prediction Probability graph")
plt.show()
# show_in_notebook is a LIME method that shows the html explanation in an ipython notebook.
# Float point numbers on the horizontal bars represent the relative/feature importance of these features.
exp.show_in_notebook(show_table=True, show_all=False)
#SP = submodular_pick.SubmodularPick(explainer, X_test.values, model_logreg.predict_proba, sample_size=45, num_exps_desired=3, num_features=4)
#[exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in SP.sp_explanations];
import holoviews as hv
# Tabular form of the two result sets so they can be handed to plotting libraries:
# rule text with its importance score ...
dfBokehChart = pd.DataFrame({'Features': Labels1, 'FeatureNumbers': featureNums})
# ... and class name with its predicted probability.
dfBokehChartProb = pd.DataFrame({'Class Name': classList, 'Probability Prediction': semiProbList})
# HoloViews bar element built from the feature-importance frame
# (constructed only; nothing below displays it).
score_dimension = hv.Dimension('FeatureNumbers')
bars = hv.Bars(dfBokehChart, score_dimension, 'Features')
#pip install holoviews --upgrade
#pip install panel --upgrade
#pip install bokeh==2.4.2
#pip install holoviews==1.14.7
#pip install pandas_bokeh
import pandas_bokeh
pandas_bokeh.output_notebook()
pd.set_option('plotting.backend', 'pandas_bokeh')
# Create Bokeh-Table with DataFrame:
from bokeh.models.widgets import DataTable, TableColumn
from bokeh.models import ColumnDataSource

# Build both charts without rendering them.  pandas-bokeh shows a figure as
# soon as it is created unless show_figure=False is passed, which would
# display each chart twice and write the probability chart into "bars.html"
# before the explicit show() calls below run.
p_bar = dfBokehChart.plot_bokeh.bar(
    y='FeatureNumbers',
    x='Features',
    ylabel="Feature Importance Score",
    title="Feature Importance Chart",
    alpha=0.6,
    show_figure=False)
# tilt the category labels by 45 degrees so long rule texts stay readable
p_bar.xaxis.major_label_orientation = np.pi / 4

p_bar1 = dfBokehChartProb.plot_bokeh.bar(
    y='Probability Prediction',
    x='Class Name',
    ylabel="Predicted Probability By Percentage",
    title="Class Probability Chart",
    alpha=0.6,
    show_figure=False)
p_bar1.xaxis.major_label_orientation = np.pi / 4

# Render each chart exactly once, each into its own HTML file.
output_file("bars.html")
show(p_bar)
output_file("bars1.html")
show(p_bar1)
#output_file("div.html")
#pre = PreText(text="""Your text is initialized with the 'text' argument. The remaining Paragraph arguments are 'width' and 'height'.""",width=500, height=100)
#p = Paragraph(text="""Your text is initialized with the 'text' argument. The remaining Paragraph arguments are 'width' and 'height'""", width=200, height=100)
#div = Div(text="""Your <a href="https://en.wikipedia.org/wiki/HTML">HTML</a>-supported text is initialized with the <b>text</b> argument. The remaining div arguments are <b>width</b> and <b>height</b>. For this example, those values are <i>200</i> and <i>100</i> respectively.""", width=200, height=100)
#show(widgetbox(ExpList))
#%history -g
#X_train
# ignore below
import pandas as pd
import altair as alt
import numpy as np
np.random.seed(42)

# Layered histograms of every training feature plus a bar showing how much
# the histograms overlap.
# Fold the feature columns that X_train really has into long form
# (one row per feature/value pair).  The column names 'Trial A' / 'Trial B'
# from the template this was adapted from do not exist in X_train, so
# folding them would produce no usable data.
fold_columns = list(X_train.columns)
base = alt.Chart(X_train).transform_fold(
    fold_columns,
    as_=['Experiment', 'Measurement']
).transform_bin(
    # bin the folded values; the bin edges land in the two as_ fields
    field='Measurement',
    bin=alt.Bin(maxbins=50),
    as_=['Measurement_min', 'Measurement_max']
).transform_aggregate(
    # number of values per bin and per folded column
    count='count()',
    groupby=['Measurement_min', 'Measurement_max', 'Experiment']
)
# semi-transparent, unstacked step histograms, one colour per folded column
hist = base.mark_area(
    opacity=0.3,
    interpolate='step'
).encode(
    x=alt.X('Measurement_min:Q', bin='binned'),
    x2='Measurement_max:Q',
    y=alt.Y('count:Q', stack=None),
    color='Experiment:N'
)
# overlap = per bin, the smallest count over all folded columns (missing
# bins are imputed as 0), summed over all bins
overlap = base.transform_impute(
    impute='count',
    key='Measurement_min',
    value=0,
    groupby=['Experiment']
).transform_aggregate(
    overlap='min(count)',
    groupby=['Measurement_min']
).mark_bar().encode(
    x='sum(overlap):Q'
)
# vertical concatenation of both charts (rendered when evaluated in a notebook)
hist & overlap
#X_train.iloc[:, :1]
#X_train.iloc[:, :1].to_numpy()
#flattened1 = [val for sublist in X_train.iloc[:, :1].to_numpy() for val in sublist]
#flattened2 = [val for sublist in X_train.iloc[:, :2].to_numpy() for val in sublist]
#flattened3 = [val for sublist in X_train.iloc[:, :3].to_numpy() for val in sublist]
#flattened4 = [val for sublist in X_train.iloc[:, :4].to_numpy() for val in sublist]
#A = np.array([1.274580708,2.466224824,5.045757621,7.413716262,8.958855646,10.41325305,11.14150951,10.91949012,11.29095648,10.95054297,10.10976255,8.128781795,1.886568472])
#B = np.array([0,1.700493692,4.059243006,5.320899616,6.747120132,7.899067471,9.434997257,11.24520022,12.94569391,12.83598464,12.6165661,10.80636314,4.388370817])
#aa = X_train.to_numpy()
#bb =pd.DataFrame({'Trial B': np.random.normal(-2, 1, 1000)}).to_numpy()
#def histogram_intersection(h1, h2, h3, h4):
# sm = 0
# for i in range(len(flattened1)):
# sm += min(h1[i], h2[i], h3[i], h4[i])
#return sm
#print(histogram_intersection(flattened1, flattened2, flattened3, flattened4))
#print(np.sum(np.minimum(A,B)))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment