Merge branch 'logRegressionVisuals' of https://gitlab.cci.drexel.edu/pjm363/why-senior-project

9752b034 · Songheng Li · 4031465a · 7c636828 · 9752b034 · 9752b034
Commit 9752b034 authored 3 years ago by Songheng Li
--- a/examples/log_regression/__init__.py
+++ b/examples/log_regression/__init__.py
--- a/examples/log_regression/main.py
+++ b/examples/log_regression/main.py
+# import numpy as np
+# import config as config
+
+# from sklearn.tree import export_graphviz
+# from subprocess import call
+
+# from synthetic import *
+# from data_vis import *
+
+# from bokeh.io import curdoc
+# from bokeh.models import ColumnDataSource, Select, Slider, Plot, Scatter, Row, Column
+# from bokeh.palettes import Spectral6
+
+# #seeding random to 0
+# np.random.seed(0)
+
+# #getting data from synthetic generator
+# data = SyntheticData()
+
+# config.x, config.y = data.generator()
+# config.spectral = np.hstack([Spectral6] * 20)
+
+# colors = [config.spectral[i] for i in config.y]
+
+# config.source = ColumnDataSource(dict(x=config.x[:,0], y=config.x[:,1], colors=colors))
+
+# b = vis_synthetic()
+
+# clf_algorithms = [
+#     'Log Regression'
+# ]
+
+# algorithm_select = Select(value = 'Log Regression',
+#                           title='Select Algorithm:',
+#                           width=200,
+#                           options=clf_algorithms
+#                           )
+
+# # add to document
+# curdoc().add_root(Row(config.inputs, b))
+# curdoc().title = "Log Regression"
+
+
+import pandas as pd
+import numpy as np
+import math
+from sklearn import linear_model
+import matplotlib.pyplot as plt
+import seaborn as sns
+from lime import submodular_pick
+from sklearn import preprocessing
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from bokeh.io import output_file, show
+from bokeh.layouts import widgetbox
+#from bokeh.models.Column import widgetbox
+from bokeh.models.widgets import Div
+from bokeh.models.widgets import Paragraph
+from bokeh.models.widgets import PreText
+from bokeh.io import output_file, show
+from bokeh.plotting import figure
+from bokeh.models import Label
+import lime
+from lime.lime_tabular import LimeTabularExplainer
+from lime import submodular_pick
+import holoviews as hv
+
+# Load the Iris data set from a CSV expected in the current working directory.
+df = pd.read_csv("Iris.csv")
+# Drop the first column (presumably a row-id column -- TODO confirm against the CSV).
+df = df.iloc[:, 1:]
+
+# Feature matrix: every remaining column except the last one.
+X = df.iloc[:,0:-1]
+
+# Target: the last column, kept as a one-column DataFrame because of the `-1:` slice.
+Y = df.iloc[:,-1:]
+
+# Earlier name-based selection, kept for reference:
+#X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
+#X
+#Y = df['Species']
+# grabbing unique names used later as parameter for LIME
+#class_names = Y.unique()
+
+# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=7)
+
+# Explicit iteration cap handed to the logistic-regression solver.
+LOGREG_MAXITER = 8192
+model_logreg = linear_model.LogisticRegression(max_iter = LOGREG_MAXITER, verbose = 1)
+
+#max_iter = LOGREG_MAXITER, verbose = 1
+# Fit the model on the training split.
+model_logreg.fit(X_train, Y_train)
+#model_logreg.predict(X_test)
+# Class labels as learned by the fitted model; passed to LIME further down.
+class_names=model_logreg.classes_
+
+#model_logreg.classes_
+#model_logreg.predict(X_scaled_test)
+
+# Accuracy on the test split, first way. NOTE: the value is not stored or
+# printed, so in a plain script run it is computed and then discarded.
+accuracy_score(Y_test,model_logreg.predict(X_test))
+# Accuracy on the test split, second way (result likewise discarded).
+
+model_logreg.score(X_test, Y_test)
+# Take the test row at position 3 (the fourth row) and reshape it to a
+# single-sample 2-D array so the model can predict on it.
+ex_specie = np.array(X_test.iloc[3]).reshape(1,-1)
+
+# Prediction for that one record (result discarded here; it is recomputed where needed).
+model_logreg.predict(ex_specie)
+# lime is a library
+# lime_tabular is a module that contains functions that explain classifiers which use tabular data (matrices).
+# LimeTabularExplainer is a function that explains predictions of tabular (matrix) data.
+
+# Build a LIME explainer from the training data. Continuous features are
+# discretized, so explanations are phrased as range rules such as
+# "4.10 < PetalLengthCm <= 5.10".
+explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=class_names, discretize_continuous=True)
+print(explainer)
+
+# Explain the model's prediction for the test row at position 3. The
+# explanation is generated once only: LIME samples randomly, so a second call
+# would be wasted work and would not reproduce this result anyway.
+exp = explainer.explain_instance(X_test.iloc[3],model_logreg.predict_proba,num_features=4,top_labels=1)
+
+# With top_labels=1 the explanation only stores the single top predicted
+# label. as_list() defaults to label=1 and raises KeyError whenever the top
+# label is a different class index, so ask for the label that is actually there.
+explained_label = exp.available_labels()[0]
+
+# List of (rule text, weight) tuples for the explained label.
+tupleTest = exp.as_list(label=explained_label)
+#tupleTest
+# Turn every (rule, weight) tuple from LIME into a mutable two-item list.
+NewList = [list(pair) for pair in tupleTest]
+
+# First item of each pair: the rule text, used as the bar label.
+Labels1 = [pair[0] for pair in NewList]
+
+# Second item of each pair: the weight, used as the bar height.
+featureNums = [pair[1] for pair in NewList]
+
+# One x-axis position per feature, numbered from 1.
+count = len(featureNums)
+newList = list(range(1, count + 1))
+
+# Bar chart of the weights LIME assigned to each rule for the explained record.
+plt.bar(newList, featureNums, align='center')
+plt.xticks(newList, Labels1)
+plt.xticks(rotation=60, ha='right')
+plt.title("Feature Importance graph")
+plt.show()
+#y_positiond
+#NewList
+
+# Stringify both items of every [rule, weight] pair.
+doubleStrList = [list(map(str, pair)) for pair in NewList]
+
+# Whitespace-split the rule text and the weight text into separate token lists.
+doubleStrListFinal = [rule_text.split() for rule_text, _ in doubleStrList]
+doubleStriListFeature = [weight_text.split() for _, weight_text in doubleStrList]
+
+# Debug output: the stringified pairs, then the tokenized rules.
+print(doubleStrList)
+print(doubleStrListFinal)
+
+# Per feature: the rule tokens followed by the weight token.
+finalList = [
+    rule_tokens + weight_tokens
+    for rule_tokens, weight_tokens in zip(doubleStrListFinal, doubleStriListFeature)
+]
+
+# Number of explained features.
+lengthofList = len(finalList)
+
+# Training features and target side by side; the target ends up as the last column.
+result = pd.concat([X_train, Y_train], axis=1)
+resultColumns = result.columns.tolist()
+
+#resultColumns
+# One histogram grid per feature column of the training set: the x-axis is the
+# feature value, the y-axis is the count, and bars are coloured by the last
+# column of `result`.
+for feature_name in X_train.columns:
+    grid = sns.FacetGrid(result, hue=resultColumns[-1], height=4)
+    grid.map(sns.histplot, feature_name)
+    grid.add_legend()
+    
+# Turn every tokenized LIME rule in finalList into a plain-English sentence.
+#
+# Each entry is the whitespace-split rule followed by one weight token, e.g.
+#   ['4.10', '<', 'PetalLengthCm', '<=', '5.10', '0.21']   (two-sided rule)
+#   ['PetalLengthCm', '<=', '1.60', '0.35']                (one-sided rule)
+# Operators are located by value rather than by fixed position, so both shapes
+# are handled and no value from a previous iteration can leak into a sentence.
+
+# Wording for an operator written after the feature name ("feature <op> value").
+UPPER_BOUND_WORDS = {
+    '<': "less than",
+    '<=': "less than or equal to",
+    '>': "greater than",
+    '>=': "greater than or equal to",
+}
+# Wording for an operator written before the feature name ("value <op> feature"):
+# seen from the feature's side the comparison reads the other way round.
+LOWER_BOUND_WORDS = {
+    '<': "greater than",
+    '<=': "greater than or equal to",
+    '>': "less than",
+    '>=': "less than or equal to",
+}
+
+# The explained record does not change inside the loop, so predict its class once.
+predicted_species = model_logreg.predict(ex_specie)[0]
+
+for j in finalList:
+    # The last token is the weight; everything before it is the rule itself.
+    rule_tokens = j[:-1]
+    operator_positions = [pos for pos, token in enumerate(rule_tokens) if token in UPPER_BOUND_WORDS]
+
+    if len(operator_positions) == 2:
+        # Two-sided rule: <low> <op> <feature> <op> <high>
+        low_op, high_op = operator_positions
+        low_value = " ".join(rule_tokens[:low_op])
+        feature = " ".join(rule_tokens[low_op + 1:high_op])
+        high_value = " ".join(rule_tokens[high_op + 1:])
+        condition = (feature + " is " + LOWER_BOUND_WORDS[rule_tokens[low_op]] + " " + low_value
+                     + " and " + feature + " is " + UPPER_BOUND_WORDS[rule_tokens[high_op]] + " " + high_value)
+    elif len(operator_positions) == 1:
+        # One-sided rule: <feature> <op> <value>
+        only_op = operator_positions[0]
+        feature = " ".join(rule_tokens[:only_op])
+        bound_value = " ".join(rule_tokens[only_op + 1:])
+        condition = feature + " is " + UPPER_BOUND_WORDS[rule_tokens[only_op]] + " " + bound_value
+    else:
+        # Unrecognized shape (no comparison operator found): report the rule verbatim.
+        condition = " ".join(rule_tokens) + " holds"
+
+    # e.g. "... if PetalLengthCm is greater than 4.80, then species is Iris-virginica"
+    print("After viewing the distributions in the histograms, if " + condition + ", then species is "
+          + predicted_species)
+    print()
+    #print("Due to the value of "+ second, "being" , first, zero, "and\n", second, "is",
+     #     third, fourth + ", the feature importance score\n is: " + fifth)
+    #ptest = Paragraph(text="""Due to the value of """"+ second, """"being""" + first+ zero+ """and\n"""+ second+ """is"""+
+     #     third+ fourth + """the feature importance score\n is: """ + fifth, width=200, height=100)
+    #ExpList.append(ptest)
+ 
+    # check if feature importance is positive, then talk about that feature
+    
+# example output: the 'feature name' is less than 5.1 and greater than 2.3 with feature importance score of 0.3454
+
+# why does the range of values of a length determine a specific class?
+    # ex: '4.10', '<', 'PetalLengthCm', '<=', '5.10'
+# analyze petal length across all classes
+# histograms that describe the distribution across classes
+# get distribution of training set
+# 12 histograms
+
+        
+    
+    
+# example output: the 'feature name' is less than 5.1 and greater than 2.3 with feature importance score of 0.3454
+
+
+# Class probabilities the model produced for the explained record.
+predictProList = exp.predict_proba
+semiProbList = predictProList.tolist()
+
+# Text form of each probability. They stay raw fractions: the percentage
+# conversion that was once here is disabled.
+finalProbList = list(map(str, semiProbList))
+
+# Class names carried by the explanation, paired with the probability text
+# as [probability, class name].
+classList = exp.class_names
+probaClassList = [[prob_text, class_label] for prob_text, class_label in zip(finalProbList, classList)]
+
+# One x-axis position per class, numbered from 1.
+counts = len(classList)
+newProList = list(range(1, counts + 1))
+
+# Report every class together with its predicted probability.
+for prob_text, class_label in probaClassList:
+    print("The probability of the class being", class_label, "is:", prob_text)
+    print()
+
+
+# Bar chart of the predicted probability of each class for the explained record.
+plt.bar(newProList, semiProbList, align='center')
+# Label each bar position with its class name, then rotate the labels so they do not overlap.
+plt.xticks(newProList, classList)
+plt.xticks(rotation=60, ha='right')
+plt.title("Class Prediction Probability graph")
+plt.show()
+# show_in_notebook is a function of LIME that shows an HTML explanation in an IPython notebook.
+# Floating-point numbers on the horizontal bars represent the relative importance of the features.
+exp.show_in_notebook(show_table=True, show_all=False)
+
+#SP = submodular_pick.SubmodularPick(explainer, X_test.values, model_logreg.predict_proba, sample_size=45, num_exps_desired=3, num_features=4)
+#[exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in SP.sp_explanations];
+import holoviews as hv
+#hv.extension("bokeh")
+# Two small frames for charting: rule text vs. LIME weight, and class name vs. predicted probability.
+dfBokehChart = pd.DataFrame(list(zip(Labels1, featureNums)), columns =['Features', 'FeatureNumbers']) 
+dfBokehChartProb = pd.DataFrame(list(zip(classList, semiProbList)), columns =['Class Name', 'Probability Prediction']) 
+# Bare expressions: these only display when run interactively (e.g. in a notebook cell).
+dfBokehChart
+dfBokehChartProb
+# NOTE(review): `bars` is not referenced again in the visible part of this file.
+bars = hv.Bars(dfBokehChart, hv.Dimension('FeatureNumbers'), 'Features')
+#bars
+# Package versions that were tried while getting the plotting stack to work:
+#pip install holoviews --upgrade
+#pip install panel --upgrade
+#pip install bokeh==2.4.2
+#pip install holoviews==1.14.7
+#pip install pandas_bokeh
+import pandas_bokeh
+pandas_bokeh.output_notebook()
+# Route DataFrame plotting through pandas_bokeh.
+pd.set_option('plotting.backend', 'pandas_bokeh')
+# Create Bokeh-Table with DataFrame:
+from bokeh.models.widgets import DataTable, TableColumn
+from bokeh.models import ColumnDataSource
+# Output target for the feature-importance chart.
+output_file("bars.html")
+p_bar = dfBokehChart.plot_bokeh.bar(
+    y = 'FeatureNumbers',
+    x = 'Features',
+    ylabel="Feature Importance Score", 
+    title="Feature Importance Chart",
+    alpha=0.6)
+# Tilt the x-axis labels by pi/4 radians (45 degrees).
+p_bar.xaxis.major_label_orientation = np.pi / 4
+#mytext = Label(x=70, y=70, text='here your textaaaaaaaaaaaaa')
+p_bar1 = dfBokehChartProb.plot_bokeh.bar(
+    y = 'Probability Prediction',
+    x = 'Class Name',
+    ylabel="Predicted Probability By Percentage", 
+    title="Class Probability Chart",
+    alpha=0.6)
+p_bar1.xaxis.major_label_orientation = np.pi / 4
+
+#p_bar.add_layout(mytext)
+show(p_bar)
+    
+#show(widgetbox(ExpList))
+
+# Switch the output target before showing the class-probability chart.
+output_file("bars1.html")
+show( p_bar1)
+#output_file("div.html")
+    
+#pre = PreText(text="""Your text is initialized with the 'text' argument. The remaining Paragraph arguments are 'width' and 'height'.""",width=500, height=100)
+    
+#p = Paragraph(text="""Your text is initialized with the 'text' argument. The remaining Paragraph arguments are 'width' and 'height'""", width=200, height=100)
+    
+#div = Div(text="""Your <a href="https://en.wikipedia.org/wiki/HTML">HTML</a>-supported text is initialized with the <b>text</b> argument.  The remaining div arguments are <b>width</b> and <b>height</b>. For this example, those values are <i>200</i> and <i>100</i> respectively.""", width=200, height=100)
+    
+#show(widgetbox(ExpList))
+#%history -g
+ #X_train
+
+# ignore below
+import pandas as pd
+import altair as alt
+import numpy as np
+np.random.seed(42)
+
+# Generating Data
+#source = pd.DataFrame({'Trial A': np.random.normal(0, 0.8, 1000),
+ #                  'Trial B': np.random.normal(-2, 1, 1000)})
+#for i in X_train.columns:
+ #   sns.FacetGrid(result, hue=resultColumns[-1], height=3) \
+  #  .map(sns.histplot, i) \
+   # .add_legend()
+
+# Shared pipeline: fold the feature columns of X_train into long form
+# (Experiment = column name, Measurement = value), bin the measurements and
+# count the rows per bin and column.
+# The fold must name columns that exist in the charted frame. The previous
+# 'Trial A' / 'Trial B' names belonged to the synthetic example frame that is
+# commented out above, not to X_train, so they matched no column.
+base = alt.Chart(X_train).transform_fold(
+    list(X_train.columns),
+    ['Experiment', 'Measurement']
+).transform_bin(
+    field='Measurement',
+    bin=alt.Bin(maxbins=50),
+    as_=['Measurement_min', 'Measurement_max']
+).transform_aggregate(
+    count='count()',
+    groupby=['Measurement_min', 'Measurement_max', 'Experiment']
+)
+
+# Semi-transparent step histograms, one per folded column, drawn unstacked.
+hist = base.mark_area(
+    opacity=0.3,
+    interpolate='step'
+).encode(
+    x=alt.X('Measurement_min:Q', bin='binned'),
+    x2='Measurement_max:Q',
+    y=alt.Y('count:Q', stack=None),
+    color='Experiment:N'
+)
+
+# Overlap measure: fill missing bins with a count of 0 for every column, take
+# the minimum count per bin across columns, and show the sum as a single bar.
+overlap = base.transform_impute(
+    impute='count',
+    key='Measurement_min',
+    value=0,
+    groupby=['Experiment']
+).transform_aggregate(
+    overlap='min(count)',
+    groupby=['Measurement_min']
+).mark_bar().encode(
+    x='sum(overlap):Q'
+)
+
+# Combine the two charts. As a bare expression this only renders when run
+# interactively (e.g. as the last statement of a notebook cell).
+hist & overlap
+#X_train.iloc[:, :1]
+#X_train.iloc[:, :1].to_numpy()
+#flattened1 = [val for sublist in X_train.iloc[:, :1].to_numpy() for val in sublist]
+#flattened2 = [val for sublist in X_train.iloc[:, :2].to_numpy() for val in sublist]
+#flattened3 = [val for sublist in X_train.iloc[:, :3].to_numpy() for val in sublist]
+#flattened4 = [val for sublist in X_train.iloc[:, :4].to_numpy() for val in sublist]
+#A = np.array([1.274580708,2.466224824,5.045757621,7.413716262,8.958855646,10.41325305,11.14150951,10.91949012,11.29095648,10.95054297,10.10976255,8.128781795,1.886568472])
+#B = np.array([0,1.700493692,4.059243006,5.320899616,6.747120132,7.899067471,9.434997257,11.24520022,12.94569391,12.83598464,12.6165661,10.80636314,4.388370817])
+#aa = X_train.to_numpy()
+#bb =pd.DataFrame({'Trial B': np.random.normal(-2, 1, 1000)}).to_numpy()
+#def histogram_intersection(h1, h2, h3, h4):
+ #   sm = 0
+  #  for i in range(len(flattened1)):
+   #     sm += min(h1[i], h2[i], h3[i], h4[i])
+    #return sm
+
+#print(histogram_intersection(flattened1, flattened2, flattened3, flattened4))
+#print(np.sum(np.minimum(A,B)))
--- a/examples/log_regression/theme.yaml
+++ b/examples/log_regression/theme.yaml