Bonidia · natan-dot-com · Sep 30, 2022 · Sep 30, 2022 · Oct 1, 2022 · Oct 2, 2022
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 experimental/
 .idea/
 feat_extraction
+tmp/
+tests/
diff --git a/BioAutoML-binary.py b/BioAutoML-binary.py
@@ -12,6 +12,8 @@
 import joblib
 # import shutil
 import xgboost as xgb
+import matplotlib.pyplot as plt
+import shap
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import cross_val_predict
 #  from sklearn.metrics import multilabel_confusion_matrix
@@ -47,7 +49,12 @@
 from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
 from sklearn.preprocessing import LabelEncoder
 from tpot import TPOTClassifier
-
+from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from numpy.random import default_rng
+from interpretability_report import Report, REPORT_MAIN_TITLE_BINARY, REPORT_SHAP_PREAMBLE_BINARY, REPORT_SHAP_BAR_BINARY, \
+	REPORT_SHAP_BEESWARM_BINARY, REPORT_SHAP_WATERFALL_BINARY
 
 def header(output_header):
 
@@ -461,10 +468,122 @@ def save_prediction(prediction, nameseqs, pred_output):
 			file.write('\n')
 	return
 
+def type_model(explainer, model, data, labels):
+	"""
+	Compute the SHAP values of `data` and adapt them (and the labels) to the
+	form expected by the plotting functions, according to the model type.
+
+	Parameters:
+		explainer: callable applied to `data` to obtain the SHAP values.
+		model: fitted classifier; only its type is inspected here.
+		data: samples to be explained.
+		labels: labels associated with the problem.
+
+	Returns:
+		Tuple (shap_values, labels). For LGBMClassifier and
+		RandomForestClassifier the SHAP values are sliced with [:, :, 0];
+		for XGBClassifier the labels are returned integer-encoded. In every
+		other case both values are returned as received/computed.
+
+	Raises:
+		AssertionError: if `model` is not one of the four supported classifiers.
+	"""
+	xgbtype = "<class 'xgboost.sklearn.XGBClassifier'>"
+	cattype = "<class 'catboost.core.CatBoostClassifier'>"
+	lgbmtype = "<class 'lightgbm.sklearn.LGBMClassifier'>"
+	randtype = "<class 'sklearn.ensemble._forest.RandomForestClassifier'>"
+
+	shap_values = explainer(data)
+	model_type = str(type(model))
+
+	# Raised explicitly instead of using `assert`, so the check is not removed
+	# when Python runs with -O. The exception type is kept on purpose.
+	if model_type not in (lgbmtype, randtype, xgbtype, cattype):
+		raise AssertionError("Error: Model type don't expected ")
+
+	if model_type in (lgbmtype, randtype):
+		# NOTE(review): presumably the last axis holds one slice per class and
+		# only the first one is kept - confirm against the shap version in use.
+		shap_values = shap_values[:, :, 0]
+	if model_type == xgbtype:
+		# A throwaway encoder yields the same encoded values as before without
+		# re-fitting the module-level `le` used by the rest of the pipeline.
+		labels = LabelEncoder().fit_transform(labels)
+
+	return shap_values, labels
+
+def shap_waterf(explainer, model, X_test, X_label, path):
+	"""
+	Save SHAP waterfall plots for randomly chosen samples of each class.
+
+	Up to two distinct samples are drawn per class (fewer only when the class
+	has a single sample), and one PNG file is written to `path` for each.
+
+	Parameters:
+		explainer: explainer forwarded to `type_model`.
+		model: fitted classifier forwarded to `type_model`.
+		X_test: samples to explain; rows are selected with a boolean mask.
+		X_label: label of each row of `X_test`, in the same order.
+		path: directory where the PNG files are written.
+
+	Returns:
+		List with the path of every generated figure.
+
+	Raises:
+		AssertionError: if `X_label` does not contain exactly two classes.
+	"""
+	graphs_path = []
+	X_label = pd.DataFrame(data={'label': X_label})
+	classes = X_label.iloc[:, 0].unique()
+
+	# Raised explicitly instead of using `assert`, so the check is not removed
+	# when Python runs with -O. The exception type is kept on purpose.
+	if len(classes) != 2:
+		raise AssertionError(
+			"Error: Classes generated by the explainer of 'model' doesn't match the distinct number "
+			f"of classes in 'targets'. [Explainer={2}, Target={len(classes)}]"
+		)
+
+	for class_value in classes:
+		# Positional mask: labels follow the row order of X_test, so index
+		# alignment between the two objects must not interfere.
+		mask = (X_label.label == class_value).values
+		subset = X_test[mask]
+
+		# Only the SHAP values are needed here; `classes` must stay untouched
+		# because it drives both the row selection and the file names.
+		shap_values, _ = type_model(explainer, model, subset, classes)
+
+		# Draw from every row of the subset (including the first one) and never
+		# request more samples than the class actually has.
+		n_samples = subset.shape[0]
+		chosen = default_rng().choice(n_samples, size=min(2, n_samples), replace=False)
+
+		for j in chosen:
+			waterfall_name = 'class_' + str(class_value) + '_sample_' + str(j)
+			local_name = os.path.join(path, f"{waterfall_name}.png")
+			plt.title(waterfall_name, fontsize=16)
+			sp = shap.plots.waterfall(shap_values[j], show=False)
+			plt.savefig(local_name, dpi=300, bbox_inches='tight')
+			plt.close(sp)
+			graphs_path.append(local_name)
+
+	return graphs_path
+
+
+def shap_bar(shap_values, path, fig_name):
+	"""
+	Save the SHAP bar plot of `shap_values` as a PNG file.
+
+	Parameters:
+		shap_values: SHAP values passed to `shap.plots.bar`.
+		path: directory where the figure is written.
+		fig_name: figure title, also used as the file name (without extension).
+
+	Returns:
+		Path of the generated PNG file.
+	"""
+	local_name = os.path.join(path, f"{fig_name}.png")
+	plt.title(fig_name, fontsize=16)
+	shap.plots.bar(shap_values, show=False)
+	plt.savefig(local_name, dpi=300, bbox_inches='tight')
+	# Close the current figure (the one just saved). The value returned by the
+	# shap call is not reliably a Figure, so it is not handed to plt.close().
+	plt.close()
+	return local_name
+
+def shap_beeswarm(shap_values, path, fig_name):
+	"""
+	Save the SHAP beeswarm plot of `shap_values` as a PNG file.
+
+	Parameters:
+		shap_values: SHAP values passed to `shap.plots.beeswarm`.
+		path: directory where the figure is written.
+		fig_name: figure title, also used as the file name (without extension).
+
+	Returns:
+		Path of the generated PNG file.
+	"""
+	local_name = os.path.join(path, f"{fig_name}.png")
+	plt.title(fig_name, fontsize=16)
+	shap.plots.beeswarm(shap_values, show=False)
+	plt.savefig(local_name, dpi=300, bbox_inches='tight')
+	# Close the current figure (the one just saved). The value returned by the
+	# shap call is not reliably a Figure, so it is not handed to plt.close().
+	plt.close()
+	return local_name
+
+
+def interp_shap(model, X_test, X_label,output,path='explanations'):
+	"""
+	Generate every SHAP interpretability plot (bar, beeswarm and waterfall).
+
+	Parameters:
+		model: fitted tree-based classifier handed to `shap.TreeExplainer`.
+		X_test: samples to explain.
+		X_label: label of each sample.
+		output: base output directory.
+		path: name of the sub-directory of `output` that receives the figures.
+
+	Returns:
+		Dictionary mapping 'bar_graph', 'beeswarm_graph' and 'waterfall_graph'
+		to the list of generated figure paths.
+	"""
+	target_dir = os.path.join(output, path)
+	tree_explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
+	values, adjusted_labels = type_model(tree_explainer, model, X_test, X_label)
+
+	# Reuse the directory when it is already there; otherwise create it.
+	if os.path.exists(target_dir):
+		print(f"Directory {target_dir} already exists. Will proceed using it...")
+	else:
+		print(f"Creating explanations directory: {target_dir}...")
+		os.mkdir(target_dir)
+
+	# Entries are evaluated top to bottom, so the plots are produced in the
+	# order bar -> beeswarm -> waterfall.
+	return {
+		'bar_graph': [shap_bar(values, target_dir, fig_name='bar_graph')],
+		'beeswarm_graph': [shap_beeswarm(values, target_dir, fig_name='beeswarm_graph')],
+		'waterfall_graph': shap_waterf(tree_explainer, model, X_test, adjusted_labels, target_dir),
+	}
+
+
+def build_interpretability_report(generated_plt=[], report_name="interpretability.pdf", directory="."):
+	"""
+	Assemble the interpretability report from the generated SHAP figures.
+
+	Parameters:
+		generated_plt: mapping with the keys 'bar_graph', 'beeswarm_graph' and
+			'waterfall_graph', each holding the figure paths of that section.
+		report_name: file name given to the report.
+		directory: directory passed to `Report` for the output.
+	"""
+	doc = Report(report_name, directory=directory)
+	# Directory that contains this script; the logo is looked up relative to it.
+	script_dir = os.path.dirname(os.path.abspath(__file__))
+	logo_path = os.path.join(script_dir, "img/BioAutoML.png")
+
+	doc.insert_doc_header(REPORT_MAIN_TITLE_BINARY, logo_fig=logo_path)
+	doc.insert_text_on_doc(REPORT_SHAP_PREAMBLE_BINARY, font_size=14)
+
+	# One section per plot type: the figures first, then the explanatory text.
+	sections = (
+		('bar_graph', REPORT_SHAP_BAR_BINARY, 14),
+		('beeswarm_graph', REPORT_SHAP_BEESWARM_BINARY, 12),
+		('waterfall_graph', REPORT_SHAP_WATERFALL_BINARY, 12),
+	)
+	for plot_key, description, text_size in sections:
+		doc.insert_figure_on_doc(generated_plt[plot_key])
+		doc.insert_text_on_doc(description, font_size=text_size)
+
+	doc.build()
+
 
 def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tuning, output):
 
-	global clf, train, train_labels
+	global clf, train, train_labels, le
 
 	if not os.path.exists(output):
 		os.mkdir(output)
@@ -578,19 +697,22 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
 			if imbalance_data is True:
 				train, train_labels = imbalanced_function(clf, train, train_labels)
 	elif classifier == 3:
-		if tuning is True:
-			print('Tuning: ' + str(tuning))
-			print('Classifier: XGBClassifier')
-			clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
-			if imbalance_data is True:
-				train, train_labels = imbalanced_function(clf, train, train_labels)
-			print('Tuning not yet available for XGBClassifier.')
-		else:
-			print('Tuning: ' + str(tuning))
-			print('Classifier: XGBClassifier')
-			clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
-			if imbalance_data is True:
-				train, train_labels = imbalanced_function(clf, train, train_labels)
+                le = LabelEncoder()
+                train_labels = le.fit_transform(train_labels)
+                test_labels = le.fit_transform(test_labels)
+                if tuning is True:
+                        print('Tuning: ' + str(tuning))
+                        print('Classifier: XGBClassifier')
+                        clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
+                        if imbalance_data is True:
+                                train, train_labels = imbalanced_function(clf, train, train_labels)
+                        print('Tuning not yet available for XGBClassifier.')
+                else:
+                        print('Tuning: ' + str(tuning))
+                        print('Classifier: XGBClassifier')
+                        clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
+                        if imbalance_data is True:
+                                train, train_labels = imbalanced_function(clf, train, train_labels)
 	else:
 		sys.exit('This classifier option does not exist - Try again')
 
@@ -635,6 +757,22 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
 	print('Saving trained model in ' + model_output + '...')
 	print('Training: Finished...')
 
+	"""Generating Interpretability Summary """
+
+	try:    
+		generated_plt = interp_shap(clf, train, train_labels,output) 
+		build_interpretability_report(generated_plt=generated_plt, directory=output)
+	except ValueError as e:
+		print(e)
+		print("If you believe this is a bug, please report it to https://github.com/Bonidia/BioAutoML.")
+		print("Generation of explanation plots and report failed. Proceeding without it...")
+	except AssertionError as e:
+		print(e)
+		print("This is certainly a bug. Please report it to https://github.com/Bonidia/BioAutoML.")
+		print("Generation of explanation plots and report failed. Proceeding without it...")
+	else:
+		print("Explanation plots and report generated successfully!")
+
 	"""Generating Feature Importance - Selected feature subset..."""
 
 	print('Generating Feature Importance - Selected feature subset...')
@@ -643,6 +781,7 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
 	print('Saving results in ' + importance_output + '...')
 
 	"""Testing model..."""
+	#test_labels = le.fit_transform(test_labels)
 
 	if os.path.exists(ftest) is True:
 		print('Generating Performance Test...')

diff --git a/BioAutoML-env.yml b/BioAutoML-env.yml
@@ -6,33 +6,41 @@ dependencies:
   - _libgcc_mutex=0.1=main
   - _openmp_mutex=4.5=1_gnu
   - _py-xgboost-mutex=2.0=cpu_0
+  - backcall=0.2.0=pyhd3eb1b0_0
   - biopython=1.78=py37h4abf009_1
   - blas=2.14=openblas
   - brotlipy=0.7.0=py37hb5d75c8_1001
   - bzip2=1.0.8=h516909a_3
-  - ca-certificates=2021.10.26=h06a4308_2
+  - ca-certificates=2022.10.11=h06a4308_0
   - cairo=1.16.0=h3fc0475_1005
   - catboost=1.0.3=py37h89c1867_1
-  - certifi=2021.10.8=py37h89c1867_1
+  - certifi=2022.9.24=py37h06a4308_0
   - cffi=1.15.0=py37h7f8727e_0
   - charset-normalizer=2.0.9=pyhd8ed1ab_0
   - cloudpickle=2.0.0=pyhd8ed1ab_0
   - colorama=0.4.4=pyh9f0ad1d_0
   - cryptography=36.0.0=py37h9ce1e76_0
+  - cycler=0.11.0=pyhd3eb1b0_0
   - deap=1.3.1=py37h9fdb41a_2
   - decorator=5.1.0=pyhd8ed1ab_0
   - fontconfig=2.13.1=h7e3eb15_1002
   - freetype=2.10.4=h7ca028e_0
   - future=0.18.2=py37h89c1867_4
   - gettext=0.19.8.1=hf34092f_1004
+  - giflib=5.2.1=h7b6447c_0
   - glib=2.58.3=py37he00f558_1004
   - gmp=6.2.1=h58526e2_0
   - hyperopt=0.2.5=pyh9f0ad1d_0
   - icu=67.1=he1b5a44_0
   - idna=3.3=pyhd3eb1b0_0
   - igraph=0.8.3=hef4adab_1
   - imbalanced-learn=0.8.1=pyhd8ed1ab_0
+  - ipython=7.31.1=py37h06a4308_1
+  - jedi=0.18.1=py37h06a4308_1
   - joblib=1.1.0=pyhd8ed1ab_0
+  - jpeg=9e=h7f8727e_0
+  - kiwisolver=1.4.2=py37h295c915_0
+  - lcms2=2.12=h3be6417_0
   - libblas=3.8.0=14_openblas
   - libcblas=3.8.0=14_openblas
   - libedit=3.1.20191231=he28a2e2_2
@@ -44,50 +52,77 @@ dependencies:
   - libiconv=1.16=h516909a_0
   - liblapack=3.8.0=14_openblas
   - liblapacke=3.8.0=14_openblas
+  - libllvm11=11.1.0=h3826bc1_1
   - libopenblas=0.3.7=h5ec1e0e_6
   - libpng=1.6.37=h21135ba_2
   - libstdcxx-ng=9.1.0=hdf63c60_0
+  - libtiff=4.2.0=h85742a9_0
   - libuuid=2.32.1=h14c3975_1000
+  - libwebp=1.2.2=h55f646e_0
+  - libwebp-base=1.2.2=h7f8727e_0
   - libxcb=1.13=h14c3975_1002
   - libxgboost=1.5.0=h295c915_1
   - libxml2=2.9.10=h68273f3_2
   - libzlib=1.2.11=h36c2ea0_1013
   - lightgbm=3.2.1=py37h295c915_0
+  - llvmlite=0.38.0=py37h4ff587b_0
+  - lz4-c=1.9.3=h295c915_1
+  - matplotlib=3.2.2=1
+  - matplotlib-base=3.2.2=py37h1d35a4c_1
+  - matplotlib-inline=0.1.6=py37h06a4308_0
   - ncurses=6.2=h58526e2_4
   - networkx=2.6.3=pyhd3eb1b0_0
+  - numba=0.55.1=py37h51133e4_0
   - numpy=1.19.2=py37h7008fea_1
-  - openssl=1.1.1o=h7f8727e_0
+  - openssl=1.1.1s=h7f8727e_0
+  - orderedset=2.0.3=py37h8f50634_3
   - pandas=0.25.3=py37hb3f55d8_0
+  - parso=0.8.3=pyhd3eb1b0_0
   - pcre=8.44=he1b5a44_0
+  - pexpect=4.8.0=pyhd3eb1b0_3
+  - pickleshare=0.7.5=pyhd3eb1b0_1003
+  - pillow=9.0.1=py37h22f2fdc_0
   - pip=21.0.1=pyhd8ed1ab_0
   - pixman=0.38.0=h516909a_1003
+  - prompt-toolkit=3.0.20=pyhd3eb1b0_0
   - pthread-stubs=0.4=h36c2ea0_1001
+  - ptyprocess=0.7.0=pyhd3eb1b0_2
   - py-xgboost=1.5.0=py37h06a4308_1
   - pycairo=1.20.0=py37h01af8b0_1
   - pycparser=2.21=pyhd8ed1ab_0
+  - pygments=2.11.2=pyhd3eb1b0_0
   - pymongo=3.12.0=py37h295c915_0
   - pyopenssl=21.0.0=pyhd8ed1ab_0
+  - pyparsing=3.0.9=py37h06a4308_0
   - pysocks=1.7.1=py37h89c1867_4
   - python=3.7.3=h5b0a415_0
   - python-dateutil=2.8.0=py_0
   - python-igraph=0.8.3=py37h340e831_2
   - python_abi=3.7=2_cp37m
   - pytz=2021.1=pyhd8ed1ab_0
   - readline=7.0=hf8c457e_1001
+  - reportlab=3.5.67=py37hfdd840d_1
   - requests=2.26.0=pyhd8ed1ab_1
   - scikit-learn=1.0.1=py37h51133e4_0
   - scipy=1.6.1=py37hf56f3a7_0
   - setuptools=52.0.0=py37h06a4308_1
+  - shap=0.39.0=py37h51133e4_0
   - six=1.15.0=pyh9f0ad1d_0
+  - slicer=0.0.7=pyhd3eb1b0_0
   - sqlite=3.33.0=h62c20be_0
   - stopit=1.1.2=py_0
+  - tbb=2021.5.0=hd09550d_0
   - texttable=1.6.3=pyh9f0ad1d_0
   - threadpoolctl=3.0.0=pyh8a188c0_0
   - tk=8.6.10=h21135ba_1
+  - tornado=6.1=py37h27cfd23_0
   - tpot=0.11.7=pyhd8ed1ab_1
   - tqdm=4.62.3=pyhd8ed1ab_0
+  - traitlets=5.1.1=pyhd3eb1b0_0
+  - typing_extensions=4.3.0=py37h06a4308_0
   - update_checker=0.18.0=pyh9f0ad1d_0
   - urllib3=1.26.7=pyhd8ed1ab_0
+  - wcwidth=0.2.5=pyhd3eb1b0_0
   - wheel=0.36.2=pyhd3deb0d_0
   - xorg-kbproto=1.0.7=h14c3975_1002
   - xorg-libice=1.0.10=h516909a_0
@@ -102,4 +137,4 @@ dependencies:
   - xorg-xproto=7.0.31=h14c3975_1007
   - xz=5.2.5=h516909a_1
   - zlib=1.2.11=h36c2ea0_1013
-prefix: /home/robson/miniconda3/envs/bioautoml
+  - zstd=1.4.9=haebb681_0
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,5 @@ @@
     experimental/
     .idea/
     feat_extraction
+    tmp/
+    tests/