Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
283a9f0
Update BioAutoML-binary.py
0nurB Sep 30, 2022
e039247
Update BioAutoML-binary.py
0nurB Sep 30, 2022
7025914
Update BioAutoML-binary.py
0nurB Oct 1, 2022
a4ff448
Add shap plot functions into multiclass pipeline
natan-dot-com Oct 2, 2022
3f0240a
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Oct 2, 2022
af16163
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Oct 2, 2022
acd38ca
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Oct 2, 2022
8b6c5fd
Update BioAutoML-binary.py
0nurB Oct 4, 2022
60a87f7
Integrate multiclass plot functions with pipeline
natan-dot-com Oct 5, 2022
a614d87
Generate multiclass example (IncRNA)
natan-dot-com Oct 5, 2022
0960caf
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Oct 5, 2022
e2c4d77
Fix plt.savefig cropping image
natan-dot-com Oct 5, 2022
aaf750c
Update BioAutoML-binary.py
0nurB Oct 13, 2022
cbfefc7
Some functions comments
0nurB Oct 17, 2022
36330f6
Adding the summary function still in portuguese
0nurB Oct 17, 2022
e6d561b
Solving indentation problem
0nurB Oct 17, 2022
99a70e0
Add exception handling and procedure messages
natan-dot-com Oct 18, 2022
5398414
Fix error handling with proper error structures
natan-dot-com Oct 19, 2022
a8079e4
Add interpretability report class
natan-dot-com Oct 19, 2022
103d9f6
Remove logging and handler
natan-dot-com Oct 19, 2022
eb43c87
Add table support in report class
natan-dot-com Oct 31, 2022
fa379ed
Add table support in report class
natan-dot-com Oct 31, 2022
85bc6a1
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Oct 31, 2022
e8f1996
Create report generator function
natan-dot-com Nov 8, 2022
71c9b4b
Add header function and text width support
natan-dot-com Nov 13, 2022
6848883
Finish multiclass report generator function
natan-dot-com Nov 13, 2022
0f74e46
Add header function documentation
natan-dot-com Nov 13, 2022
33337ab
Improve random sampling function
natan-dot-com Nov 15, 2022
52cd826
function update
0nurB Nov 16, 2022
2352ab5
new imports
0nurB Nov 16, 2022
a57b62f
new summary descriptions
0nurB Nov 16, 2022
9a1759b
removing some imports
0nurB Nov 16, 2022
716ca80
Changing 'train' datasets to 'test' datasets
0nurB Nov 16, 2022
7c261e6
Update interpretability_report.py
0nurB Nov 19, 2022
c8001a5
function update
0nurB Nov 21, 2022
e1c34b6
Changing names
0nurB Nov 21, 2022
0228a33
change names
0nurB Nov 21, 2022
e964775
Update BioAutoML-binary.py
0nurB Nov 23, 2022
64d2ac2
removed some comments
0nurB Nov 24, 2022
87c9569
Refactor main code directories and branches
natan-dot-com Nov 24, 2022
9c5beca
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Nov 24, 2022
9f695ac
Fix plot images resize bug
natan-dot-com Nov 26, 2022
52c9c98
Better report's main text
natan-dot-com Nov 26, 2022
81e5e2a
Fix class ordering in plot generation
natan-dot-com Nov 29, 2022
9de7c15
Add empty list assert
natan-dot-com Nov 29, 2022
6dfae02
Remove debug options
natan-dot-com Nov 29, 2022
74638cd
fix some bugs
0nurB Dec 8, 2022
a7cf1ca
Fix sample type bug on multiclass
natan-dot-com Dec 28, 2022
81593a0
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML
natan-dot-com Dec 28, 2022
99944fd
Recreate conda environment file
natan-dot-com Dec 28, 2022
28c9fc4
fix xgboost
0nurB Feb 1, 2023
b6adf4f
fix some bugs
0nurB Feb 2, 2023
11f6ca3
Add label encoder step on pipeline
natan-dot-com Feb 10, 2023
2520dcb
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML int…
natan-dot-com Feb 10, 2023
fea4116
Delete .~lock.test.csv#
natan-dot-com Feb 10, 2023
1025d5e
Restore example_results
natan-dot-com Feb 11, 2023
b5a795c
Merge branch 'main' of https://github.com/natan-dot-com/BioAutoML int…
natan-dot-com Feb 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
experimental/
.idea/
feat_extraction
tmp/
tests/
169 changes: 154 additions & 15 deletions BioAutoML-binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import joblib
# import shutil
import xgboost as xgb
import matplotlib.pyplot as plt
import shap
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
# from sklearn.metrics import multilabel_confusion_matrix
Expand Down Expand Up @@ -47,7 +49,12 @@
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier

from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from numpy.random import default_rng
from interpretability_report import Report, REPORT_MAIN_TITLE_BINARY, REPORT_SHAP_PREAMBLE_BINARY, REPORT_SHAP_BAR_BINARY, \
REPORT_SHAP_BEESWARM_BINARY, REPORT_SHAP_WATERFALL_BINARY

def header(output_header):

Expand Down Expand Up @@ -461,10 +468,122 @@ def save_prediction(prediction, nameseqs, pred_output):
file.write('\n')
return

def type_model(explainer, model, data, labels):
    """
    Compute the SHAP values of `data` and adapt them (and `labels`) to the
    form expected by the plotting functions, depending on the model class.

    Parameters:
        explainer: callable applied to `data` to obtain the SHAP values.
        model: fitted classifier; must be an XGBClassifier, CatBoostClassifier,
            LGBMClassifier or RandomForestClassifier.
        data: samples to explain; passed to `explainer` untouched.
        labels: labels associated with the problem.

    Returns:
        (shap_values, labels). For LightGBM and random forest models only
        index 0 of the third axis of the SHAP values is kept. For XGBoost
        models the labels are returned integer-encoded. Otherwise both
        values are returned as received/computed.

    Raises:
        AssertionError: if `model` is not one of the supported classifiers.
    """
    # Models are recognised by the printed form of their class, so none of
    # the boosting libraries has to be imported just to perform the check.
    xgbtype = "<class 'xgboost.sklearn.XGBClassifier'>"
    cattype = "<class 'catboost.core.CatBoostClassifier'>"
    lgbmtype = "<class 'lightgbm.sklearn.LGBMClassifier'>"
    randtype = "<class 'sklearn.ensemble._forest.RandomForestClassifier'>"
    model_type = str(type(model))

    # Validate before running the explainer, and raise explicitly so the
    # check is not stripped by `python -O`. AssertionError is kept because
    # binary_pipeline reports that exception type as an internal bug.
    if model_type not in (xgbtype, cattype, lgbmtype, randtype):
        raise AssertionError("Error: Model type don't expected ")

    shap_values = explainer(data)

    if model_type in (lgbmtype, randtype):
        # NOTE(review): the third axis is presumably the per-class axis of
        # the explanation -- confirm. Only its first entry is kept.
        shap_values = shap_values[:, :, 0]
    elif model_type == xgbtype:
        # Use a fresh encoder instead of the module-level `le`: that global
        # only exists after binary_pipeline took the XGBoost branch, and
        # refitting it here silently overwrote the classes it had learned.
        from sklearn.preprocessing import LabelEncoder
        labels = LabelEncoder().fit_transform(labels)

    return shap_values, labels

def shap_waterf(explainer, model, X_test, X_label, path):
    """
    Draw waterfall plots for up to two randomly chosen samples of each of
    the two classes of the problem.

    Parameters:
        explainer: SHAP explainer forwarded to `type_model`.
        model: fitted classifier forwarded to `type_model`.
        X_test: samples to explain; filtered per class with a boolean mask.
        X_label: label of each sample in `X_test`.
        path: directory where the PNG files are written.

    Returns:
        List with the paths of the generated PNG files.

    Raises:
        AssertionError: if `X_label` does not hold exactly two distinct classes.
    """
    graphs_path = []
    X_label = pd.DataFrame(data={'label': X_label})
    classes = X_label.iloc[:, 0].unique()

    # Raised explicitly so the check is not stripped by `python -O`;
    # binary_pipeline handles AssertionError for this step.
    if len(classes) != 2:
        raise AssertionError(
            "Error: Classes generated by the explainer of 'model' doesn't match the distinct number " +
            f"of classes in 'targets'. [Explainer={2}, Target={len(classes)}]"
        )

    for label in classes:
        # Subset holding only the samples of the current class.
        subset = X_test[X_label.label == label]

        # The labels returned by type_model are deliberately discarded:
        # overwriting `classes` inside the loop changed the values the next
        # iteration filtered on.
        shap_values, _ = type_model(explainer, model, subset, classes)

        # Sample over every row of the subset (row 0 included) and never ask
        # for more rows than exist, so small classes no longer raise ValueError.
        n_rows = subset.shape[0]
        numbers = default_rng().choice(n_rows, size=min(2, n_rows), replace=False)

        for j in numbers:
            waterfall_name = 'class_' + str(label) + '_sample_' + str(j)
            local_name = os.path.join(path, f"{waterfall_name}.png")
            plt.title(waterfall_name, fontsize=16)
            shap.plots.waterfall(shap_values[j], show=False)
            plt.savefig(local_name, dpi=300, bbox_inches='tight')
            # Close the current figure rather than shap's return value, whose
            # type is not guaranteed to be something plt.close accepts.
            plt.close()
            graphs_path.append(local_name)

    return graphs_path


def shap_bar(shap_values, path, fig_name):
    """
    Save the SHAP bar plot of `shap_values` as `<path>/<fig_name>.png`.

    Parameters:
        shap_values: SHAP values forwarded to `shap.plots.bar`.
        path: directory where the image is written.
        fig_name: file name (without extension), also used as the plot title.

    Returns:
        Path of the saved PNG file.
    """
    local_name = os.path.join(path, f"{fig_name}.png")
    plt.title(fig_name, fontsize=16)
    shap.plots.bar(shap_values, show=False)
    plt.savefig(local_name, dpi=300, bbox_inches='tight')
    # Close the current figure instead of handing shap's return value to
    # plt.close. NOTE(review): what shap returns with show=False appears to
    # be version dependent (None, Figure or Axes) and plt.close rejects Axes.
    plt.close()
    return local_name

def shap_beeswarm(shap_values, path, fig_name):
    """
    Save the SHAP beeswarm plot of `shap_values` as `<path>/<fig_name>.png`.

    Parameters:
        shap_values: SHAP values forwarded to `shap.plots.beeswarm`.
        path: directory where the image is written.
        fig_name: file name (without extension), also used as the plot title.

    Returns:
        Path of the saved PNG file.
    """
    local_name = os.path.join(path, f"{fig_name}.png")
    plt.title(fig_name, fontsize=16)
    shap.plots.beeswarm(shap_values, show=False)
    plt.savefig(local_name, dpi=300, bbox_inches='tight')
    # Close the current figure instead of handing shap's return value to
    # plt.close. NOTE(review): what shap returns with show=False appears to
    # be version dependent (None, Figure or Axes) and plt.close rejects Axes.
    plt.close()
    return local_name


def interp_shap(model, X_test, X_label, output, path='explanations'):
    """
    Generate every SHAP interpretability plot (bar, beeswarm and waterfall)
    for `model` inside the directory `<output>/<path>`.

    Returns a dictionary mapping 'bar_graph', 'beeswarm_graph' and
    'waterfall_graph' to the list of image paths generated for each plot type.
    """
    path = os.path.join(output, path)
    explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")

    # SHAP values are computed before the directory is touched.
    shap_values, X_label = type_model(explainer, model, X_test, X_label)

    if os.path.exists(path):
        print(f"Directory {path} already exists. Will proceed using it...")
    else:
        print(f"Creating explanations directory: {path}...")
        os.mkdir(path)

    # Plots are produced in this order: bar, beeswarm, waterfall.
    bar_png = shap_bar(shap_values, path, fig_name='bar_graph')
    beeswarm_png = shap_beeswarm(shap_values, path, fig_name='beeswarm_graph')
    waterfall_pngs = shap_waterf(explainer, model, X_test, X_label, path)

    return {
        'bar_graph': [bar_png],
        'beeswarm_graph': [beeswarm_png],
        'waterfall_graph': waterfall_pngs,
    }


def build_interpretability_report(generated_plt=None, report_name="interpretability.pdf", directory="."):
    """
    Build the PDF interpretability report from the previously generated plots.

    Parameters:
        generated_plt: mapping with the keys 'bar_graph', 'beeswarm_graph' and
            'waterfall_graph', each holding the image paths of that plot type
            (the structure returned by `interp_shap`).
        report_name: file name of the generated report.
        directory: directory handed to `Report` for the output.

    Raises:
        ValueError: if `generated_plt` lacks any of the required plot entries.
    """
    # The former default ([]) could never be indexed by the string keys used
    # below. Validate up front so a bad argument fails before a report is
    # partially built, with an error type binary_pipeline already handles.
    if generated_plt is None:
        generated_plt = {}
    required = ('bar_graph', 'beeswarm_graph', 'waterfall_graph')
    missing = [key for key in required if key not in generated_plt]
    if missing:
        raise ValueError(
            "Error: missing plots for the interpretability report: " + ", ".join(missing)
        )

    report = Report(report_name, directory=directory)
    # Directory containing this script; the logo path is resolved relative to it.
    root_dir = os.path.abspath(os.path.join(__file__, os.pardir))

    report.insert_doc_header(REPORT_MAIN_TITLE_BINARY, logo_fig=os.path.join(root_dir, "img/BioAutoML.png"))
    report.insert_text_on_doc(REPORT_SHAP_PREAMBLE_BINARY, font_size=14)

    report.insert_figure_on_doc(generated_plt['bar_graph'])
    report.insert_text_on_doc(REPORT_SHAP_BAR_BINARY, font_size=14)

    report.insert_figure_on_doc(generated_plt['beeswarm_graph'])
    report.insert_text_on_doc(REPORT_SHAP_BEESWARM_BINARY, font_size=12)

    report.insert_figure_on_doc(generated_plt['waterfall_graph'])
    report.insert_text_on_doc(REPORT_SHAP_WATERFALL_BINARY, font_size=12)

    report.build()


def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tuning, output):

global clf, train, train_labels
global clf, train, train_labels, le

if not os.path.exists(output):
os.mkdir(output)
Expand Down Expand Up @@ -578,19 +697,22 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
if imbalance_data is True:
train, train_labels = imbalanced_function(clf, train, train_labels)
elif classifier == 3:
if tuning is True:
print('Tuning: ' + str(tuning))
print('Classifier: XGBClassifier')
clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
if imbalance_data is True:
train, train_labels = imbalanced_function(clf, train, train_labels)
print('Tuning not yet available for XGBClassifier.')
else:
print('Tuning: ' + str(tuning))
print('Classifier: XGBClassifier')
clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
if imbalance_data is True:
train, train_labels = imbalanced_function(clf, train, train_labels)
le = LabelEncoder()
train_labels = le.fit_transform(train_labels)
test_labels = le.fit_transform(test_labels)
if tuning is True:
print('Tuning: ' + str(tuning))
print('Classifier: XGBClassifier')
clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
if imbalance_data is True:
train, train_labels = imbalanced_function(clf, train, train_labels)
print('Tuning not yet available for XGBClassifier.')
else:
print('Tuning: ' + str(tuning))
print('Classifier: XGBClassifier')
clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=63)
if imbalance_data is True:
train, train_labels = imbalanced_function(clf, train, train_labels)
else:
sys.exit('This classifier option does not exist - Try again')

Expand Down Expand Up @@ -635,6 +757,22 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
print('Saving trained model in ' + model_output + '...')
print('Training: Finished...')

"""Generating Interpretability Summary """

try:
generated_plt = interp_shap(clf, train, train_labels,output)
build_interpretability_report(generated_plt=generated_plt, directory=output)
except ValueError as e:
print(e)
print("If you believe this is a bug, please report it to https://github.com/Bonidia/BioAutoML.")
print("Generation of explanation plots and report failed. Proceeding without it...")
except AssertionError as e:
print(e)
print("This is certainly a bug. Please report it to https://github.com/Bonidia/BioAutoML.")
print("Generation of explanation plots and report failed. Proceeding without it...")
else:
print("Explanation plots and report generated successfully!")

"""Generating Feature Importance - Selected feature subset..."""

print('Generating Feature Importance - Selected feature subset...')
Expand All @@ -643,6 +781,7 @@ def binary_pipeline(test, test_labels, test_nameseq, norm, fs, classifier, tunin
print('Saving results in ' + importance_output + '...')

"""Testing model..."""
#test_labels = le.fit_transform(test_labels)

if os.path.exists(ftest) is True:
print('Generating Performance Test...')
Expand Down
43 changes: 39 additions & 4 deletions BioAutoML-env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,41 @@ dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- _py-xgboost-mutex=2.0=cpu_0
- backcall=0.2.0=pyhd3eb1b0_0
- biopython=1.78=py37h4abf009_1
- blas=2.14=openblas
- brotlipy=0.7.0=py37hb5d75c8_1001
- bzip2=1.0.8=h516909a_3
- ca-certificates=2021.10.26=h06a4308_2
- ca-certificates=2022.10.11=h06a4308_0
- cairo=1.16.0=h3fc0475_1005
- catboost=1.0.3=py37h89c1867_1
- certifi=2021.10.8=py37h89c1867_1
- certifi=2022.9.24=py37h06a4308_0
- cffi=1.15.0=py37h7f8727e_0
- charset-normalizer=2.0.9=pyhd8ed1ab_0
- cloudpickle=2.0.0=pyhd8ed1ab_0
- colorama=0.4.4=pyh9f0ad1d_0
- cryptography=36.0.0=py37h9ce1e76_0
- cycler=0.11.0=pyhd3eb1b0_0
- deap=1.3.1=py37h9fdb41a_2
- decorator=5.1.0=pyhd8ed1ab_0
- fontconfig=2.13.1=h7e3eb15_1002
- freetype=2.10.4=h7ca028e_0
- future=0.18.2=py37h89c1867_4
- gettext=0.19.8.1=hf34092f_1004
- giflib=5.2.1=h7b6447c_0
- glib=2.58.3=py37he00f558_1004
- gmp=6.2.1=h58526e2_0
- hyperopt=0.2.5=pyh9f0ad1d_0
- icu=67.1=he1b5a44_0
- idna=3.3=pyhd3eb1b0_0
- igraph=0.8.3=hef4adab_1
- imbalanced-learn=0.8.1=pyhd8ed1ab_0
- ipython=7.31.1=py37h06a4308_1
- jedi=0.18.1=py37h06a4308_1
- joblib=1.1.0=pyhd8ed1ab_0
- jpeg=9e=h7f8727e_0
- kiwisolver=1.4.2=py37h295c915_0
- lcms2=2.12=h3be6417_0
- libblas=3.8.0=14_openblas
- libcblas=3.8.0=14_openblas
- libedit=3.1.20191231=he28a2e2_2
Expand All @@ -44,50 +52,77 @@ dependencies:
- libiconv=1.16=h516909a_0
- liblapack=3.8.0=14_openblas
- liblapacke=3.8.0=14_openblas
- libllvm11=11.1.0=h3826bc1_1
- libopenblas=0.3.7=h5ec1e0e_6
- libpng=1.6.37=h21135ba_2
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.2.0=h85742a9_0
- libuuid=2.32.1=h14c3975_1000
- libwebp=1.2.2=h55f646e_0
- libwebp-base=1.2.2=h7f8727e_0
- libxcb=1.13=h14c3975_1002
- libxgboost=1.5.0=h295c915_1
- libxml2=2.9.10=h68273f3_2
- libzlib=1.2.11=h36c2ea0_1013
- lightgbm=3.2.1=py37h295c915_0
- llvmlite=0.38.0=py37h4ff587b_0
- lz4-c=1.9.3=h295c915_1
- matplotlib=3.2.2=1
- matplotlib-base=3.2.2=py37h1d35a4c_1
- matplotlib-inline=0.1.6=py37h06a4308_0
- ncurses=6.2=h58526e2_4
- networkx=2.6.3=pyhd3eb1b0_0
- numba=0.55.1=py37h51133e4_0
- numpy=1.19.2=py37h7008fea_1
- openssl=1.1.1o=h7f8727e_0
- openssl=1.1.1s=h7f8727e_0
- orderedset=2.0.3=py37h8f50634_3
- pandas=0.25.3=py37hb3f55d8_0
- parso=0.8.3=pyhd3eb1b0_0
- pcre=8.44=he1b5a44_0
- pexpect=4.8.0=pyhd3eb1b0_3
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=9.0.1=py37h22f2fdc_0
- pip=21.0.1=pyhd8ed1ab_0
- pixman=0.38.0=h516909a_1003
- prompt-toolkit=3.0.20=pyhd3eb1b0_0
- pthread-stubs=0.4=h36c2ea0_1001
- ptyprocess=0.7.0=pyhd3eb1b0_2
- py-xgboost=1.5.0=py37h06a4308_1
- pycairo=1.20.0=py37h01af8b0_1
- pycparser=2.21=pyhd8ed1ab_0
- pygments=2.11.2=pyhd3eb1b0_0
- pymongo=3.12.0=py37h295c915_0
- pyopenssl=21.0.0=pyhd8ed1ab_0
- pyparsing=3.0.9=py37h06a4308_0
- pysocks=1.7.1=py37h89c1867_4
- python=3.7.3=h5b0a415_0
- python-dateutil=2.8.0=py_0
- python-igraph=0.8.3=py37h340e831_2
- python_abi=3.7=2_cp37m
- pytz=2021.1=pyhd8ed1ab_0
- readline=7.0=hf8c457e_1001
- reportlab=3.5.67=py37hfdd840d_1
- requests=2.26.0=pyhd8ed1ab_1
- scikit-learn=1.0.1=py37h51133e4_0
- scipy=1.6.1=py37hf56f3a7_0
- setuptools=52.0.0=py37h06a4308_1
- shap=0.39.0=py37h51133e4_0
- six=1.15.0=pyh9f0ad1d_0
- slicer=0.0.7=pyhd3eb1b0_0
- sqlite=3.33.0=h62c20be_0
- stopit=1.1.2=py_0
- tbb=2021.5.0=hd09550d_0
- texttable=1.6.3=pyh9f0ad1d_0
- threadpoolctl=3.0.0=pyh8a188c0_0
- tk=8.6.10=h21135ba_1
- tornado=6.1=py37h27cfd23_0
- tpot=0.11.7=pyhd8ed1ab_1
- tqdm=4.62.3=pyhd8ed1ab_0
- traitlets=5.1.1=pyhd3eb1b0_0
- typing_extensions=4.3.0=py37h06a4308_0
- update_checker=0.18.0=pyh9f0ad1d_0
- urllib3=1.26.7=pyhd8ed1ab_0
- wcwidth=0.2.5=pyhd3eb1b0_0
- wheel=0.36.2=pyhd3deb0d_0
- xorg-kbproto=1.0.7=h14c3975_1002
- xorg-libice=1.0.10=h516909a_0
Expand All @@ -102,4 +137,4 @@ dependencies:
- xorg-xproto=7.0.31=h14c3975_1007
- xz=5.2.5=h516909a_1
- zlib=1.2.11=h36c2ea0_1013
prefix: /home/robson/miniconda3/envs/bioautoml
- zstd=1.4.9=haebb681_0
Loading