6. Interpretation

The purpose of this notebook is to apply various post-hoc interpretation methods on our model. For this purose, we will rebuild our DecisionTree model, train it. After this we will apply SHAP, PDP and ALE on the trained DecisionTree model.

import numpy as np
import pandas as pd

#np.bool = np.bool_

import shap

import matplotlib as mpl
import matplotlib.pyplot as plt

import matplotlib.colors as mcolors

from easy_mpl import pie
from easy_mpl import bar_chart
from easy_mpl.utils import create_subplots

from shap.plots import waterfall
from shap import summary_plot, Explanation

from ai4water import Model
from ai4water.utils.utils import TrainTestSplit
from ai4water.postprocessing import PartialDependencePlot

from utils import LABEL_MAP
from utils import version_info
from utils import shap_scatter
from utils import make_classes
from utils import shap_scatter_plots
from utils import prepare_data, set_rcParams, plot_ale, SAVE
for lib, ver in version_info().items():
    print(lib, ver)
python 3.12.10 (main, May  6 2025, 10:49:23) [GCC 11.4.0]
os posix
ai4water 1.07
lightgbm 4.6.0
catboost 1.2.10
xgboost 3.2.0
easy_mpl 0.21.5
SeqMetrics 2.0.0
numpy 1.26.4
pandas 2.2.3
matplotlib 3.10.8
h5py 3.16.0
sklearn 1.3.1
optuna 4.8.0
skopt 0.10.2
plotly 6.6.0
seaborn 0.13.2
crepes 0.9.0
mapie 0.9.2
shap 0.49.1
scipy 1.17.1
set_rcParams()
inputs = ['Solution pH', 'Time (m)', 'Anions', 'Ni (At%)', 'HA (mg/L)',
          'loading (g)', 'Pore size (nm)', 'O (At%)',
          'Light intensity (watt)', 'Mo (At%)', 'Dye concentration (mg/L)']
data, encoders = prepare_data(inputs=inputs,
                              outputs="k")

print(data.shape)
(1527, 12)
input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

TrainX, TestX, TrainY, TestY = TrainTestSplit(seed=313).split_by_random(
    data[input_features],
    data[output_features]
)

print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape)

model = Model(
    model = "DecisionTreeRegressor",
    input_features=input_features,
    output_features=output_features,
    verbosity=-1,
)

model.fit(TrainX, TrainY.values)
(1068, 11) (1068, 1) (459, 11) (459, 1)
DecisionTreeRegressor(random_state=313)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


train_p = model.predict(TrainX, process_results=False)
test_p = model.predict(TestX, process_results=False)

Average prediction on training data

print(train_p.mean())
0.006940225438833623

default feature importance from decision tree

print(model._model.feature_importances_)
[7.40882935e-02 1.94588607e-01 9.18145954e-02 3.19367156e-01
 3.43836446e-02 9.41130328e-02 5.70019588e-03 1.71579275e-02
 3.37566239e-02 2.51049444e-04 1.34778874e-01]
bar_chart(model._model.feature_importances_,
          [LABEL_MAP[n] if n in LABEL_MAP else n for n in model.input_features],
          sort=True,
          show=False)
plt.tight_layout()
plt.show()
Interpretation

SHAP

exp = shap.TreeExplainer(model=model._model,
                         data=TrainX,
                         feature_names=input_features)

print(exp.expected_value)
0.0066904566801259955
shap_values = exp.shap_values(TrainX, TrainY)

summary_plot(shap_values,
    TrainX,
    max_display=34,
    feature_names=[LABEL_MAP[n] if n in LABEL_MAP else n for n in input_features],
    show=False)
if SAVE:
    plt.savefig("results/figures/shap_summary.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation
sv_bar = np.mean(np.abs(shap_values), axis=0)

classes, colors, colors_ = make_classes(exp)

df_with_classes = pd.DataFrame({'features': exp.feature_names,
                   'classes': classes,
                   'mean_shap': sv_bar})

print(df_with_classes)
                    features                     classes  mean_shap
0                Solution pH     Experimental Conditions   0.001051
1                   Time (m)     Experimental Conditions   0.001239
2                     Anions     Experimental Conditions   0.000390
3                   Ni (At%)          Atomic Composition   0.003584
4                  HA (mg/L)     Experimental Conditions   0.000281
5                loading (g)     Experimental Conditions   0.001199
6             Pore size (nm)  Physicochemical Properties   0.000322
7                    O (At%)          Atomic Composition   0.000349
8     Light intensity (watt)     Experimental Conditions   0.000395
9                   Mo (At%)          Atomic Composition   0.000014
10  Dye concentration (mg/L)     Experimental Conditions   0.001875
f, ax = plt.subplots(figsize=(7,9))
ax = bar_chart(
    sv_bar,
    [LABEL_MAP[n] if n in LABEL_MAP else n for n in exp.feature_names],
    bar_labels=np.round(sv_bar, 4),
    bar_label_kws={'label_type':'edge',
                   'fontsize': 10,
                   'weight': 'bold',
                   "fmt": '%.4f',
                   'padding': 1.5
                   },
    show=False,
    sort=True,
    color=colors_,
    ax = ax
)
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel(xlabel='mean(|SHAP value|)')
ax.set_xticklabels(ax.get_xticks().astype(float))
ax.set_yticklabels(ax.get_yticklabels())

labels = df_with_classes['classes'].unique()
handles = [plt.Rectangle((0,0),1,1,
                         color=colors[l]) for l in labels]
plt.legend(handles, labels, loc='lower right')
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
if SAVE:
    plt.savefig("results/figures/shap_bar.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation
seg_colors = (colors.values())
# Change the saturation of seg_colors to 70% for the interior segments
rgb = mcolors.to_rgba_array(seg_colors)[:,:-1]
hsv = mcolors.rgb_to_hsv(rgb)
hsv[:,1] = 0.7 * hsv[:, 1]
interior_colors = mcolors.hsv_to_rgb(hsv)

fractions = np.array([
df_with_classes.loc[df_with_classes['classes']=='Experimental Conditions']['mean_shap'].sum(),
df_with_classes.loc[df_with_classes['classes']=='Physicochemical Properties']['mean_shap'].sum(),
df_with_classes.loc[df_with_classes['classes']=='Atomic Composition']['mean_shap'].sum(),
])

dye_frac = df_with_classes.loc[df_with_classes['classes']=='Dye Properties']['mean_shap'].sum()
labels = ['Experimental \nConditions', 'Physicochemical \nProperties',
               'Atomic \nComposition']

if dye_frac > 0.0:
    fractions = np.array(fractions.tolist().append(dye_frac))
    labels.append('Dye Properties')

fractions /=fractions.sum()

_, texts= pie(fractions=fractions,
    colors=seg_colors,
    labels=labels,
    wedgeprops=dict(edgecolor="w", width=0.03), radius=1,
    autopct=None,
    textprops = dict(fontsize=12),
    startangle=90, counterclock=False, show=False)
texts[0].set_fontsize(12)
_, texts, autotexts = pie(fractions=fractions,
    colors=interior_colors,
       autopct='%1.0f%%',
    textprops = dict(fontsize=24),
    wedgeprops=dict(edgecolor="w"), radius=1-2*0.03,
    startangle=90, counterclock=False, ax=plt.gca(), show=False)
texts[0].set_fontsize(12)
if SAVE:
    plt.savefig("results/figures/shap_pie.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation
index = train_p.argmax()
print(index, train_p.max())
355 0.036809739452414975
e = Explanation(
    shap_values[index],
    base_values=exp.expected_value,
    data=TrainX.values[index],
    feature_names=input_features
)

waterfall(e, max_display=20, show=False)
if SAVE:
    plt.savefig(f"results/figures/shap_local_{index}.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation

The following figures show SHAP interaction plots. These figures depict the inteaction effect of two features on model performance. In these figures, the numbers in legends for Anions, have following meanings

encoders['Anions'].inverse_transform(np.array([0,1,2,3,4, 5, 5]).reshape(-1,1))
array(['N/A', 'Na2HPO4', 'Na2SO4', 'NaCO3', 'NaCl', 'NaHCO3', 'NaHCO3'],
      dtype=object)

Similarly for catalyst, the numbers in legend have following meanings Pt-BFO : 6 Pd-BFO: 4 LM : 2 Ag-BFO : 0 Photolysis : 5 LTH : 3 BFO : 1

Dye Concentration

It represents initial concentration of dye.

feature_name = 'Dye concentration (mg/L)'

if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Ni (At%)

feature_name = 'Ni (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

loading

It represents how much photocatalyst is present.

feature_name = 'loading (g)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Time

feature_name = 'Time (m)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Solution pH

feature_name = 'Solution pH'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Light intensiy

feature_name =  'Light intensity (watt)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Oxygen

feature_name = 'O (At%)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Humic Acid

feature_name = 'HA (mg/L)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Pore size

feature_name = 'Pore size (nm)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation

Anions

feature_name = 'Anions'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation
feature_name = 'Mass ratio (Catalyst/Dye)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

S

feature_name = 'S (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Surface Area

feature_name = 'Surface area (m2/g)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Mo

feature_name =  'Mo (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)
Interpretation
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(
    2,4, figsize=(15, 8))

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Ni (At%)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax1,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(ax.get_xlim()[0], 0.32)

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Pore size (nm)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax2,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Solution pH'],
    feature_name='Cat. Loading (g/L)',
    ax=ax3,
    show=False,
)
ax.set_ylabel('')
#ax.set_xlim(ax.get_xlim()[0], 62)


ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Mo (At%)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax4,
    show=False,
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Pore size (nm)'],
    feature_name='Ni (At%)',
    ax=ax5,
    show=False
)
ax.set_ylabel('')

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Solution pH'],
    feature_name='Ni (At%)',
    ax=ax6,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Mo (At%)'],
    feature_name='Ni (At%)',
    ax=ax7,
    show=False
)
ax.set_ylabel('')

ax = shap_scatter(
    shap_values[:, 0],
    TrainX.loc[:, 'Solution pH'],
    TrainX.loc[:, 'O (At%)'],
    feature_name='Solution pH',
    ax=ax8,
    show=False
)
ax.set_ylabel('')
plt.tight_layout()
if SAVE:
    plt.savefig("results/figures/shap_dep.png", dpi=600, bbox_inches="tight")
plt.show()
Interpretation

Partial Dependence Plot

pdp = PartialDependencePlot(
    model.predict,
    TrainX,
    num_points=20,
    feature_names=TrainX.columns.tolist(),
    show=False,
    save=False
)
mpl.rcParams.update(mpl.rcParamsDefault)
colors = ["#DB0007", "#670E36", "#e30613", "#0057B8", "#6C1D45",
          "#034694", "#1B458F", "#003399", "#FFCD00", "#003090",
          "#C8102E", "#6CABDD", "#DA291C", "#241F20", "#00A650",
          "#D71920", "#132257", "#ED2127", "#7A263A", "#FDB913",
          "#DB0007", "#670E36", "#e30613", "#0057B8", "#6C1D45",
          "#034694", "#1B458F", "#003399", "#FFCD00", "#003090",
          ]

f, axes = create_subplots(TrainX.shape[1], figsize=(10, 12))

for ax, feature, clr in zip(axes.flat, TrainX.columns, colors):

    pdp_vals, ice_vals = pdp.calc_pdp_1dim(TrainX.values, feature)

    ax = pdp.plot_pdp_1dim(pdp_vals, ice_vals, TrainX.values,
                            feature,
                            pdp_line_kws={
                                'color': clr, 'zorder': 3},
                            ice_color="gray",
                            ice_lines_kws=dict(zorder=2, alpha=0.15),
                            ax=ax,
                            show=False,
                            )
    ax.set_xlabel(LABEL_MAP.get(feature, feature))
    ax.set_ylabel(f"E[f(x) | " + feature + "]")
if SAVE:
    plt.savefig("results/figures/pdp.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation

Accumulated Local Effects

class MyModel:
    def predict(self, X):
        return model.predict(X).reshape(-1,)

f, axes = create_subplots(TrainX.shape[1], figsize=(10, 12))

for ax, feature, clr in zip(axes.flat, TrainX.columns, colors):
    plot_ale(MyModel().predict, TrainX, feature,
             ax=ax, show=False, color=clr, )

plt.tight_layout()
plt.show()
Interpretation

All Features model interpretation

For the sake of comparison, we also show interpretation of model which uses all features as input.

set_rcParams()

data, encoders = prepare_data(outputs="k")

print(data.shape)
(1527, 35)
input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

TrainX, TestX, TrainY, TestY = TrainTestSplit(seed=313).split_by_random(
    data[input_features],
    data[output_features]
)

print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape)


model = Model(
    model = "DecisionTreeRegressor",
    input_features=input_features,
    output_features=output_features,
    verbosity=-1,
)

model.fit(TrainX, TrainY.values)
(1068, 34) (1068, 1) (459, 34) (459, 1)
DecisionTreeRegressor(random_state=313)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


train_p = model.predict(TrainX, process_results=False)
test_p = model.predict(TestX, process_results=False)

Average prediction on training data

print(train_p.mean())
0.0069402254388336235

default feature importance from decision tree

print(model._model.feature_importances_)
[3.81006042e-05 1.73386029e-06 1.19712970e-02 0.00000000e+00
 7.23370042e-03 1.99232034e-05 3.97314739e-04 2.75412342e-01
 1.25751993e-03 4.10362230e-02 9.97703911e-04 0.00000000e+00
 2.34364582e-05 2.25969210e-04 1.03530578e-02 2.14975624e-03
 5.75046118e-04 0.00000000e+00 8.39858878e-02 3.36541611e-02
 2.61657725e-03 1.93957659e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.34414955e-01 7.38624495e-02
 3.42792786e-02 9.15359069e-02]
fig, ax = plt.subplots(figsize=(6, 8))
bar_chart(model._model.feature_importances_,
          [LABEL_MAP[n] if n in LABEL_MAP else n for n in model.input_features],
          sort=True,
          show=False,
          ax=ax)
plt.tight_layout()
plt.show()
Interpretation

SHAP all features

exp = shap.TreeExplainer(model=model._model,
                         data=TrainX,
                         feature_names=input_features)

print(exp.expected_value)
0.00671328508398034
shap_values = exp.shap_values(TrainX, TrainY)

summary_plot(shap_values,
    TrainX,
             max_display=34,
feature_names=[LABEL_MAP[n] if n in LABEL_MAP else n for n in input_features],
    show=False)
if SAVE:
    plt.savefig("results/figures/shap_summary_all.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
Interpretation
# sv_bar = np.mean(np.abs(shap_values), axis=0)

# classes, colors, colors_ = make_classes(exp)

# df_with_classes = pd.DataFrame({'features': exp.feature_names,
#                    'classes': classes,
#                    'mean_shap': sv_bar})

# print(df_with_classes)

# # %%
# f, ax = plt.subplots(figsize=(7,9))
# ax = bar_chart(
#     sv_bar,
#     [LABEL_MAP[n] if n in LABEL_MAP else n for n in exp.feature_names],
#     bar_labels=np.round(sv_bar, 4),
#     bar_label_kws={'label_type':'edge',
#                    'fontsize': 10,
#                    'weight': 'bold',
#                    "fmt": '%.4f',
#                    'padding': 1.5
#                    },
#     show=False,
#     sort=True,
#     color=colors_,
#     ax = ax
# )
# ax.spines[['top', 'right']].set_visible(False)
# ax.set_xlabel(xlabel='mean(|SHAP value|)')
# ax.set_xticklabels(ax.get_xticks().astype(float))
# ax.set_yticklabels(ax.get_yticklabels())

# labels = df_with_classes['classes'].unique()
# handles = [plt.Rectangle((0,0),1,1,
#                          color=colors[l]) for l in labels]
# plt.legend(handles, labels, loc='lower right')
# ax.xaxis.set_major_locator(plt.MaxNLocator(4))
# if SAVE:
#     plt.savefig("results/figures/shap_bar_all.png", dpi=600, bbox_inches="tight")
# plt.tight_layout()
# plt.show()

# # %%

# seg_colors = (colors.values())
# # Change the saturation of seg_colors to 70% for the interior segments
# rgb = mcolors.to_rgba_array(seg_colors)[:,:-1]
# hsv = mcolors.rgb_to_hsv(rgb)
# hsv[:,1] = 0.7 * hsv[:, 1]
# interior_colors = mcolors.hsv_to_rgb(hsv)

# fractions = np.array([
# df_with_classes.loc[df_with_classes['classes']=='Experimental Conditions']['mean_shap'].sum(),
# df_with_classes.loc[df_with_classes['classes']=='Physicochemical Properties']['mean_shap'].sum(),
# df_with_classes.loc[df_with_classes['classes']=='Atomic Composition']['mean_shap'].sum(),
# ])

# dye_frac = df_with_classes.loc[df_with_classes['classes']=='Dye Properties']['mean_shap'].sum()
# labels = ['Experimental \nConditions', 'Physicochemical \nProperties',
#                'Atomic \nComposition']

# if dye_frac > 0.0:
#     fractions = np.array(fractions.tolist().append(dye_frac))
#     labels.append('Dye Properties')

# fractions /=fractions.sum()

# _, texts= pie(fractions=fractions,
#     colors=seg_colors,
#     labels=labels,
#     wedgeprops=dict(edgecolor="w", width=0.03), radius=1,
#     autopct=None,
#     textprops = dict(fontsize=12),
#     startangle=90, counterclock=False, show=False)
# texts[0].set_fontsize(12)
# _, texts, autotexts = pie(fractions=fractions,
#     colors=interior_colors,
#        autopct='%1.0f%%',
#     textprops = dict(fontsize=24),
#     wedgeprops=dict(edgecolor="w"), radius=1-2*0.03,
#     startangle=90, counterclock=False, ax=plt.gca(), show=False)
# texts[0].set_fontsize(12)
# if SAVE:
#     plt.savefig("results/figures/shap_pie_all.png", dpi=600, bbox_inches="tight")
# plt.tight_layout()
# plt.show()

Total running time of the script: (0 minutes 56.549 seconds)

Gallery generated by Sphinx-Gallery