Note

Go to the end to download the full example code.

6. Interpretation

The purpose of this notebook is to apply various post-hoc interpretation methods on our model. For this purose, we will rebuild our DecisionTree model, train it. After this we will apply SHAP, PDP and ALE on the trained DecisionTree model.

import numpy as np
import pandas as pd

#np.bool = np.bool_

import shap

import matplotlib as mpl
import matplotlib.pyplot as plt

import matplotlib.colors as mcolors

from easy_mpl import pie
from easy_mpl import bar_chart
from easy_mpl.utils import create_subplots

from shap.plots import waterfall
from shap import summary_plot, Explanation

from ai4water import Model
from ai4water.utils.utils import TrainTestSplit
from ai4water.postprocessing import PartialDependencePlot

from utils import LABEL_MAP
from utils import version_info
from utils import shap_scatter
from utils import make_classes
from utils import shap_scatter_plots
from utils import prepare_data, set_rcParams, plot_ale, SAVE

for lib, ver in version_info().items():
    print(lib, ver)

python 3.12.10 (main, May  6 2025, 10:49:23) [GCC 11.4.0]
os posix
ai4water 1.07
lightgbm 4.6.0
catboost 1.2.10
xgboost 3.2.0
easy_mpl 0.21.5
SeqMetrics 2.0.0
numpy 1.26.4
pandas 2.2.3
matplotlib 3.10.8
h5py 3.16.0
sklearn 1.3.1
optuna 4.8.0
skopt 0.10.2
plotly 6.6.0
seaborn 0.13.2
crepes 0.9.0
mapie 0.9.2
shap 0.49.1
scipy 1.17.1

set_rcParams()

inputs = ['Solution pH', 'Time (m)', 'Anions', 'Ni (At%)', 'HA (mg/L)',
          'loading (g)', 'Pore size (nm)', 'O (At%)',
          'Light intensity (watt)', 'Mo (At%)', 'Dye concentration (mg/L)']
data, encoders = prepare_data(inputs=inputs,
                              outputs="k")

print(data.shape)

(1527, 12)

input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

TrainX, TestX, TrainY, TestY = TrainTestSplit(seed=313).split_by_random(
    data[input_features],
    data[output_features]
)

print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape)

model = Model(
    model = "DecisionTreeRegressor",
    input_features=input_features,
    output_features=output_features,
    verbosity=-1,
)

model.fit(TrainX, TrainY.values)

(1068, 11) (1068, 1) (459, 11) (459, 1)

DecisionTreeRegressor(random_state=313)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

train_p = model.predict(TrainX, process_results=False)

test_p = model.predict(TestX, process_results=False)

Average prediction on training data

print(train_p.mean())

0.006940225438833623

default feature importance from decision tree

print(model._model.feature_importances_)

[7.40882935e-02 1.94588607e-01 9.18145954e-02 3.19367156e-01
 3.43836446e-02 9.41130328e-02 5.70019588e-03 1.71579275e-02
 3.37566239e-02 2.51049444e-04 1.34778874e-01]

bar_chart(model._model.feature_importances_,
          [LABEL_MAP[n] if n in LABEL_MAP else n for n in model.input_features],
          sort=True,
          show=False)
plt.tight_layout()
plt.show()

SHAP

exp = shap.TreeExplainer(model=model._model,
                         data=TrainX,
                         feature_names=input_features)

print(exp.expected_value)

0.0066904566801259955

shap_values = exp.shap_values(TrainX, TrainY)

summary_plot(shap_values,
    TrainX,
    max_display=34,
    feature_names=[LABEL_MAP[n] if n in LABEL_MAP else n for n in input_features],
    show=False)
if SAVE:
    plt.savefig("results/figures/shap_summary.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

sv_bar = np.mean(np.abs(shap_values), axis=0)

classes, colors, colors_ = make_classes(exp)

df_with_classes = pd.DataFrame({'features': exp.feature_names,
                   'classes': classes,
                   'mean_shap': sv_bar})

print(df_with_classes)

                    features                     classes  mean_shap
              Solution pH     Experimental Conditions   0.001051
                 Time (m)     Experimental Conditions   0.001239
                   Anions     Experimental Conditions   0.000390
                 Ni (At%)          Atomic Composition   0.003584
                HA (mg/L)     Experimental Conditions   0.000281
              loading (g)     Experimental Conditions   0.001199
           Pore size (nm)  Physicochemical Properties   0.000322
                  O (At%)          Atomic Composition   0.000349
   Light intensity (watt)     Experimental Conditions   0.000395
                 Mo (At%)          Atomic Composition   0.000014
Dye concentration (mg/L)     Experimental Conditions   0.001875

f, ax = plt.subplots(figsize=(7,9))
ax = bar_chart(
    sv_bar,
    [LABEL_MAP[n] if n in LABEL_MAP else n for n in exp.feature_names],
    bar_labels=np.round(sv_bar, 4),
    bar_label_kws={'label_type':'edge',
                   'fontsize': 10,
                   'weight': 'bold',
                   "fmt": '%.4f',
                   'padding': 1.5
                   },
    show=False,
    sort=True,
    color=colors_,
    ax = ax
)
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel(xlabel='mean(|SHAP value|)')
ax.set_xticklabels(ax.get_xticks().astype(float))
ax.set_yticklabels(ax.get_yticklabels())

labels = df_with_classes['classes'].unique()
handles = [plt.Rectangle((0,0),1,1,
                         color=colors[l]) for l in labels]
plt.legend(handles, labels, loc='lower right')
ax.xaxis.set_major_locator(plt.MaxNLocator(4))
if SAVE:
    plt.savefig("results/figures/shap_bar.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

seg_colors = (colors.values())
# Change the saturation of seg_colors to 70% for the interior segments
rgb = mcolors.to_rgba_array(seg_colors)[:,:-1]
hsv = mcolors.rgb_to_hsv(rgb)
hsv[:,1] = 0.7 * hsv[:, 1]
interior_colors = mcolors.hsv_to_rgb(hsv)

fractions = np.array([
df_with_classes.loc[df_with_classes['classes']=='Experimental Conditions']['mean_shap'].sum(),
df_with_classes.loc[df_with_classes['classes']=='Physicochemical Properties']['mean_shap'].sum(),
df_with_classes.loc[df_with_classes['classes']=='Atomic Composition']['mean_shap'].sum(),
])

dye_frac = df_with_classes.loc[df_with_classes['classes']=='Dye Properties']['mean_shap'].sum()
labels = ['Experimental \nConditions', 'Physicochemical \nProperties',
               'Atomic \nComposition']

if dye_frac > 0.0:
    fractions = np.array(fractions.tolist().append(dye_frac))
    labels.append('Dye Properties')

fractions /=fractions.sum()

_, texts= pie(fractions=fractions,
    colors=seg_colors,
    labels=labels,
    wedgeprops=dict(edgecolor="w", width=0.03), radius=1,
    autopct=None,
    textprops = dict(fontsize=12),
    startangle=90, counterclock=False, show=False)
texts[0].set_fontsize(12)
_, texts, autotexts = pie(fractions=fractions,
    colors=interior_colors,
       autopct='%1.0f%%',
    textprops = dict(fontsize=24),
    wedgeprops=dict(edgecolor="w"), radius=1-2*0.03,
    startangle=90, counterclock=False, ax=plt.gca(), show=False)
texts[0].set_fontsize(12)
if SAVE:
    plt.savefig("results/figures/shap_pie.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

index = train_p.argmax()
print(index, train_p.max())

355 0.036809739452414975

e = Explanation(
    shap_values[index],
    base_values=exp.expected_value,
    data=TrainX.values[index],
    feature_names=input_features
)

waterfall(e, max_display=20, show=False)
if SAVE:
    plt.savefig(f"results/figures/shap_local_{index}.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

The following figures show SHAP interaction plots. These figures depict the inteaction effect of two features on model performance. In these figures, the numbers in legends for Anions, have following meanings

encoders['Anions'].inverse_transform(np.array([0,1,2,3,4, 5, 5]).reshape(-1,1))

array(['N/A', 'Na2HPO4', 'Na2SO4', 'NaCO3', 'NaCl', 'NaHCO3', 'NaHCO3'],
      dtype=object)

Similarly for catalyst, the numbers in legend have following meanings Pt-BFO : 6 Pd-BFO: 4 LM : 2 Ag-BFO : 0 Photolysis : 5 LTH : 3 BFO : 1

Dye Concentration

It represents initial concentration of dye.

feature_name = 'Dye concentration (mg/L)'

if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Ni (At%)

feature_name = 'Ni (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

loading

It represents how much photocatalyst is present.

feature_name = 'loading (g)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Time

feature_name = 'Time (m)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Solution pH

feature_name = 'Solution pH'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Light intensiy

feature_name =  'Light intensity (watt)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Oxygen

feature_name = 'O (At%)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Humic Acid

feature_name = 'HA (mg/L)'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Pore size

feature_name = 'Pore size (nm)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Anions

feature_name = 'Anions'
shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

feature_name = 'Mass ratio (Catalyst/Dye)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

S

feature_name = 'S (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Surface Area

feature_name = 'Surface area (m2/g)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

Mo

feature_name =  'Mo (At%)'
if feature_name in TrainX:
    shap_scatter_plots(shap_values, TrainX, feature_name,
                   encoders=encoders,
                   save=SAVE)

fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(
    2,4, figsize=(15, 8))

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Ni (At%)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax1,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(ax.get_xlim()[0], 0.32)

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Pore size (nm)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax2,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Solution pH'],
    feature_name='Cat. Loading (g/L)',
    ax=ax3,
    show=False,
)
ax.set_ylabel('')
#ax.set_xlim(ax.get_xlim()[0], 62)


ax = shap_scatter(
    shap_values[:, 5],
    TrainX.loc[:, 'loading (g)'],
    TrainX.loc[:, 'Mo (At%)'],
    feature_name='Cat. Loading (g/L)',
    ax=ax4,
    show=False,
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Pore size (nm)'],
    feature_name='Ni (At%)',
    ax=ax5,
    show=False
)
ax.set_ylabel('')

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Solution pH'],
    feature_name='Ni (At%)',
    ax=ax6,
    show=False
)
ax.set_ylabel('')
#ax.set_xlim(2.5, 12.5)

ax = shap_scatter(
    shap_values[:, 3],
    TrainX.loc[:, 'Ni (At%)'],
    TrainX.loc[:, 'Mo (At%)'],
    feature_name='Ni (At%)',
    ax=ax7,
    show=False
)
ax.set_ylabel('')

ax = shap_scatter(
    shap_values[:, 0],
    TrainX.loc[:, 'Solution pH'],
    TrainX.loc[:, 'O (At%)'],
    feature_name='Solution pH',
    ax=ax8,
    show=False
)
ax.set_ylabel('')
plt.tight_layout()
if SAVE:
    plt.savefig("results/figures/shap_dep.png", dpi=600, bbox_inches="tight")
plt.show()

Partial Dependence Plot

pdp = PartialDependencePlot(
    model.predict,
    TrainX,
    num_points=20,
    feature_names=TrainX.columns.tolist(),
    show=False,
    save=False
)

mpl.rcParams.update(mpl.rcParamsDefault)
colors = ["#DB0007", "#670E36", "#e30613", "#0057B8", "#6C1D45",
          "#034694", "#1B458F", "#003399", "#FFCD00", "#003090",
          "#C8102E", "#6CABDD", "#DA291C", "#241F20", "#00A650",
          "#D71920", "#132257", "#ED2127", "#7A263A", "#FDB913",
          "#DB0007", "#670E36", "#e30613", "#0057B8", "#6C1D45",
          "#034694", "#1B458F", "#003399", "#FFCD00", "#003090",
          ]

f, axes = create_subplots(TrainX.shape[1], figsize=(10, 12))

for ax, feature, clr in zip(axes.flat, TrainX.columns, colors):

    pdp_vals, ice_vals = pdp.calc_pdp_1dim(TrainX.values, feature)

    ax = pdp.plot_pdp_1dim(pdp_vals, ice_vals, TrainX.values,
                            feature,
                            pdp_line_kws={
                                'color': clr, 'zorder': 3},
                            ice_color="gray",
                            ice_lines_kws=dict(zorder=2, alpha=0.15),
                            ax=ax,
                            show=False,
                            )
    ax.set_xlabel(LABEL_MAP.get(feature, feature))
    ax.set_ylabel(f"E[f(x) | " + feature + "]")
if SAVE:
    plt.savefig("results/figures/pdp.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

Accumulated Local Effects

class MyModel:
    def predict(self, X):
        return model.predict(X).reshape(-1,)

f, axes = create_subplots(TrainX.shape[1], figsize=(10, 12))

for ax, feature, clr in zip(axes.flat, TrainX.columns, colors):
    plot_ale(MyModel().predict, TrainX, feature,
             ax=ax, show=False, color=clr, )

plt.tight_layout()
plt.show()

All Features model interpretation

For the sake of comparison, we also show interpretation of model which uses all features as input.

set_rcParams()

data, encoders = prepare_data(outputs="k")

print(data.shape)

(1527, 35)

input_features = data.columns.tolist()[0:-1]
output_features = data.columns.tolist()[-1:]

TrainX, TestX, TrainY, TestY = TrainTestSplit(seed=313).split_by_random(
    data[input_features],
    data[output_features]
)

print(TrainX.shape, TrainY.shape, TestX.shape, TestY.shape)


model = Model(
    model = "DecisionTreeRegressor",
    input_features=input_features,
    output_features=output_features,
    verbosity=-1,
)

model.fit(TrainX, TrainY.values)

(1068, 34) (1068, 1) (459, 34) (459, 1)

DecisionTreeRegressor(random_state=313)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

train_p = model.predict(TrainX, process_results=False)

test_p = model.predict(TestX, process_results=False)

Average prediction on training data

print(train_p.mean())

0.0069402254388336235

default feature importance from decision tree

print(model._model.feature_importances_)

[3.81006042e-05 1.73386029e-06 1.19712970e-02 0.00000000e+00
23370042e-03 1.99232034e-05 3.97314739e-04 2.75412342e-01
25751993e-03 4.10362230e-02 9.97703911e-04 0.00000000e+00
34364582e-05 2.25969210e-04 1.03530578e-02 2.14975624e-03
75046118e-04 0.00000000e+00 8.39858878e-02 3.36541611e-02
61657725e-03 1.93957659e-01 0.00000000e+00 0.00000000e+00
00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
00000000e+00 0.00000000e+00 1.34414955e-01 7.38624495e-02
42792786e-02 9.15359069e-02]

fig, ax = plt.subplots(figsize=(6, 8))
bar_chart(model._model.feature_importances_,
          [LABEL_MAP[n] if n in LABEL_MAP else n for n in model.input_features],
          sort=True,
          show=False,
          ax=ax)
plt.tight_layout()
plt.show()

SHAP all features

exp = shap.TreeExplainer(model=model._model,
                         data=TrainX,
                         feature_names=input_features)

print(exp.expected_value)

0.00671328508398034

shap_values = exp.shap_values(TrainX, TrainY)

summary_plot(shap_values,
    TrainX,
             max_display=34,
feature_names=[LABEL_MAP[n] if n in LABEL_MAP else n for n in input_features],
    show=False)
if SAVE:
    plt.savefig("results/figures/shap_summary_all.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()

# sv_bar = np.mean(np.abs(shap_values), axis=0)

# classes, colors, colors_ = make_classes(exp)

# df_with_classes = pd.DataFrame({'features': exp.feature_names,
#                    'classes': classes,
#                    'mean_shap': sv_bar})

# print(df_with_classes)

# # %%
# f, ax = plt.subplots(figsize=(7,9))
# ax = bar_chart(
#     sv_bar,
#     [LABEL_MAP[n] if n in LABEL_MAP else n for n in exp.feature_names],
#     bar_labels=np.round(sv_bar, 4),
#     bar_label_kws={'label_type':'edge',
#                    'fontsize': 10,
#                    'weight': 'bold',
#                    "fmt": '%.4f',
#                    'padding': 1.5
#                    },
#     show=False,
#     sort=True,
#     color=colors_,
#     ax = ax
# )
# ax.spines[['top', 'right']].set_visible(False)
# ax.set_xlabel(xlabel='mean(|SHAP value|)')
# ax.set_xticklabels(ax.get_xticks().astype(float))
# ax.set_yticklabels(ax.get_yticklabels())

# labels = df_with_classes['classes'].unique()
# handles = [plt.Rectangle((0,0),1,1,
#                          color=colors[l]) for l in labels]
# plt.legend(handles, labels, loc='lower right')
# ax.xaxis.set_major_locator(plt.MaxNLocator(4))
# if SAVE:
#     plt.savefig("results/figures/shap_bar_all.png", dpi=600, bbox_inches="tight")
# plt.tight_layout()
# plt.show()

# # %%

# seg_colors = (colors.values())
# # Change the saturation of seg_colors to 70% for the interior segments
# rgb = mcolors.to_rgba_array(seg_colors)[:,:-1]
# hsv = mcolors.rgb_to_hsv(rgb)
# hsv[:,1] = 0.7 * hsv[:, 1]
# interior_colors = mcolors.hsv_to_rgb(hsv)

# fractions = np.array([
# df_with_classes.loc[df_with_classes['classes']=='Experimental Conditions']['mean_shap'].sum(),
# df_with_classes.loc[df_with_classes['classes']=='Physicochemical Properties']['mean_shap'].sum(),
# df_with_classes.loc[df_with_classes['classes']=='Atomic Composition']['mean_shap'].sum(),
# ])

# dye_frac = df_with_classes.loc[df_with_classes['classes']=='Dye Properties']['mean_shap'].sum()
# labels = ['Experimental \nConditions', 'Physicochemical \nProperties',
#                'Atomic \nComposition']

# if dye_frac > 0.0:
#     fractions = np.array(fractions.tolist().append(dye_frac))
#     labels.append('Dye Properties')

# fractions /=fractions.sum()

# _, texts= pie(fractions=fractions,
#     colors=seg_colors,
#     labels=labels,
#     wedgeprops=dict(edgecolor="w", width=0.03), radius=1,
#     autopct=None,
#     textprops = dict(fontsize=12),
#     startangle=90, counterclock=False, show=False)
# texts[0].set_fontsize(12)
# _, texts, autotexts = pie(fractions=fractions,
#     colors=interior_colors,
#        autopct='%1.0f%%',
#     textprops = dict(fontsize=24),
#     wedgeprops=dict(edgecolor="w"), radius=1-2*0.03,
#     startangle=90, counterclock=False, ax=plt.gca(), show=False)
# texts[0].set_fontsize(12)
# if SAVE:
#     plt.savefig("results/figures/shap_pie_all.png", dpi=600, bbox_inches="tight")
# plt.tight_layout()
# plt.show()

Total running time of the script: (0 minutes 56.549 seconds)

Gallery generated by Sphinx-Gallery