1. Exploratory Data Analysis

In this file we analyze the data using some basic statistics and exploratory plots. The purpose is to get familiarize with the data

synthesis time: hydrothermal reaction time (time taken to prepare the material)

band gap: property of material (how much energy is required to excite one electron from outermost shell)

volume : volume of wastewater

import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.manifold import TSNE

from easy_mpl import boxplot, pie
from easy_mpl.utils import create_subplots, despine_axes, map_array_to_cmap

from utils import CATEGORIES
from utils import set_rcParams
from utils import SAVE, version_info
from utils import read_data, LABEL_MAP, plot_correlation, prepare_data
/home/docs/checkouts/readthedocs.org/user_builds/weil101/envs/latest/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py:15: UserWarning: Since version 1.0, it is not needed to import enable_hist_gradient_boosting anymore. HistGradientBoostingClassifier and HistGradientBoostingRegressor are now stable and can be normally imported from sklearn.ensemble.
  warnings.warn(
/home/docs/checkouts/readthedocs.org/user_builds/weil101/envs/latest/lib/python3.12/site-packages/ai4water/datasets/__init__.py:3: UserWarning: datasets module is deprecated. Please install aqua-fetch using
              pip install aqua-fetch
              and import corresponding dataset from there.
  warnings.warn("""datasets module is deprecated. Please install aqua-fetch using

**********Tensorflow models could not be imported **********
for lib, ver in version_info().items():
    print(lib, ver)
python 3.12.10 (main, May  6 2025, 10:49:23) [GCC 11.4.0]
os posix
ai4water 1.07
lightgbm 4.6.0
catboost 1.2.10
xgboost 3.2.0
easy_mpl 0.21.5
SeqMetrics 2.0.0
numpy 1.26.4
pandas 2.2.3
matplotlib 3.10.8
h5py 3.16.0
sklearn 1.3.1
optuna 4.8.0
skopt 0.10.2
plotly 6.6.0
seaborn 0.13.2
crepes 0.9.0
mapie 0.9.2
shap 0.49.1
scipy 1.17.1
set_rcParams()
data = read_data(outputs=['k', 'Efficiency'])

printing number of rows and number of columns in the data

print(data.shape)
(1527, 36)

printing counts of missing values

data.isna().sum()
Catalyst                             0
Hydrothermal synthesis time (min)    0
Energy Band gap (Eg) eV              0
C (At%)                              0
O (At%)                              0
Fe (At%)                             0
Al (At%)                             0
Ni (At%)                             0
Mo (At%)                             0
S (At%)                              0
Bi                                   0
Ag                                   0
Pd                                   0
Pt                                   0
Surface area (m2/g)                  0
Pore volume (cm3/g)                  0
Pore size (nm)                       0
volume (L)                           0
loading (g)                          0
Light intensity (watt)               0
Light source distance (cm)           0
Time (m)                             0
Dye                                  0
log_Kw                               0
hydrogen_bonding_acceptor_count      0
hydrogen_bonding_donor_count         0
solubility (g/L)                     0
molecular_wt (g/mol)                 0
pka1                                 0
pka2                                 0
Dye concentration (mg/L)             0
Solution pH                          0
HA (mg/L)                            0
Anions                               0
k                                    0
Efficiency                           0
dtype: int64
data.describe()
Hydrothermal synthesis time (min) Energy Band gap (Eg) eV C (At%) O (At%) Fe (At%) Al (At%) Ni (At%) Mo (At%) S (At%) Bi Ag Pd Pt Surface area (m2/g) Pore volume (cm3/g) Pore size (nm) volume (L) loading (g) Light intensity (watt) Light source distance (cm) Time (m) log_Kw hydrogen_bonding_acceptor_count hydrogen_bonding_donor_count solubility (g/L) molecular_wt (g/mol) pka1 pka2 Dye concentration (mg/L) Solution pH HA (mg/L) k Efficiency
count 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000 1527.000000
mean 992.534381 2.407800 4.142927 29.788527 14.267033 2.553418 2.744853 0.407073 1.776130 40.994695 0.170923 1.116110 0.073674 32.208743 0.039448 4.352083 0.081297 0.102161 99.764244 11.422397 137.681729 1.786396 3.636542 0.467583 32.986248 388.628939 7.157171 10.861100 14.479371 6.970530 0.471513 0.006963 51.955125
std 238.605181 0.722862 8.572614 8.287888 3.970434 4.736580 5.049570 0.872797 4.493009 23.895041 0.664602 1.013013 0.315130 29.118821 0.068732 4.488440 0.033870 0.057295 20.010075 1.572735 79.635446 0.704081 2.963659 0.846760 12.701398 42.951469 0.465718 1.016112 15.184775 2.264407 1.810214 0.006731 31.922546
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.020000 0.000000 0.000000 9.000000 0.000000 1.397600 2.000000 0.000000 10.000000 364.911000 6.900000 10.300000 5.000000 3.000000 0.000000 0.000000 0.000000
25% 960.000000 2.100000 0.000000 26.100000 9.950000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 19.500000 0.004200 2.340000 0.100000 0.050000 105.000000 12.000000 60.000000 1.397600 2.000000 0.000000 40.000000 364.911000 6.900000 10.300000 5.000000 5.450000 0.000000 0.002919 24.300000
50% 960.000000 2.100000 0.000000 26.100000 16.990000 0.000000 0.000000 0.000000 0.000000 54.900000 0.000000 2.010000 0.000000 22.900000 0.004600 2.670000 0.100000 0.100000 105.000000 12.000000 150.000000 1.397600 2.000000 0.000000 40.000000 364.911000 6.900000 10.300000 10.000000 5.450000 0.000000 0.005077 53.200000
75% 960.000000 2.600000 0.000000 31.600000 16.990000 0.000000 0.000000 0.000000 0.000000 54.900000 0.000000 2.010000 0.000000 22.900000 0.004600 2.670000 0.100000 0.150000 105.000000 12.000000 210.000000 1.397600 2.000000 0.000000 40.000000 364.911000 6.900000 10.300000 20.000000 7.000000 0.000000 0.008911 79.900000
max 1320.000000 4.610000 39.850000 51.390000 17.430000 13.680000 13.130000 3.000000 17.240000 55.830000 3.690000 3.050000 2.000000 140.000000 0.269000 18.980000 0.100000 0.250000 105.000000 23.000000 270.000000 3.060600 9.000000 2.000000 40.000000 466.360000 8.000000 12.700000 100.000000 11.000000 10.000000 0.039305 100.000000


print(len(data['Catalyst'].unique()))

data['Catalyst'].unique()
18

array(['LTH', 'LM1', 'LM1.5', 'LM2', 'no catalyst', 'pure BFO',
       '0.5 wt% Pd-BFO', '1 wt% Pd-BFO', '2 wt% Pd-BFO', '3 wt% Pd-BFO',
       '1 wt% Ag-BFO', '2 wt% Ag-BFO', '3 wt% Ag-BFO', '4 wt% Ag-BFO',
       '0.25 wt% Pt-BFO', '0.5 wt% Pt-BFO', '1 wt% Pt-BFO',
       '2 wt% Pt-BFO'], dtype=object)
colors, _ = map_array_to_cmap(np.arange(len(data['Catalyst'].unique())), cmap="tab20")
pie(data['Catalyst'], colors=colors)
eda
([<matplotlib.patches.Wedge object at 0x796857b5c080>, <matplotlib.patches.Wedge object at 0x796874c87cb0>, <matplotlib.patches.Wedge object at 0x796857b69be0>, <matplotlib.patches.Wedge object at 0x796857b6a390>, <matplotlib.patches.Wedge object at 0x796857b6ad50>, <matplotlib.patches.Wedge object at 0x796857b6b350>, <matplotlib.patches.Wedge object at 0x796857b6bd40>, <matplotlib.patches.Wedge object at 0x796857b806b0>, <matplotlib.patches.Wedge object at 0x796857b811c0>, <matplotlib.patches.Wedge object at 0x796857b818e0>, <matplotlib.patches.Wedge object at 0x796857b81f70>, <matplotlib.patches.Wedge object at 0x796857b82630>, <matplotlib.patches.Wedge object at 0x796857b82d50>, <matplotlib.patches.Wedge object at 0x796857b83410>, <matplotlib.patches.Wedge object at 0x796857b83ad0>, <matplotlib.patches.Wedge object at 0x796857bf8200>, <matplotlib.patches.Wedge object at 0x796857bf88f0>, <matplotlib.patches.Wedge object at 0x796857bf8fe0>], [Text(1.0979054584181451, 0.06784986643791263, '0.25 wt% Pt-BFO (30) '), Text(1.0811969547076803, 0.20251702429879467, '0.5 wt% Pd-BFO (30) '), Text(1.0480342260992856, 0.3341021713854484, '0.5 wt% Pt-BFO (30) '), Text(0.9989219604690933, 0.4606027756023443, '1 wt% Ag-BFO (30) '), Text(0.9346075741508314, 0.5800936841061954, '1 wt% Pd-BFO (30) '), Text(0.8560698373602738, 0.6907564212962154, '1 wt% Pt-BFO (30) '), Text(0.764503978754419, 0.7909068633338966, '2 wt% Ag-BFO (30) '), Text(-0.8603168727646185, 0.6854596110906225, '2 wt% Pd-BFO (780) '), Text(-0.6001465388378605, -0.9218590629380049, '2 wt% Pt-BFO (30) '), Text(-0.4820727478629304, -0.9887395338348132, '3 wt% Ag-BFO (30) '), Text(-0.35666251982538927, -1.0405728455768024, '3 wt% Pd-BFO (30) '), Text(-0.22582441364934935, -1.0765701715169, '4 wt% Ag-BFO (30) '), Text(0.07801126188187789, -1.097230259799463, 'LM1 (105) '), Text(0.5302858892755645, -0.9637410832973884, 'LM1.5 (105) '), Text(0.8851054967830092, -0.6531372440494438, 'LM2 (105) '), Text(1.0394672226534496, -0.35987205091410485, 'LTH (42) '), Text(1.0811969381167907, -0.20251711287413876, 'no catalyst (30) '), Text(1.097905452859649, -0.06784995638207392, 'pure BFO (30) ')], [Text(0.5988575227735337, 0.03700901805704325, '2.0%'), Text(0.5897437934769164, 0.11046383143570618, '2.0%'), Text(0.5716550324177921, 0.1822375480284264, '2.0%'), Text(0.5448665238922327, 0.2512378776012787, '2.0%'), Text(0.5097859495368171, 0.31641473678519744, '2.0%'), Text(0.4669471840146947, 0.3767762297979356, '2.0%'), Text(0.41700217022968306, 0.43140374363667083, '2.0%'), Text(-0.469263748780701, 0.3738870605948849, '51.1%'), Text(-0.3273526575479239, -0.5028322161480026, '2.0%'), Text(-0.26294877156159835, -0.5393124730008071, '2.0%'), Text(-0.1945431926320305, -0.5675851884964377, '2.0%'), Text(-0.12317695289964509, -0.5872200935546728, '2.0%'), Text(0.04255159739011521, -0.5984892326178889, '6.9%'), Text(0.28924684869576245, -0.5256769545258482, '6.9%'), Text(0.48278481642709586, -0.3562566785724239, '6.9%'), Text(0.5669821214473361, -0.19629384595314808, '2.8%'), Text(0.5897437844273403, -0.11046387974953022, '2.0%'), Text(0.5988575197416266, -0.03700906711749486, '2.0%')])

Anions are representative of inorganic content in the xyz.

print(len(data['Anions'].unique()))

data['Anions'].unique()
6

array(['N/A', 'NaCl', 'Na2SO4', 'NaCO3', 'NaHCO3', 'Na2HPO4'],
      dtype=object)
colors, _ = map_array_to_cmap(np.arange(len(data['Anions'].unique())), cmap="tab20")
pie(data['Anions'], colors=colors)
eda
([<matplotlib.patches.Wedge object at 0x796857b82210>, <matplotlib.patches.Wedge object at 0x796857b82420>, <matplotlib.patches.Wedge object at 0x796857b82e10>, <matplotlib.patches.Wedge object at 0x796857b83740>, <matplotlib.patches.Wedge object at 0x796857b83f50>, <matplotlib.patches.Wedge object at 0x796857b17080>], [Text(-1.048034323852062, 0.33410186474779097, 'N/A (1377) '), Text(0.9346079169970015, -0.58009313173535, 'Na2HPO4 (30) '), Text(0.9989222380844521, -0.46060217352977106, 'Na2SO4 (30) '), Text(1.0480344313800127, -0.33410152744633453, 'NaCO3 (30) '), Text(1.0811970815092378, -0.20251634733005408, 'NaCl (30) '), Text(1.097905501694757, -0.067849166158355, 'NaHCO3 (30) ')], [Text(-0.5716550857374884, 0.18223738077152232, '90.2%'), Text(0.509786136543819, -0.31641443549200904, '2.0%'), Text(0.544866675318792, -0.2512375491980569, '2.0%'), Text(0.5716551443890977, -0.18223719678890973, '2.0%'), Text(0.5897438626414023, -0.11046346218002949, '2.0%'), Text(0.5988575463789583, -0.03700863608637545, '2.0%')])
print(len(data['Dye'].unique()))

data['Dye'].unique()
2

array(['Indigo', 'Melachite Green'], dtype=object)
colors, _ = map_array_to_cmap(np.arange(len(data['Dye'].unique())), cmap="tab20")
pie(data['Dye'], colors=colors)
eda
([<matplotlib.patches.Wedge object at 0x7968559323c0>, <matplotlib.patches.Wedge object at 0x796857bf92b0>], [Text(0.8163984327169364, 0.737220183566165, 'Indigo (357) '), Text(-0.8163984672287029, -0.7372201453477955, 'Melachite Green (1170) ')], [Text(0.44530823602741976, 0.4021201001269991, '23.4%'), Text(-0.4453082548520197, -0.40212007928061566, '76.6%')])

Overall distribution of all features

data_num = data.drop(columns=['Catalyst', 'Anions', 'Dye'])

# rearrange_columns in data_num so that same categories lie close to each other

data_num_ = pd.DataFrame()
for cat, val in CATEGORIES.items():
    for v in val:
        if v in data_num:
            data_num_[v] = data_num[v]
data_num_['k'] = data_num['k']
data_num_['Efficiency'] = data_num['Efficiency']
data_num = data_num_

boxplot(data_num,
        labels=[LABEL_MAP.get(label, label) for label in data_num.columns],
        share_axes=False,
        flierprops=dict(ms=2.0),
        medianprops={"color": "black"},
        fill_color='#01B0B9',
        patch_artist=True,
        show=False,
        figsize=(7, 6),
        )
plt.subplots_adjust(wspace=0.05)
plt.tight_layout()
plt.show()
eda

Distribution of a feautre given a specific dye i.e. Dist(Feature)|Dye

grps = data.groupby(by="Dye")

f, axes = create_subplots(data_num.shape[1], figsize=(11, 8))

for col, ax in zip(data_num.columns, axes.flat):

    _, out = boxplot(
    [grps.get_group('Indigo')[col].values, grps.get_group('Melachite Green')[col].values],
        flierprops=dict(ms=2.0),
        medianprops={"color": "black"},
        fill_color=['#005066', '#B3331D'],
        widths=0.7,
        patch_artist=True,
        ax=ax,
        show=False
    )
    ax.set_xlabel(LABEL_MAP.get(col, col))
    ax.set_xticks([])
ax.legend([out["boxes"][0], out["boxes"][1]], ['Indigo', 'Melachite Green'],
          loc=(-2.5, -1))
plt.subplots_adjust(wspace=0.65, hspace=0.4)
if SAVE:
    plt.savefig("results/figures/boxplots.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
eda
/home/docs/checkouts/readthedocs.org/user_builds/weil101/checkouts/latest/scripts/eda.py:146: UserWarning: Tight layout not applied. tight_layout cannot make Axes width small enough to accommodate all Axes decorations
  plt.tight_layout()

correlation of all input features with k

data_num1 = data_num.rename(columns=LABEL_MAP)
data_num1.pop('Efficiency')
plot_correlation(data_num1, show=False)
if SAVE:
    plt.savefig("results/figures/corr_all_k.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
eda

correlation of all input features with Efficiency

data_num1 = data_num.rename(columns=LABEL_MAP)
data_num1.pop('k')
plot_correlation(data_num1)
eda

correlation of all input features with Efficiency and k

data_num1 = data_num.rename(columns=LABEL_MAP)

plot_correlation(data_num1, show=False)
if SAVE:
    plt.savefig("results/figures/corr_all_k_e.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
eda

correlation of only input features which were termed as important by Boruta method for k.

cols = ['Solution pH', 'Cat. Loading (g/L)', 'O (At%)', 'Pore Size (nm)',
        'HA (mg/L)', 'Mo (At%)', 'Light Int. (W)', #'Anions',
        'Initial Conc. (mg/L)', 'Rxn Time (min)', 'Ni (At%)']

data_num1 = data_num.rename(columns=LABEL_MAP)[cols + ["k"]]
plot_correlation(data_num1, annot_kws={"fontsize": 12}, show=False)
if SAVE:
    plt.savefig("results/figures/corr_k.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
eda

plotting only those where correlation is higher than 0.6

plot_correlation(data_num1, threshold=0.6, split="pos")
eda

plotting only those where correlation is below 0.5

plot_correlation(data_num1, threshold=-0.5, split="neg")
eda

correlation of input features which were termed as important by Boruta method.

data_num1 = data_num.rename(columns=LABEL_MAP)[cols + ["Efficiency"]]
plot_correlation(data_num1, annot_kws={"fontsize": 12})
eda
data_num1 = data_num.rename(columns=LABEL_MAP)[cols + ["k", "Efficiency"]]
plot_correlation(data_num1, annot_kws={"fontsize": 12},
                 show=False)
if SAVE:
    plt.savefig("results/figures/corr_selected.png", dpi=600, bbox_inches="tight")
plt.tight_layout()
plt.show()
eda
mpl.rcParams.update(mpl.rcParamsDefault)

data, encoders = prepare_data(outputs=["k", "Efficiency"])
input_features = data.columns.tolist()

tsne = TSNE(random_state=313)
comp = tsne.fit_transform(data[input_features])

def scatter_(first, second, label,
             axs, fig):

    c = data[col].values

    pc = axs.scatter(first, second,
                    c=c,
                    s = 2,
            cmap="Spectral",
            )

    if label in ["Catalyst", "Anions", "Dye"]:
        cmap = mpl.cm.Spectral
        bounds = list(set(c))
        norm = mpl.colors.BoundaryNorm(bounds + [bounds[-1]+1],
                                       cmap.N)

        colorbar = fig.colorbar(
            mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
            ticks = bounds,
                 ax=axs, orientation='vertical')

        if label == "Catalyst":
            ticklabels = range(18)
        elif label == "Anions":
            ticklabels = ['N/A', 'Cl', 'SO4', 'CO3', 'HCO3', 'HPO4']
        elif label == "Dye":
            ticklabels = ["MG", "INDIGO"]
        else:
            ticklabels = encoders[label].classes_
        colorbar.ax.set_yticklabels(ticklabels)

    else:
        colorbar = fig.colorbar(pc, ax=axs)
        label = LABEL_MAP.get(label, label)
        colorbar.set_label(label)

    despine_axes(colorbar.ax)
    return
f, axes = create_subplots(29, sharex="all", sharey="all",
                       figsize=(9, 9))

for col, ax in zip(input_features, axes.flat):

    scatter_(comp[:, 0], comp[:, 1],
         label=col, axs=ax, fig=f)

plt.tight_layout()
plt.show()
eda

Total running time of the script: (0 minutes 26.414 seconds)

Gallery generated by Sphinx-Gallery