.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/utils.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_utils.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_utils.py:


==============
utils
==============
This file contains utility functions which are used
in other files.

.. GENERATED FROM PYTHON SOURCE LINES 8-35

.. code-block:: Python


    import os
    import sys
    from typing import Union, List, Tuple, Callable

    import shap
    import scipy
    import mapie
    import crepes
    import numpy as np
    import pandas as pd

    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.lines import Line2D

    import seaborn as sns

    from easy_mpl import plot, scatter
    from easy_mpl.utils import create_subplots

    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    from ai4water.eda import EDA
    from ai4water.preprocessing import DataSet
    from ai4water.utils.utils import get_version_info


.. GENERATED FROM PYTHON SOURCE LINES 36-39

.. code-block:: Python


    # Module-level switch. Nothing in this file reads it; presumably other
    # scripts import it to decide whether figures are written to disk -- TODO confirm.
    SAVE = False


.. GENERATED FROM PYTHON SOURCE LINES 40-42

We have 18 types of catalysts. We can, however, group them
into the following 7 broad categories.

.. GENERATED FROM PYTHON SOURCE LINES 42-62

.. code-block:: Python

    # Lookup from an individual catalyst name to its broad category. It is
    # built from a category -> members listing so the grouping is visible at
    # a glance; key order follows the listing below.
    CATALYST_CATEGORIES = {
        catalyst: category
        for category, members in (
            ('LTH', ('LTH',)),  # Layered triple hydroxide
            ('LM', ('LM1', 'LM1.5', 'LM2')),
            ('Photolysis', ('no catalyst',)),
            ('BFO', ('pure BFO',)),
            ('Pd-BFO', ('0.5 wt% Pd-BFO', '1 wt% Pd-BFO',
                        '2 wt% Pd-BFO', '3 wt% Pd-BFO')),
            ('Ag-BFO', ('1 wt% Ag-BFO', '2 wt% Ag-BFO',
                        '3 wt% Ag-BFO', '4 wt% Ag-BFO')),
            ('Pt-BFO', ('0.25 wt% Pt-BFO', '0.5 wt% Pt-BFO',
                        '1 wt% Pt-BFO', '2 wt% Pt-BFO')),
        )
        for catalyst in members
    }


.. GENERATED FROM PYTHON SOURCE LINES 63-91

.. code-block:: Python


    # Feature (column) names grouped under four headings. ``make_classes``
    # looks features up in these lists.
    CATEGORIES = {
        "Physicochemical Properties": [
            "Catalyst",
            "Pore size (nm)",
            "Pore volume (cm3/g)",
            "Energy Band gap (Eg) eV",
            "Surface area (m2/g)",
        ],
        "Atomic Composition": [
            "O (At%)",
            "Mo (At%)",
            "Ni (At%)",
            "S (At%)",
            "C (At%)",
            "Fe (At%)",
            "Al (At%)",
            "Bi",
            "Ag",
            "Pd",
            "Pt",
        ],
        "Dye Properties": [
            "log_Kw",
            "hydrogen_bonding_acceptor_count",
            "hydrogen_bonding_donor_count",
            "solubility (g/L)",
            "molecular_wt (g/mol)",
            "Dye",
            "pka1",
            "pka2",
        ],
        "Experimental Conditions": [
            "Hydrothermal synthesis time (min)",
            "volume (L)",
            "loading (g)",
            "Light intensity (watt)",
            "Light source distance (cm)",
            "Time (m)",
            "Dye concentration (mg/L)",
            "Solution pH",
            "HA (mg/L)",
            "Anions",
            "Mass ratio (Catalyst/Dye)",
        ],
    }


.. GENERATED FROM PYTHON SOURCE LINES 92-121

.. code-block:: Python


    # Short display labels for column names. The plotting helpers in this
    # file fall back to the raw column name when a key is missing.
    LABEL_MAP = {
        "Hydrothermal synthesis time (min)": "Synth. Time (min)",
        "Energy Band gap (Eg) eV": "Band Gap (eV)",
        "Light source distance (cm)": "Light Dist. (cm)",
        "Dye concentration (mg/L)": "Initial Conc. (mg/L)",
        "Surface area (m2/g)": "Surface Area (m2/g)",
        "Pore volume (cm3/g)": "Pore Vol. (cm3/g)",
        "Light intensity (watt)": "Light Int. (W)",
        "Catalyst_loading_mg": "Cat. loading",
        "hydrogen_bonding_acceptor_count": "HB acceptor count",
        "hydrogen_bonding_donor_count": "HB donor count",
        "molecular_wt (g/mol)": "M.W. (g/mol)",
        "solubility (g/L)": "Solubility (g/L)",
        "volume (L)": "Volume (L)",
        "Time (m)": "Rxn Time (min)",
        "Bi": "Bi (At%)",
        "Ag": "Ag (At%)",
        "Pd": "Pd (At%)",
        "Pt": "Pt (At%)",
        "log_Kw": "log Kow",
        "loading (g)": "Cat. Loading (g/L)",
        "Pore size (nm)": "Pore Size (nm)",
        "Dye": "Dyes",
        "pka1": "pka1",
        "pka2": "pka2",
        "Mass ratio (Catalyst/Dye)": "Mass Ratio",
    }


.. GENERATED FROM PYTHON SOURCE LINES 122-186

.. code-block:: Python


    def read_data(
            inputs:Union[str, List[str]]=None,
            outputs:Union[str, List[str]] = None
    )->pd.DataFrame:
        """Read the photocatalysis csv file and return the selected columns.

        The file is looked up under ``data/`` in the current working directory.
        Before the columns are selected, two rate-constant columns (``k`` and
        ``k_2nd``) are derived from ``Ci``, ``Cf`` and ``Time (m)``, and the
        'no anion' markers in the ``Anions`` column are replaced by ``"N/A"``.

        :param inputs:
            name, or sequence of names, of the input columns. If None, a
            default set of input columns is used.
        :param outputs:
            name, or list of names, of the output columns. If None,
            ``'Efficiency'`` is used.
        :return:
            a dataframe whose columns are the inputs followed by the outputs
        """

        default_inputs = [
            'Catalyst', 'Hydrothermal synthesis time (min)',
           'Energy Band gap (Eg) eV', 'C (At%)', 'O (At%)', 'Fe (At%)', 'Al (At%)',
           'Ni (At%)', 'Mo (At%)', 'S (At%)', 'Bi', 'Ag', 'Pd', 'Pt',
           'Surface area (m2/g)', 'Pore volume (cm3/g)', 'Pore size (nm)',
           'volume (L)',

            # consider only one of loading or catalyst loading
            'loading (g)', #'Catalyst_loading_mg',
           'Light intensity (watt)', 'Light source distance (cm)', 'Time (m)',

           'Dye',

            # pollutant (dye) properties
            'log_Kw', 'hydrogen_bonding_acceptor_count', 'hydrogen_bonding_donor_count',
            'solubility (g/L)', 'molecular_wt (g/mol)', 'pka1', 'pka2',

            # instead of Ci we consider Dye Concentration
            'Dye concentration (mg/L)', 'Solution pH', #'Ci',
            'HA (mg/L)',
           'Anions',

            #'Mass ratio (Catalyst/Dye)'
        ]

        fpath = os.path.join(os.getcwd(), "data", "230613_Photocatalysis_with_Zeeshan_data_CMKim_Updated.csv")
        df = pd.read_csv(fpath)

        # first order k following https://doi.org/10.1016/j.seppur.2019.116195
        k = np.log(df["Ci"] / df["Cf"]) / df["Time (m)"]
        df["k"] = k

        k_2nd = ((1 / df["Cf"]) - (1 / df["Ci"])) / df["Time (m)"]
        df["k_2nd"] = k_2nd

        # at Time 0, let k==0
        df.loc[df['Time (m)'] <= 0.0, "k"] = 0.0

        # when final concentration is zero, k is not calculable (will be inf)
        # therefore inserting very small value of k
        df.loc[df['Cf']==0.0, "k"] = 0.001

        #mass_ratio = (loading / volume )/dye_conc.

        # when no anions are present, represent them as N/A
        df.loc[df['Anions'].isin(['0', 'without Anion']), "Anions"] = "N/A"

        if inputs is None:
            inputs = default_inputs
        elif isinstance(inputs, str):
            # a single column name is allowed by the signature; without this
            # the concatenation below fails with ``str + list``
            inputs = [inputs]

        if outputs is None:
            outputs = ['Efficiency']
        elif not isinstance(outputs, list):
            outputs = [outputs]

        # list(...) so that tuples or other sequences of names also work
        return df[list(inputs) + outputs]


.. GENERATED FROM PYTHON SOURCE LINES 187-204

.. code-block:: Python


    def _ohe_column(df:pd.DataFrame, col_name:str)->tuple:
        """One-hot encode the column ``col_name`` of ``df``.

        The dataframe is modified in place: one new column per encoded
        category, named ``{col_name}_{i}``, is appended and the original
        column is removed.

        :param df: dataframe which contains ``col_name``
        :param col_name: name of the column to encode
        :return:
            a tuple of the (modified) dataframe, the list of names of the
            columns which were added, and the fitted encoder
        """
        assert isinstance(col_name, str)

        # We need a dense numpy array (not a scipy sparse matrix) so that it can
        # be assigned to dataframe columns. scikit-learn renamed the ``sparse``
        # argument to ``sparse_output`` and later dropped the old name, so try
        # the new spelling first and fall back for older releases.
        try:
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(sparse=False)

        ohe_cat = encoder.fit_transform(df[col_name].values.reshape(-1, 1))
        cols_added = [f"{col_name}_{i}" for i in range(ohe_cat.shape[-1])]

        df[cols_added] = ohe_cat

        df.pop(col_name)

        return df, cols_added, encoder


.. GENERATED FROM PYTHON SOURCE LINES 205-254

.. code-block:: Python


    def prepare_data(
            inputs = None,
            outputs=None,
            encoding="le",
    )->Tuple[pd.DataFrame, dict]:
        """Read the data and encode its categorical columns.

        :param inputs:
            input column name(s), forwarded to ``read_data``
        :param outputs:
            output column name(s), forwarded to ``read_data``. If None,
            ``'Efficiency'`` is the output.
        :param encoding:
            ``"le"`` for label encoding, ``"ohe"`` for one-hot encoding or
            None to leave the categorical columns untouched.
        :return:
            a tuple of the dataframe, with the output column(s) placed last,
            and a dictionary with keys 'Catalyst', 'Dye' and 'Anions' holding
            the fitted encoder of that column, or None when the column was
            not encoded.
        """
        if encoding is not None:
            assert encoding in ("le", "ohe")

        data = read_data(inputs, outputs)

        encoders = {}
        # Every categorical column is optional: it is only encoded when the
        # caller selected it. 'Anions' used to be encoded unconditionally,
        # which failed with a KeyError when it was not among the inputs.
        for col in ('Catalyst', 'Dye', 'Anions'):
            encoder = None
            if col in data.columns:
                if encoding == "ohe":
                    data, _, encoder = _ohe_column(data, col)
                elif encoding == "le":
                    data, encoder = le_column(data, col)
            encoders[col] = encoder

        # make sure that the output column(s) come last. A single column name
        # given as string is accepted here, as it is by read_data.
        if outputs is None:
            output_cols = ['Efficiency']
        elif isinstance(outputs, str):
            output_cols = [outputs]
        else:
            output_cols = list(outputs)

        for out in output_cols:
            # pop + re-assign moves the column to the end
            data[out] = data.pop(out)

        return data, encoders


.. GENERATED FROM PYTHON SOURCE LINES 255-262

.. code-block:: Python


    def le_column(df:pd.DataFrame, col_name)->tuple:
        """Replace column ``col_name`` of ``df`` by its label-encoded values.

        The dataframe is modified in place. The dataframe and the fitted
        ``LabelEncoder`` are returned.
        """
        label_encoder = LabelEncoder()
        encoded_values = label_encoder.fit_transform(df[col_name])
        df[col_name] = encoded_values
        return df, label_encoder


.. GENERATED FROM PYTHON SOURCE LINES 263-294

.. code-block:: Python


    def set_rcParams(**kwargs):
        """Apply this project's matplotlib rcParams.

        Any keyword argument is treated as an rcParams entry and takes
        precedence over the defaults defined here.
        """
        # https://matplotlib.org/stable/tutorials/introductory/customizing.html
        settings = {
            'axes.labelsize': '14',
            'axes.labelweight': 'bold',
            'xtick.labelsize': '12',
            'ytick.labelsize': '12',
            'font.weight': 'bold',
            'legend.title_fontsize': '12',
            'axes.titleweight': 'bold',
            'axes.titlesize': '14',
        }

        on_linux = sys.platform == "linux"
        settings['font.family'] = 'serif' if on_linux else "Times New Roman"
        if on_linux:
            # put Times New Roman in front of the currently configured serif fonts
            settings['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

        # user supplied values win; updating with empty kwargs changes nothing
        settings.update(kwargs)

        for name in settings:
            plt.rcParams[name] = settings[name]


.. GENERATED FROM PYTHON SOURCE LINES 295-309

.. code-block:: Python


    def get_dataset(encoding="le", seed=313):
        """Build a ``DataSet`` from the prepared data.

        All columns except the last one are used as input features and the
        last column is used as output feature.

        :param encoding: forwarded to ``prepare_data``
        :param seed: forwarded to ``DataSet``
        :return: a tuple of the ``DataSet`` and the encoders dictionary
        """
        data, encoders = prepare_data(encoding=encoding)

        column_names = data.columns.tolist()

        dataset = DataSet(
            data=data,
            seed=seed,
            split_random=True,
            input_features=column_names[:-1],
            output_features=column_names[-1:],
        )
        return dataset, encoders


.. GENERATED FROM PYTHON SOURCE LINES 310-326

.. code-block:: Python


    def plot_correlation(df, show=True, **kwargs):
        """Draw the correlation plot of ``df`` using ``EDA.correlation``.

        :param df: the data whose correlation is plotted
        :param show: whether to call ``tight_layout`` and show the plot
        :param kwargs: additional keyword arguments for ``EDA.correlation``
        """
        heatmap_ax = EDA(data=df, show=False).correlation(
            figsize=(9, 9),
            square=True,
            cbar_kws={"shrink": .72},
            cmap="RdYlGn",
            **kwargs
        )

        tick_style = dict(fontsize=12, weight='bold')
        heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=70, **tick_style)
        heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), **tick_style)

        if not show:
            return

        plt.tight_layout()
        plt.show()


.. GENERATED FROM PYTHON SOURCE LINES 327-363

.. code-block:: Python


    def plot_ci(
            prediction,
            lower,
            upper,
            coverage:float,
            num_points=None,
            title= None,
            axes:"plt.Axes" = None,
            legned:bool = True,
    ):
        """Plot predictions as a line with a shaded band between two bounds.

        :param prediction:
            the predicted values; they are plotted against their position
        :param lower:
            lower bound of the band, expected to align with ``prediction``
        :param upper:
            upper bound of the band, expected to align with ``prediction``
        :param coverage:
            coverage as fraction (e.g. 0.95); it is only used for the legend
            label, where it is shown as percentage.
        :param num_points:
            if given (and not zero), only the first ``num_points`` values of
            ``prediction``, ``lower`` and ``upper`` are plotted.
        :param title:
            title of the axes (optional)
        :param axes:
            the axes to draw on. If None, a new figure is created.
        :param legned:
            whether to draw the legend. (The spelling of the argument name is
            kept as it is for backward compatibility.)
        :return:
            the axes on which the plot is drawn
        """
        if num_points:
            prediction = prediction[0:num_points]
            lower = lower[0:num_points]
            upper = upper[0:num_points]

        if axes is None:
            f, axes = plt.subplots()

        axes.plot(prediction,
                    color='forestgreen',
                    label='Prediction')

        # round instead of int: ``int(0.58 * 100)`` is 57 because of floating
        # point representation, which put a wrong percentage in the legend
        coverage_percent = int(round(coverage * 100))

        axes.fill_between(np.arange(len(prediction)),
                          lower,
                          upper,
                          color="forestgreen",
                          label=f"{coverage_percent}% CI",
                          alpha=0.6
                          )
        if legned:
            axes.legend()
        if title:
            axes.set_title(title)

        return axes


.. GENERATED FROM PYTHON SOURCE LINES 364-432

.. code-block:: Python


    def ale_1d(
            predictor:Callable,
            X:pd.DataFrame,
            feature:str,
            bins:int = 10
    ):
        """creates 1d ale for a continuous feature
        Copying code from alepython package

        :param predictor:
            a callable which takes a dataframe like ``X`` and returns an array
            with one prediction per row
        :param X:
            the data on which the effects are calculated
        :param feature:
            name of the column in ``X`` for which ale is calculated
        :param bins:
            number of quantile intervals requested. The actual number can be
            smaller because duplicated quantile values are dropped.
        :return:
            a tuple of the ale values (one per interval) and the unique
            quantile values which delimit the intervals
        """
        probabilities = np.linspace(0, 1, bins + 1)
        # numpy renamed ``interpolation`` to ``method``. Comparing version
        # strings is unreliable (the comparison is character by character),
        # so try the new keyword and fall back to the old one instead.
        try:
            bin_edges = np.quantile(X[feature], probabilities, method="lower")
        except TypeError:
            bin_edges = np.quantile(X[feature], probabilities, interpolation="lower")

        quantiles = np.unique(bin_edges)

        # index of the interval to which each sample belongs. Samples equal to
        # the smallest quantile get -1 from digitize, so they are clipped into
        # the first interval.
        indices = np.clip(
            np.digitize(X[feature], quantiles, right=True) - 1, 0, None
        )

        # predict with the feature moved to the lower (offset 0) and to the
        # upper (offset 1) edge of the interval of each sample
        predictions = []
        for offset in range(2):
            mod_X = X.copy()
            mod_X[feature] = quantiles[indices + offset]
            predictions.append(predictor(mod_X))
        # The individual effects.
        effects = predictions[1] - predictions[0]

        index_groupby = pd.DataFrame({"index": indices, "effects": effects}).groupby(
            "index"
        )

        mean_effects = index_groupby.mean().to_numpy().flatten()

        # accumulate the mean effects, starting from zero
        ale = np.array([0, *np.cumsum(mean_effects)])

        # value at the middle of each interval
        ale = (ale[1:] + ale[:-1]) / 2

        # center by subtracting the mean weighted by the samples per interval
        ale -= np.sum(ale * index_groupby.size() / X.shape[0])
        return ale, quantiles


    def plot_ale(
            predictor,
            X:pd.DataFrame,
            feature:str,
            bins:int = 10,
            ax=None,
            show:bool = True,
            **kwargs
    ):
        """Plot the 1d ale of ``feature`` against the mid points of its intervals.

        :param predictor: forwarded to ``ale_1d``
        :param X: forwarded to ``ale_1d``
        :param feature: name of the feature; also used for the x-axis label
        :param bins: forwarded to ``ale_1d``
        :param ax: axes to draw on, forwarded to ``plot``
        :param show: whether to show the plot
        :param kwargs: additional keyword arguments for ``plot``
        """
        ale_values, edges = ale_1d(predictor, X, feature, bins=bins)

        # one x value per interval: the middle of its two edges
        mid_points = (edges[1:] + edges[:-1]) / 2

        ax = plot(mid_points, ale_values, ax=ax, show=False, **kwargs)

        xlabel = LABEL_MAP.get(feature, feature)
        ax.set_xlabel(xlabel)
        ax.grid(ls=":", color="lightgrey")

        if not show:
            return

        plt.show()


.. GENERATED FROM PYTHON SOURCE LINES 433-508

.. code-block:: Python


    def shap_scatter_plots(
            shap_values:np.ndarray,
            TrainX:pd.DataFrame,
            feature_name:str,
            encoders,
            save:bool = True,
    ):
        """
        Draws one shap scatter plot of ``feature_name`` per column of ``TrainX``,
        where the markers of each plot are colored by that column.

        It is expected that the columns in TrainX and shap_values have same order.
        :param shap_values:
            2d array of shap values; the column of ``feature_name`` is plotted
        :param TrainX:
            the data corresponding to ``shap_values``
        :param feature_name:
            name of the column in ``TrainX`` whose shap values are plotted
        :param encoders:
            mapping from column name to its fitted encoder. It is used to
            decode the 'Catalyst' and 'Anions' columns.
        :param save:
            whether to save the figure under ``results/figures``. The
            directory is created if it does not exist.

        :return:
        """
        _, axes = create_subplots(TrainX.shape[1],
                                  figsize=(12, 9))

        index = TrainX.columns.to_list().index(feature_name)

        for feature, ax in zip(TrainX.columns, axes.flat):

            # only these two columns are treated as categorical for coloring
            clr_f_is_cat = feature in ('Catalyst', 'Anions')

            if clr_f_is_cat:
                enc = encoders[feature]
                dec_feature = pd.Series(
                    enc.inverse_transform(TrainX.loc[:, feature].values),
                                        name=feature)
                if feature == 'Catalyst':
                    # color by the broad catalyst category
                    dec_feature_d = {k: CATALYST_CATEGORIES[k] for k in dec_feature.unique()}
                    color_feature = dec_feature.map(dec_feature_d)
                else:
                    color_feature = dec_feature

                # instead of showing the actual names, we still prefer to
                # label encode them because actual names takes very large
                # space in figure/axes
                color_feature = pd.Series(
                    LabelEncoder().fit_transform(color_feature),
                                          name=feature)
            else:
                color_feature = TrainX.loc[:, feature]

            color_feature.name = LABEL_MAP.get(color_feature.name, color_feature.name)

            ax = shap_scatter(
                shap_values[:, index],
                feature_data=TrainX.loc[:, feature_name].values,
                feature_name=LABEL_MAP.get(feature_name, feature_name),
                color_feature=color_feature,
                color_feature_is_categorical=clr_f_is_cat,
                show=False,
                alpha=0.5,
                ax=ax
            )
            ax.set_ylabel('')

        plt.tight_layout()

        if save:
            # spaces and slashes are not wanted in the file name
            feature_name = feature_name.replace(' ', '')
            feature_name = feature_name.replace('/', '_')
            # savefig does not create missing directories
            os.makedirs("results/figures", exist_ok=True)
            plt.savefig(f"results/figures/shap_interac_{feature_name}.png", dpi=600, bbox_inches="tight")

        plt.show()

        return


.. GENERATED FROM PYTHON SOURCE LINES 509-637

.. code-block:: Python


    def shap_scatter(
            feature_shap_values:np.ndarray,
            feature_data:Union[pd.DataFrame, np.ndarray, pd.Series],
            color_feature:pd.Series=None,
            color_feature_is_categorical:bool = False,
            feature_name:str = '',
            show_hist:bool = True,
            palette_name = "tab10",
            s:int = 70,
            ax:plt.Axes = None,
            edgecolors='black',
            linewidth=0.8,
            alpha=0.8,
            show:bool = True,
            **scatter_kws,
    ):
        """
        Scatter plot of the shap values of a feature against the feature's values.

        :param feature_shap_values:
            shap values of the feature (y-axis)
        :param feature_data:
            values of the feature (x-axis)
        :param color_feature:
            the values which are used to color the markers. If None, the
            markers are not colored by any feature.
        :param color_feature_is_categorical:
            whether the color feature is categorical or not. If categorical then the
            array ``color_feature`` is supposed to contain categorical (either string or numerical) values which
            are then mapped to colors and are used to prepare the legend box.
            Otherwise a colorbar is drawn.
        :param feature_name:
            used in the labels of x and y axes
        :param show_hist:
            whether to draw a light histogram of ``feature_data`` behind the markers
        :param palette_name:
            only relevant if ``color_feature_is_categorical`` is True. Either
            the name of a seaborn palette or a list/tuple with one color per
            unique value in ``color_feature``.
        :param s:
            marker size
        :param ax:
            axes to draw on. If None, a new figure is created.
        :param edgecolors:
            forwarded to ``scatter``
        :param linewidth:
            forwarded to ``scatter``
        :param alpha:
            forwarded to ``scatter``
        :param show:
            whether to show the plot
        :param scatter_kws:
            additional keyword arguments for ``scatter``
        :return:
            the axes on which the scatter plot is drawn
        """
        if ax is None:
            _, ax = plt.subplots()

        has_color = color_feature is not None

        # work out what is handed over to scatter as ``c``
        c = None
        color_map = None
        if has_color and color_feature_is_categorical:
            categories = color_feature.unique()
            if isinstance(palette_name, (tuple, list)):
                assert len(palette_name) == len(categories)
                rgb_values = palette_name
            else:
                rgb_values = sns.color_palette(palette_name, len(categories))
            color_map = dict(zip(categories, rgb_values))
            c = color_feature.map(color_map)
        elif has_color:
            c = color_feature.values.reshape(-1,)

        _, pc = scatter(
            feature_data,
            feature_shap_values,
            c=c,
            s=s,
            marker="o",
            edgecolors=edgecolors,
            linewidth=linewidth,
            alpha=alpha,
            ax=ax,
            show=False,
            **scatter_kws
        )

        if has_color:
            feature_wrt_name = color_feature.name.replace('_', ' ')

            if color_feature_is_categorical:
                # one legend entry per category
                handles = []
                for category, rgb in color_map.items():
                    handles.append(
                        Line2D([0], [0],
                               marker='o',
                               color='w',
                               markerfacecolor=rgb,
                               label=category,
                               markersize=8)
                    )

                ax.legend(title=feature_wrt_name,
                          handles=handles,
                          bbox_to_anchor=(1.05, 1),
                          loc='upper left',
                          title_fontsize=14)
            else:
                # increasing aspect will make the colorbar thin
                cbar = ax.get_figure().colorbar(pc, ax=ax, aspect=20)
                cbar.ax.set_ylabel(feature_wrt_name,
                                   rotation=90, labelpad=14)

                cbar.set_alpha(1)
                cbar.outline.set_visible(False)

        ax.set_xlabel(feature_name)
        ax.set_ylabel(f"SHAP value for {feature_name}")
        ax.axhline(0, color='grey', linewidth=1.3, alpha=0.3, linestyle='--')

        if show_hist:
            x = feature_data
            if isinstance(x, (pd.Series, pd.DataFrame)):
                x = x.values

            # more samples -> more bins
            num_bins = next(
                (nbins for threshold, nbins in ((500, 50), (200, 20), (100, 10))
                 if len(x) >= threshold),
                5
            )

            ax2 = ax.twinx()

            x_min, x_max = ax.get_xlim()

            ax2.hist(x.reshape(-1,), num_bins,
                     range=(x_min, x_max),
                     density=False, facecolor='#000000', alpha=0.1, zorder=-1)
            ax2.set_ylim(0, len(x))
            ax2.set_yticks([])

        if show:
            plt.show()

        return ax


.. GENERATED FROM PYTHON SOURCE LINES 638-648

.. code-block:: Python


    def version_info()->dict:
        """Version information from ``get_version_info`` extended with the
        versions of crepes, mapie, shap, scipy and matplotlib."""
        versions = get_version_info()

        additional_libraries = (
            ('crepes', crepes),
            ('mapie', mapie),
            ('shap', shap),
            ('scipy', scipy),
            ('matplotlib', matplotlib),
        )
        for lib_name, lib in additional_libraries:
            versions[lib_name] = lib.__version__

        return versions


.. GENERATED FROM PYTHON SOURCE LINES 649-675

.. code-block:: Python


    def make_classes(exp):
        """Assign a category and a color to the features in ``exp.feature_names``.

        A feature which is not found in any list of ``CATEGORIES`` is skipped.

        :param exp: an object with the attribute ``feature_names``
        :return:
            a tuple of the list of category names (one per matched feature),
            the dictionary which maps category name to color, and the list of
            colors (one per matched feature).
        """
        colors = {'Experimental Conditions': '#ed9571',
                  'Physicochemical Properties': '#faebd7',
                  'Atomic Composition': '#8a5a45',
                  'Dye Properties': '#F3D4C4'
                  }

        # the categories are searched in this order; the first match wins
        search_order = (
            'Experimental Conditions',
            'Physicochemical Properties',
            'Atomic Composition',
            'Dye Properties',
        )

        classes = []
        colors_ = []
        for f in exp.feature_names:
            category = next(
                (name for name in search_order if f in CATEGORIES[name]),
                None
            )
            if category is None:
                continue
            classes.append(category)
            colors_.append(colors[category])

        return classes, colors, colors_


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 0.010 seconds)


.. _sphx_glr_download_auto_examples_utils.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: utils.ipynb <utils.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: utils.py <utils.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: utils.zip <utils.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_