Source code for eflow.model_analysis.outlier_analysis

from eflow.utils.sys_utils import *
from eflow._hidden.parent_objects import ModelAnalysis
from eflow.utils.pandas_utils import zcore_remove_outliers

import pandas as pd
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import numpy as np

__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"

[docs]class OutlierAnalysis(ModelAnalysis):

    """
        Analyzes a classification model's result's based on the prediction
        function(s) passed to it. Creates graphs and tables to be saved in directory
        structure.
    """

    def __init__(self,
                 dataset_name,
                 model,
                 model_name,
                 feature_order,
                 df_features,
                 project_sub_dir="Outlier Analysis",
                 overwrite_full_path=None,
                 save_models=True,
                 notebook_mode=False):

        # Init parent object
        ModelAnalysis.__init__(self,
                               f'{dataset_name}/{project_sub_dir}/{model_name}',
                               overwrite_full_path)

        self.__feature_order = copy.deepcopy(feature_order)

        self.__model = copy.deepcopy(model)
        self.__model_name = copy.deepcopy(model_name)

        self.__df_features = copy.deepcopy(df_features)
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Determines if the perform was called
        self.__called_from_perform = False

        del model

        # Attempt to save machine learning model
        if save_models:
            try:
                pickle_object_to_file(model,
                                      self.folder_path,
                                      model_name)
            except:
                pass

        create_dir_structure(self.folder_path,
                             "_Extras")

    def model_decision_outliers(self):

        return None


    def graph_decision_outliers(self,
                                X,
                                heavy_outlier_zscore=float("inf"),
                                medium_outlier_zscore=float("inf"),
                                save_file=True):

        model_decisions = self.__model.decision_function(X)

        zscore_val = heavy_outlier_zscore
        zscore_series = pd.Series((model_decisions - model_decisions.mean()) /
                                  model_decisions.std(ddof=0))

        bool_series = zscore_series.between(zscore_val * -1, zscore_val)
        hv_outlier_index_list = bool_series[
            bool_series == False].index.tolist()

        hv_outlier_dict = dict(zip(hv_outlier_index_list,list(zscore_series)))
        tmp_df = pd.DataFrame.from_dict(hv_outlier_dict,
                                        orient='index',columns=["Z-Scores"])

        tmp_df.sort_values(by=['Z-Scores'], inplace=True)
        tmp_df.to_csv(self.folder_path +  f"_Extras/Heavy Outliers and Inlier with a abs zscore {zscore_val}.csv")


        # SAVE HERE

        del zscore_series, hv_outlier_dict, tmp_df

        zscore_val = medium_outlier_zscore
        zscore_series = pd.Series((model_decisions[bool_series] -
                                   model_decisions[bool_series].mean()) /
                                  model_decisions[bool_series].std(ddof=0))

        bool_series = zscore_series.between(zscore_val * -1,
                                            zscore_val)
        md_outlier_index_list = bool_series[
            bool_series == False].index.tolist()


        md_outlier_dict = dict(zip(md_outlier_index_list, list(zscore_series)))

        tmp_df = pd.DataFrame.from_dict(md_outlier_dict,
                                        orient='index', columns=["Z-Scores"])

        tmp_df.sort_values(by=['Z-Scores'], inplace=True)
        tmp_df.to_csv(
            self.folder_path + f"_Extras/Medium Outliers and Inlier with a abs zscore of {zscore_val}.csv")

        # SAVE HERE

        del zscore_series, md_outlier_dict, tmp_df,bool_series

        for shading_pos_neg in [True,False]:
            self.__helper_decision_outlier_graph(model_decisions,
                                                 title=f"{self.__model_name} decision function",
                                                 filename=f"{self.__model_name} decision function",
                                                 shading_pos_neg=shading_pos_neg,
                                                 save_file=save_file)

            title_and_filename = f"{self.__model_name} decision function with heavy outlier removed (Zscore of {heavy_outlier_zscore})"

            if not shading_pos_neg:
                title_and_filename += " show zscore boundaries"

            self.__helper_decision_outlier_graph(model_decisions,
                                                 hv_outlier_index_list=hv_outlier_index_list,
                                                 title=title_and_filename,
                                                 filename=title_and_filename,
                                                 shading_pos_neg=shading_pos_neg,
                                                 save_file=save_file)

            title_and_filename = f"{self.__model_name} decision function with medium (Zscore of {medium_outlier_zscore}) and heavy outlier removed (Zscore of {heavy_outlier_zscore})"

            if not shading_pos_neg:
                title_and_filename += " show zscore boundaries"

            self.__helper_decision_outlier_graph(model_decisions,
                                                 hv_outlier_index_list=hv_outlier_index_list,
                                                 md_outlier_index_list=md_outlier_index_list,
                                                 title=title_and_filename,
                                                 filename=title_and_filename,
                                                 shading_pos_neg=shading_pos_neg,
                                                 save_file=save_file)




    def __helper_decision_outlier_graph(self,
                                        model_decisions,
                                        hv_outlier_index_list=[],
                                        md_outlier_index_list=[],
                                        title="default",
                                        filename="default",
                                        shading_pos_neg=True,
                                        save_file=True):

        outlier_val = min(np.delete(model_decisions,
                                    md_outlier_index_list + hv_outlier_index_list))
        inlier_val = max(np.delete(model_decisions,
                                   md_outlier_index_list + hv_outlier_index_list))

        model_decisions = np.delete(model_decisions,
                                    md_outlier_index_list + hv_outlier_index_list)

        del hv_outlier_index_list, md_outlier_index_list


        plt.figure(figsize=(10, 10))
        ax = sns.distplot(model_decisions, kde=True,
                          hist_kws={'edgecolor': 'black', "rwidth": .9, },
                          bins=16)

        ymax = ax.get_ylim()[1] - (ax.get_ylim()[1] * .03)

        if shading_pos_neg:
            outlier_val = 0

            ax.annotate('Outlier boundaries', xy=(outlier_val, 0),
                        xytext=(outlier_val, ymax),
                        arrowprops=dict(facecolor='red', alpha=.5))

            rect = patches.Rectangle((outlier_val, 0), ax.get_xlim()[1],
                                     ymax - 0, facecolor='blue', alpha=0.07, )
            ax.add_patch(rect)

            rect = patches.Rectangle((ax.get_xlim()[0], 0),
                                     outlier_val - ax.get_xlim()[0], ymax - 0,
                                     facecolor='#b93c43', alpha=0.1)
            ax.add_patch(rect)

            plt.legend(['Inliers', f'Abs Z-Score higher than '],
                       bbox_to_anchor=(1.02, 1), loc='upper left')
        else:

            ax.annotate('Outlier boundaries', xy=(outlier_val, 0),
                        xytext=(outlier_val, ymax),
                        arrowprops=dict(facecolor='red', alpha=.5))
            ax.annotate('Inlier boundaries', xy=(inlier_val, 0),
                        xytext=(inlier_val, ymax),
                        arrowprops=dict(facecolor='red', alpha=.5))

            rect = patches.Rectangle((outlier_val, 0),
                                     inlier_val - outlier_val, ymax - 0,
                                     facecolor='blue', alpha=0.07, )
            ax.add_patch(rect)

            rect = patches.Rectangle((ax.get_xlim()[0], 0),
                                     outlier_val - ax.get_xlim()[0], ymax - 0,
                                     facecolor='#b93c43', alpha=0.1)
            ax.add_patch(rect)

            rect = patches.Rectangle((ax.get_xlim()[1], 0),
                                     inlier_val - ax.get_xlim()[1], ymax - 0,
                                     facecolor='#b93c43', alpha=0.1)
            ax.add_patch(rect)


        plt.legend(['Inliers', f'Abs Z-Score higher than '],
                   bbox_to_anchor=(1.02, 1), loc='upper left')
        leg = ax.get_legend()

        if len(leg.legendHandles) == 2:
            leg.legendHandles[0].set_color('blue')
            leg.legendHandles[1].set_color('#b93c43')

        for lh in leg.legendHandles:
            lh.set_alpha(.4)

        plt.title(title)

        if save_file:
            self.save_plot(filename=filename)

        plt.show()
        plt.close("all")