Source code for eflow.model_analysis.outlier_analysis

from eflow.utils.sys_utils import *
from eflow._hidden.parent_objects import ModelAnalysis
from eflow.utils.pandas_utils import zcore_remove_outliers

import pandas as pd
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import numpy as np

__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"

class OutlierAnalysis(ModelAnalysis):
    """
        Analyzes the output of a model's ``decision_function`` to find and
        visualize outliers. Samples are ranked by the z-score of their
        decision value; tables (csv) and distribution plots are written under
        the analysis folder (``self.folder_path``, presumably provided by
        ModelAnalysis).
    """

    def __init__(self,
                 dataset_name,
                 model,
                 model_name,
                 feature_order,
                 df_features,
                 project_sub_dir="Outlier Analysis",
                 overwrite_full_path=None,
                 save_models=True,
                 notebook_mode=False):
        """
        Args:
            dataset_name:
                First component of the output directory path.

            model:
                Fitted model; must expose ``decision_function(X)``.
                A deep copy is stored, the caller's object is not kept.

            model_name:
                Name used in the output path, plot titles and file names.

            feature_order:
                Stored (deep copied) on the object; not used by the methods
                visible in this class.

            df_features:
                Stored (deep copied) on the object; not used by the methods
                visible in this class.

            project_sub_dir:
                Second component of the output directory path.

            overwrite_full_path:
                Forwarded unchanged to ``ModelAnalysis.__init__``.

            save_models:
                When True, attempt (best effort) to pickle the model into
                the analysis folder.

            notebook_mode:
                Stored on the object; not used by the methods visible in
                this class.
        """
        import warnings

        # Init parent object
        ModelAnalysis.__init__(self,
                               f'{dataset_name}/{project_sub_dir}/{model_name}',
                               overwrite_full_path)

        self.__feature_order = copy.deepcopy(feature_order)
        self.__model = copy.deepcopy(model)
        self.__model_name = copy.deepcopy(model_name)
        self.__df_features = copy.deepcopy(df_features)
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Determines if the perform was called
        self.__called_from_perform = False

        # Attempt to save machine learning model. The previous version
        # deleted `model` before pickling it, so the resulting NameError was
        # silently swallowed and nothing was ever written.
        if save_models:
            try:
                pickle_object_to_file(self.__model,
                                      self.folder_path,
                                      model_name)
            except Exception as error:
                # Saving is best effort: report the failure, keep going.
                warnings.warn(
                    f"Unable to save model '{model_name}': {error!r}")

        # The csv tables written by graph_decision_outliers live here.
        create_dir_structure(self.folder_path,
                             "_Extras")

    def model_decision_outliers(self):
        """
            Placeholder; currently performs no work and returns None.
        """
        return None

    @staticmethod
    def __zscore_series(values, positions):
        """
            Returns a pandas Series of population (ddof=0) z-scores of
            'values', indexed by 'positions' so that every score stays tied
            to the sample's position in the full decision array.

            A zero spread (or empty input) yields z-scores of 0 rather than
            NaN, so identical decision values are never flagged as outliers.
        """
        if values.size == 0:
            return pd.Series([], index=positions, dtype=float)

        spread = values.std(ddof=0)
        if spread == 0:
            return pd.Series(np.zeros(values.shape[0]), index=positions)

        return pd.Series((values - values.mean()) / spread, index=positions)

    def graph_decision_outliers(self,
                                X,
                                heavy_outlier_zscore=float("inf"),
                                medium_outlier_zscore=float("inf"),
                                save_file=True):
        """
            Finds outliers in the model's decision function in two passes
            and graphs the decision distribution with and without them.

            Pass one flags "heavy" outliers: samples whose z-score (over all
            samples) falls outside +/- heavy_outlier_zscore. Pass two
            recomputes z-scores over the remaining samples and flags
            "medium" outliers outside +/- medium_outlier_zscore.

        Args:
            X:
                Data handed to the model's ``decision_function``.

            heavy_outlier_zscore:
                Absolute z-score boundary of the first pass.
                The default (inf) flags nothing.

            medium_outlier_zscore:
                Absolute z-score boundary of the second pass.
                The default (inf) flags nothing.

            save_file:
                Whether the generated plots are saved. The csv tables of
                flagged samples are always written to "_Extras".
        """
        # Flatten so positions map 1:1 onto samples
        # (assumes one decision value per sample -- TODO confirm for
        # models returning 2D output).
        model_decisions = np.asarray(
            self.__model.decision_function(X)).ravel()
        all_positions = np.arange(model_decisions.shape[0])

        # ---- Pass one: heavy outliers over every sample ----
        hv_zscores = self.__zscore_series(model_decisions,
                                          all_positions)
        hv_inlier_mask = hv_zscores.between(heavy_outlier_zscore * -1,
                                            heavy_outlier_zscore)

        # Keep each flagged position paired with its *own* z-score.
        hv_outlier_zscores = hv_zscores[~hv_inlier_mask]
        hv_outlier_index_list = hv_outlier_zscores.index.tolist()

        tmp_df = hv_outlier_zscores.to_frame(name="Z-Scores")
        tmp_df.sort_values(by=['Z-Scores'],
                           inplace=True)
        tmp_df.to_csv(
            self.folder_path +
            f"_Extras/Heavy Outliers and Inlier with a abs zscore {heavy_outlier_zscore}.csv")
        del tmp_df

        # ---- Pass two: medium outliers over what is left ----
        # The series is indexed by the original positions; a fresh 0..k-1
        # index would make the later np.delete drop the wrong samples.
        kept_positions = all_positions[hv_inlier_mask.values]
        md_zscores = self.__zscore_series(model_decisions[kept_positions],
                                          kept_positions)
        md_inlier_mask = md_zscores.between(medium_outlier_zscore * -1,
                                            medium_outlier_zscore)

        md_outlier_zscores = md_zscores[~md_inlier_mask]
        md_outlier_index_list = md_outlier_zscores.index.tolist()

        tmp_df = md_outlier_zscores.to_frame(name="Z-Scores")
        tmp_df.sort_values(by=['Z-Scores'],
                           inplace=True)
        tmp_df.to_csv(
            self.folder_path +
            f"_Extras/Medium Outliers and Inlier with a abs zscore of {medium_outlier_zscore}.csv")
        del tmp_df, hv_zscores, md_zscores

        # ---- Graphs: every stage in both shading modes ----
        for shading_pos_neg in [True, False]:

            # Distinguishes the two shading modes in titles/file names so
            # the second plot does not overwrite the first one.
            boundary_suffix = ""
            if not shading_pos_neg:
                boundary_suffix = " show zscore boundaries"

            title_and_filename = \
                f"{self.__model_name} decision function" + boundary_suffix
            self.__helper_decision_outlier_graph(
                model_decisions,
                title=title_and_filename,
                filename=title_and_filename,
                shading_pos_neg=shading_pos_neg,
                save_file=save_file)

            title_and_filename = \
                f"{self.__model_name} decision function with heavy outlier removed (Zscore of {heavy_outlier_zscore})" \
                + boundary_suffix
            self.__helper_decision_outlier_graph(
                model_decisions,
                hv_outlier_index_list=hv_outlier_index_list,
                title=title_and_filename,
                filename=title_and_filename,
                shading_pos_neg=shading_pos_neg,
                save_file=save_file)

            title_and_filename = \
                f"{self.__model_name} decision function with medium (Zscore of {medium_outlier_zscore}) and heavy outlier removed (Zscore of {heavy_outlier_zscore})" \
                + boundary_suffix
            self.__helper_decision_outlier_graph(
                model_decisions,
                hv_outlier_index_list=hv_outlier_index_list,
                md_outlier_index_list=md_outlier_index_list,
                title=title_and_filename,
                filename=title_and_filename,
                shading_pos_neg=shading_pos_neg,
                save_file=save_file)

    def __helper_decision_outlier_graph(self,
                                        model_decisions,
                                        hv_outlier_index_list=None,
                                        md_outlier_index_list=None,
                                        title="default",
                                        filename="default",
                                        shading_pos_neg=True,
                                        save_file=True):
        """
            Plots the distribution of the decision values left after
            removing the given outlier positions and shades the regions
            of the graph.

        Args:
            model_decisions:
                1D numpy array of decision values.

            hv_outlier_index_list:
                Positions (into model_decisions) of heavy outliers to drop.

            md_outlier_index_list:
                Positions (into model_decisions) of medium outliers to drop.

            title:
                Title of the plot.

            filename:
                File name handed to ``self.save_plot``.

            shading_pos_neg:
                True: shade left/right of a decision value of 0.
                False: shade between the smallest and largest remaining
                decision value, and mark both boundaries.

            save_file:
                Whether to save the plot with ``self.save_plot``.
        """
        import warnings

        # None defaults avoid sharing one mutable list across calls.
        removed_positions = np.asarray(
            list(md_outlier_index_list or []) +
            list(hv_outlier_index_list or []),
            dtype=int)
        model_decisions = np.delete(model_decisions,
                                    removed_positions)

        if model_decisions.size == 0:
            # min()/max() and the plot itself are undefined without data.
            warnings.warn(
                f"No decision values left to graph for '{title}'; "
                f"skipping plot.")
            return

        outlier_val = model_decisions.min()
        inlier_val = model_decisions.max()

        plt.figure(figsize=(10, 10))

        # NOTE(review): seaborn deprecated distplot in 0.11; it is kept
        # here so the generated graphs do not change.
        ax = sns.distplot(model_decisions,
                          kde=True,
                          hist_kws={'edgecolor': 'black',
                                    "rwidth": .9, },
                          bins=16)

        # Leave a little head room under the top of the y axis.
        ymax = ax.get_ylim()[1] - (ax.get_ylim()[1] * .03)

        if shading_pos_neg:
            # The boundary is the sign change of the decision value.
            outlier_val = 0

            ax.annotate('Outlier boundaries',
                        xy=(outlier_val, 0),
                        xytext=(outlier_val, ymax),
                        arrowprops=dict(facecolor='red',
                                        alpha=.5))

            # Right of the boundary (blue)
            rect = patches.Rectangle((outlier_val, 0),
                                     ax.get_xlim()[1],
                                     ymax - 0,
                                     facecolor='blue',
                                     alpha=0.07, )
            ax.add_patch(rect)

            # Left of the boundary (red)
            rect = patches.Rectangle((ax.get_xlim()[0], 0),
                                     outlier_val - ax.get_xlim()[0],
                                     ymax - 0,
                                     facecolor='#b93c43',
                                     alpha=0.1)
            ax.add_patch(rect)
        else:
            ax.annotate('Outlier boundaries',
                        xy=(outlier_val, 0),
                        xytext=(outlier_val, ymax),
                        arrowprops=dict(facecolor='red',
                                        alpha=.5))

            ax.annotate('Inlier boundaries',
                        xy=(inlier_val, 0),
                        xytext=(inlier_val, ymax),
                        arrowprops=dict(facecolor='red',
                                        alpha=.5))

            # Between the smallest and largest remaining value (blue)
            rect = patches.Rectangle((outlier_val, 0),
                                     inlier_val - outlier_val,
                                     ymax - 0,
                                     facecolor='blue',
                                     alpha=0.07, )
            ax.add_patch(rect)

            # Left of the smallest remaining value (red)
            rect = patches.Rectangle((ax.get_xlim()[0], 0),
                                     outlier_val - ax.get_xlim()[0],
                                     ymax - 0,
                                     facecolor='#b93c43',
                                     alpha=0.1)
            ax.add_patch(rect)

            # Right of the largest remaining value (red); the negative
            # width draws the box leftwards from the axis limit.
            rect = patches.Rectangle((ax.get_xlim()[1], 0),
                                     inlier_val - ax.get_xlim()[1],
                                     ymax - 0,
                                     facecolor='#b93c43',
                                     alpha=0.1)
            ax.add_patch(rect)

        plt.legend(['Inliers', 'Abs Z-Score higher than '],
                   bbox_to_anchor=(1.02, 1),
                   loc='upper left')

        # matplotlib 3.7 renamed `legendHandles` to `legend_handles` and
        # later removed the old name; support both.
        leg = ax.get_legend()
        legend_handles = getattr(leg, "legend_handles", None)
        if legend_handles is None:
            legend_handles = leg.legendHandles

        if len(legend_handles) == 2:
            legend_handles[0].set_color('blue')
            legend_handles[1].set_color('#b93c43')

        for lh in legend_handles:
            lh.set_alpha(.4)

        plt.title(title)

        if save_file:
            self.save_plot(filename=filename)

        plt.show()
        plt.close("all")