Source code for eflow.model_analysis.classification_analysis

from eflow.utils.sys_utils import *
from eflow.utils.pandas_utils import df_to_image
from eflow.utils.image_processing_utils import create_plt_png
from eflow._hidden.parent_objects import ModelAnalysis
from eflow._hidden.custom_exceptions import RequiresPredictionMethods, ProbasNotPossible, \
    UnsatisfiedRequirments
from eflow.data_analysis import FeatureAnalysis
from eflow.data_pipeline_segments import DataEncoder

from eflow._hidden.constants import GRAPH_DEFAULTS

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
import scikitplot as skplt
import numpy as np
import os
import warnings
import copy
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"

class ClassificationAnalysis(ModelAnalysis):
    """
    Analyzes a classification model's results based on the prediction
    function(s) passed to it. Creates graphs and tables to be saved in the
    directory structure.
    """

    def __init__(self, dataset_name, model, model_name, feature_order,
                 target_feature, pred_funcs_dict, df_features, sample_data,
                 project_sub_dir="Classification Analysis",
                 overwrite_full_path=None, target_classes=None,
                 save_model=True, notebook_mode=False):
        """
        Args:
            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graphs will be nested.

            model:
                A fitted supervised machine learning model.

            model_name:
                The name of the model in string form.

            feature_order: collection object
                Feature names in the proper order to re-create the pandas
                dataframe.

            target_feature:
                The name of the feature the model predicts.

            pred_funcs_dict:
                A dict mapping the name of each prediction method to its
                function definition. (Can handle either a return of
                probabilities or a single value.)
                Init Example:
                    pred_funcs = dict()
                    pred_funcs["Predictions"] = model.predict
                    pred_funcs["Probabilities"] = model.probas

            df_features:
                DataFrameTypes object; organizes feature types into groups.

            sample_data:
                Data passed to the prediction functions to determine each
                function's classification prediction 'type'. Can be a matrix
                or a vector.

            project_sub_dir:
                Creates a parent or "project" folder in which all
                sub-directories will be nested.

            overwrite_full_path:
                Overwrites the path to the parent folder.

            target_classes:
                Specified list/np.array of target classes the model predicts.
                If set to None then it will attempt to pull from sklearn's
                default attribute '.classes_'.

            save_model: bool
                Pickle the model into the directory structure.

            notebook_mode: bool
                Use IPython display for tables instead of print.
        """
        # Init parent object
        ModelAnalysis.__init__(self,
                               f'{dataset_name}/{project_sub_dir}/Target Feature: {target_feature}/{model_name}',
                               overwrite_full_path)

        # Init objects without pass by reference

        # Remove target feature from feature order when trying to recreate the dataframe
        self.__target_feature = copy.deepcopy(target_feature)
        self.__feature_order = copy.deepcopy(feature_order)

        if self.__target_feature in self.__feature_order:
            self.__feature_order.remove(self.__target_feature)

        del feature_order
        del target_feature

        self.__model = copy.deepcopy(model)
        self.__model_name = copy.deepcopy(model_name)
        self.__target_values = copy.deepcopy(target_classes)
        self.__df_features = copy.deepcopy(df_features)
        self.__pred_funcs_dict = copy.deepcopy(pred_funcs_dict)
        self.__pred_funcs_types = dict()
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Determines whether perform_analysis was called
        self.__called_from_perform = False

        # Init on sklearn's default target classes attribute
        if not self.__target_values:
            self.__target_values = copy.deepcopy(model.classes_)

        # ---
        if len(self.__target_values) != 2:
            self.__binary_classifcation = False
        else:
            self.__binary_classifcation = True

        # Attempt to save machine learning model
        try:
            if save_model:
                pickle_object_to_file(self.__model,
                                      self.folder_path,
                                      f'{self.__model_name}')
        except Exception:
            pass

        # ---
        create_dir_structure(self.folder_path, "_Extras")

        # Save predicted classes
        write_object_text_to_file(self.__target_values,
                                  self.folder_path + "_Extras",
                                  "_Classes")

        # Save feature order
        write_object_text_to_file(self.__feature_order,
                                  self.folder_path + "_Extras",
                                  "_Feature_Order")

        # Save features and/or df_features object
        df_features.create_json_file_representation(self.folder_path + "_Extras",
                                                    "df_features.json")

        self.__sample_data = None

        # Extract sample data
        if len(sample_data.shape) == 2:
            self.__sample_data = np.reshape(sample_data[0],
                                            (-1, sample_data.shape[1]))
        elif len(sample_data.shape) == 1:
            self.__sample_data = [sample_data]
        else:
            raise UnsatisfiedRequirments("This program can only handle 1D and 2D matrices.")

        # Find the 'type' of each prediction: Probabilities or Predictions
        if self.__pred_funcs_dict:
            for pred_name, pred_func in self.__pred_funcs_dict.items():
                try:
                    model_output = pred_func(self.__sample_data)[0]
                except Exception:
                    model_output = pred_func(np.array(self.__sample_data))[0]

                # Confidence / Probability (continuous output)
                if isinstance(model_output, list) or isinstance(model_output, np.ndarray):
                    self.__pred_funcs_types[pred_name] = "Probabilities"
                # Classification (discrete output)
                else:
                    self.__pred_funcs_types[pred_name] = "Predictions"
        else:
            raise RequiresPredictionMethods("This object requires you to pass the prediction methods in a dict with the name of the method as the key.")

        try:
            feature_importances = model.feature_importances_
            self.graph_model_importances(copy.deepcopy(self.__feature_order),
                                         feature_importances,
                                         display_visuals=False)
        except AttributeError:
            pass

    def get_predictions_names(self):
        return self.__pred_funcs_dict.keys()

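    # Illustrative sketch (not part of the library): a typical way to construct
    # this object, assuming a fitted sklearn-style classifier named `model`, a
    # prepared `df_features` object, and a feature dataframe `df` with a
    # "Survived" target column. All names here are hypothetical.
    #
    #   pred_funcs = dict()
    #   pred_funcs["Predictions"] = model.predict
    #   pred_funcs["Probabilities"] = model.predict_proba
    #
    #   analysis = ClassificationAnalysis(dataset_name="Titanic Data",
    #                                     model=model,
    #                                     model_name="Random Forest",
    #                                     feature_order=list(df.columns),
    #                                     target_feature="Survived",
    #                                     pred_funcs_dict=pred_funcs,
    #                                     df_features=df_features,
    #                                     sample_data=X_train[0])
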
    def perform_analysis(self, X, y, dataset_name, thresholds_matrix=None,
                         classification_error_analysis=False,
                         classification_correct_analysis=False,
                         ignore_metrics=[], custom_metrics_dict=dict(),
                         average_scoring=["micro", "macro", "weighted"],
                         display_visuals=True):
        """
        Runs all available analysis functions on the model's predicted data.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graphs will be nested.

            thresholds_matrix:
                List of lists/matrix of thresholds. If the model outputs a
                probability list/numpy array then we apply thresholds to the
                output of the model. For classification only; will not affect
                the direct output of the probabilities.

            classification_error_analysis: bool
                Perform feature analysis on data that was incorrectly predicted.

            classification_correct_analysis: bool
                Perform feature analysis on data that was correctly predicted.

            ignore_metrics:
                Specify the default metrics to not apply to the classification
                data analysis.
                    * Precision
                    * MCC
                    * Recall
                    * F1-Score
                    * Accuracy

            custom_metrics_dict:
                Pass the name of metric(s) with the function definition(s) in
                a dictionary.

            average_scoring:
                Determines the type of averaging performed on the data.

            display_visuals:
                Controls visual display of the error data analysis if it is
                able to run.

        Returns:
            Performs all classification functionality with the provided
            feature data and target data:
                * classification_metrics
                * plot_confusion_matrix
                * plot_precision_recall_curve
        """
        try:
            # Convert the thresholds object to a list of lists
            if isinstance(thresholds_matrix, np.ndarray):
                thresholds_matrix = thresholds_matrix.tolist()

            if not thresholds_matrix:
                thresholds_matrix = [[]]

            if isinstance(thresholds_matrix, list) and not isinstance(thresholds_matrix[0], list):
                thresholds_matrix = [thresholds_matrix]

            none_required = False
            for vector in thresholds_matrix:
                if not vector or len(vector) == 0:
                    none_required = True

            if not none_required:
                thresholds_matrix.append(None)

            self.__called_from_perform = True

            self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

            print("\n\n" + "---" * 10 + f'{dataset_name}' + "---" * 10)

            first_iteration = True
            for pred_name, pred_type in self.__pred_funcs_types.items():

                # Nicer formatting
                if not first_iteration:
                    print("\n\n\n")
                first_iteration = False

                for thresholds in thresholds_matrix:
                    print(f"Now running classification on {pred_name}", end='')

                    if pred_type == "Predictions":
                        print()
                        thresholds = None
                    else:
                        if thresholds:
                            print(f" on thresholds:")
                            for i, target_val in enumerate(self.__target_values):
                                try:
                                    print(f"\tTarget Value:{target_val}: Prediction weight: {thresholds[i]}")
                                except IndexError:
                                    raise IndexError("Thresholds must be the same length as the target values!")
                        else:
                            print(" on no thresholds.")

                    if display_visuals:
                        try:
                            print(f"\nShape of the data is {X.shape}")
                        except AttributeError:
                            pass

                    self.classification_metrics(X, y, pred_name=pred_name,
                                                dataset_name=dataset_name,
                                                thresholds=thresholds,
                                                ignore_metrics=ignore_metrics,
                                                custom_metrics_dict=custom_metrics_dict,
                                                average_scoring=average_scoring,
                                                display_visuals=display_visuals)

                    self.plot_confusion_matrix(X, y, pred_name=pred_name,
                                               dataset_name=dataset_name,
                                               thresholds=thresholds,
                                               normalize=True,
                                               display_visuals=display_visuals)

                    self.plot_confusion_matrix(X, y, pred_name=pred_name,
                                               dataset_name=dataset_name,
                                               thresholds=thresholds,
                                               normalize=False,
                                               display_visuals=display_visuals)

                    if pred_type == "Probabilities":
                        self.plot_precision_recall_curve(X, y, pred_name=pred_name,
                                                         dataset_name=dataset_name,
                                                         thresholds=thresholds,
                                                         display_visuals=display_visuals)

                        self.plot_roc_curve(X, y, pred_name=pred_name,
                                            dataset_name=dataset_name,
                                            thresholds=thresholds,
                                            display_visuals=display_visuals)

                        if self.__binary_classifcation:
                            self.plot_lift_curve(X, y, pred_name=pred_name,
                                                 dataset_name=dataset_name,
                                                 thresholds=thresholds,
                                                 display_visuals=display_visuals)

                            self.plot_ks_statistic(X, y, pred_name=pred_name,
                                                   dataset_name=dataset_name,
                                                   thresholds=thresholds,
                                                   display_visuals=display_visuals)

                            self.plot_cumulative_gain(X, y, pred_name=pred_name,
                                                      dataset_name=dataset_name,
                                                      thresholds=thresholds,
                                                      display_visuals=display_visuals)

                    if classification_error_analysis:
                        self.classification_error_analysis(X, y, pred_name, dataset_name,
                                                           thresholds=thresholds,
                                                           display_visuals=False,
                                                           save_file=True,
                                                           aggerate_target=True,
                                                           display_print=False,
                                                           suppress_runtime_errors=True,
                                                           aggregate_target_feature=True,
                                                           selected_features=None,
                                                           extra_tables=True,
                                                           statistical_analysis_on_aggregates=True)

                    if classification_correct_analysis:
                        self.classification_correct_analysis(X, y, pred_name, dataset_name,
                                                             thresholds=thresholds,
                                                             display_visuals=False,
                                                             save_file=True,
                                                             aggerate_target=True,
                                                             display_print=False,
                                                             suppress_runtime_errors=True,
                                                             aggregate_target_feature=True,
                                                             selected_features=None,
                                                             extra_tables=True,
                                                             statistical_analysis_on_aggregates=True)

                    print("-" * (len(dataset_name) + 60) + "\n")

                    if pred_type == "Predictions":
                        break
        finally:
            self.__called_from_perform = False

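    # Illustrative sketch (not part of the library): running the full analysis
    # on a hypothetical held-out set `X_test`/`y_test` with the `analysis`
    # object from the earlier sketch. Each row of `thresholds_matrix` holds one
    # weight per target class; a no-threshold run is appended automatically.
    #
    #   analysis.perform_analysis(X_test, y_test,
    #                             dataset_name="Test Data",
    #                             thresholds_matrix=[[0.45, 0.55],
    #                                                [0.30, 0.70]],
    #                             classification_error_analysis=True)
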
    def plot_ks_statistic(self, X, y, pred_name, dataset_name, thresholds=None,
                          display_visuals=True, save_file=True, title=None,
                          ax=None, figsize=GRAPH_DEFAULTS.FIGSIZE,
                          title_fontsize='large', text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        Generates the KS Statistic plot from labels and scores/probabilities.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'KS Statistic on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        skplt.metrics.plot_ks_statistic(y,
                                        self.__get_model_probas(pred_name, X),
                                        title=title, ax=ax, figsize=figsize,
                                        title_fontsize=title_fontsize,
                                        text_fontsize=text_fontsize)

        legend = plt.legend(frameon=1)
        frame = legend.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('white')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def plot_roc_curve(self, X, y, pred_name, dataset_name, thresholds=None,
                       display_visuals=True, save_file=True, title=None,
                       ax=None, figsize=GRAPH_DEFAULTS.FIGSIZE,
                       title_fontsize='large', text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        Creates ROC curves from labels and predicted probabilities.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Roc Curve on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        skplt.metrics.plot_roc(y,
                               self.__get_model_probas(pred_name, X),
                               title=title, ax=ax, figsize=figsize,
                               title_fontsize=title_fontsize,
                               text_fontsize=text_fontsize)

        legend = plt.legend(frameon=1)
        frame = legend.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('white')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def plot_cumulative_gain(self, X, y, pred_name, dataset_name, thresholds=None,
                             display_visuals=True, save_file=True, title=None,
                             ax=None, figsize=GRAPH_DEFAULTS.FIGSIZE,
                             title_fontsize='large', text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        Generates the Cumulative Gains plot from labels and scores/probabilities.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Cumulative Gain on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        skplt.metrics.plot_cumulative_gain(y,
                                           self.__get_model_probas(pred_name, X),
                                           title=title, ax=ax, figsize=figsize,
                                           title_fontsize=title_fontsize,
                                           text_fontsize=text_fontsize)

        legend = plt.legend(frameon=1)
        frame = legend.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('white')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def plot_precision_recall_curve(self, X, y, pred_name, dataset_name,
                                    thresholds=None, display_visuals=True,
                                    save_file=True, title=None, plot_micro=True,
                                    classes_to_plot=None, ax=None,
                                    figsize=GRAPH_DEFAULTS.FIGSIZE,
                                    cmap='nipy_spectral',
                                    title_fontsize='large',
                                    text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        Plots the precision-recall curve based on the model's predicted
        probabilities.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Precision Recall on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        skplt.metrics.plot_precision_recall(y,
                                            self.__get_model_probas(pred_name, X),
                                            title=title, plot_micro=plot_micro,
                                            classes_to_plot=classes_to_plot,
                                            ax=ax, figsize=figsize, cmap=cmap,
                                            title_fontsize=title_fontsize,
                                            text_fontsize=text_fontsize)

        legend = plt.legend(frameon=1)
        frame = legend.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('white')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def plot_lift_curve(self, X, y, pred_name, dataset_name, thresholds=None,
                        display_visuals=True, save_file=True, title=None,
                        ax=None, figsize=GRAPH_DEFAULTS.FIGSIZE,
                        title_fontsize='large', text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        The lift curve is used to determine the effectiveness of a binary
        classifier. A detailed explanation can be found at
        http://tinyurl.com/csegj9. The implementation here works only for
        binary classification.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Lift Curve on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        skplt.metrics.plot_lift_curve(y,
                                      self.__get_model_probas(pred_name, X),
                                      title=title, ax=ax, figsize=figsize,
                                      title_fontsize=title_fontsize,
                                      text_fontsize=text_fontsize)

        legend = plt.legend(frameon=1)
        frame = legend.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('white')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def plot_confusion_matrix(self, X, y, pred_name, dataset_name, thresholds=None,
                              display_visuals=True, save_file=True, title=None,
                              normalize=False, hide_zeros=False, hide_counts=False,
                              x_tick_rotation=0, ax=None,
                              figsize=GRAPH_DEFAULTS.FIGSIZE, cmap='Blues',
                              title_fontsize='large', text_fontsize='medium'):
        """
        From scikit-plot documentation
        Link: http://tinyurl.com/y3ym5pyc

        Creates a confusion matrix plot based on the model's predictions.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Visualize graph if needed.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Confusion Matrix: {dataset_name} on {self.__model_name} Normalized: {normalize}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not title:
            title = filename

        warnings.filterwarnings('ignore')
        ax = skplt.metrics.plot_confusion_matrix(
            self.__get_model_prediction(pred_name, X, thresholds),
            y,
            title=title, normalize=normalize, hide_zeros=hide_zeros,
            hide_counts=hide_counts, x_tick_rotation=x_tick_rotation,
            ax=ax, figsize=figsize, cmap=cmap,
            title_fontsize=title_fontsize, text_fontsize=text_fontsize)
        warnings.filterwarnings('default')

        if save_file:
            self.save_plot(filename=filename, sub_dir=sub_dir)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

        if display_visuals:
            plt.show()

        plt.close()

    def classification_metrics(self, X, y, pred_name, dataset_name, thresholds=None,
                               display_visuals=True, save_file=True, title="",
                               custom_metrics_dict=dict(), ignore_metrics=[],
                               average_scoring=["micro", "macro", "weighted"]):
        """
        Creates a dataframe based on the prediction metrics of the feature
        matrix and target vector.

        Args:
            X:
                Feature matrix.

            y: list or np.array
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Display tables.

            save_file:
                Determines whether or not to save the generated document.

            title:
                Adds to the column 'Metric Score'.

            custom_metrics_dict:
                Pass the name of metric(s) and the function definition(s) in
                a dictionary.

            ignore_metrics:
                Specify the default metrics to not apply to the classification
                data analysis.
                    * Precision
                    * MCC
                    * Recall
                    * F1-Score
                    * Accuracy

            average_scoring:
                Determines the type of averaging performed on the data.
                    * micro
                    * macro
                    * weighted

        Returns:
            Returns a dataframe object of the metric values.
        """
        filename = f'Metric Evaluation on {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        if not isinstance(average_scoring, list):
            average_scoring = [average_scoring]

        # Default metric names and their functions
        metric_functions = dict()
        metric_functions["Precision"] = precision_score
        metric_functions["MCC"] = matthews_corrcoef
        metric_functions["Recall"] = recall_score
        metric_functions["F1-Score"] = f1_score
        metric_functions["Accuracy"] = accuracy_score

        warnings.filterwarnings('ignore')

        # Ignore default metrics if needed
        for remove_metric in ignore_metrics:
            if remove_metric in metric_functions:
                del metric_functions[remove_metric]

        # Add in custom metrics
        if len(custom_metrics_dict.keys()):
            metric_functions.update(custom_metrics_dict)

        # Evaluate model on metrics
        evaluation_report = dict()
        for metric_name in metric_functions.keys():
            for average_score in average_scoring:
                model_predictions = self.__get_model_prediction(pred_name, X, thresholds)
                try:
                    evaluation_report[f'{metric_name}({average_score})'] = \
                        metric_functions[metric_name](y_true=y,
                                                      y_pred=model_predictions,
                                                      average=average_score)
                except TypeError:
                    if metric_name not in evaluation_report.keys():
                        evaluation_report[metric_name] = metric_functions[metric_name](y, model_predictions)
                    continue

        warnings.filterwarnings('default')

        if title and len(title) > 0:
            index_name = f"Metric Scores ({title})"
        else:
            index_name = "Metric Scores"

        # ---
        evaluation_report = pd.DataFrame({index_name:
                                              [f'{metric_score:.4f}'
                                               for metric_score in evaluation_report.values()]},
                                         index=list(evaluation_report.keys()))

        if display_visuals:
            if self.__notebook_mode:
                display(evaluation_report)
            else:
                print(evaluation_report)

        if save_file:
            df_to_image(evaluation_report, self.folder_path, sub_dir,
                        convert_to_filename(filename),
                        col_width=20, show_index=True, format_float_pos=4)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

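    # Illustrative sketch (not part of the library): supplying a custom metric.
    # Any callable with the sklearn-style signature `metric(y_true, y_pred)`
    # works; metrics that do not accept an `average` keyword fall into the
    # TypeError branch above and are reported once without an averaging suffix.
    # `analysis`, `X_test`, and `y_test` are hypothetical names.
    #
    #   from sklearn.metrics import balanced_accuracy_score
    #
    #   analysis.classification_metrics(X_test, y_test,
    #                                   pred_name="Predictions",
    #                                   dataset_name="Test Data",
    #                                   ignore_metrics=["MCC"],
    #                                   custom_metrics_dict={
    #                                       "Balanced Accuracy": balanced_accuracy_score})
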
    def classification_correct_analysis(self, X, y, pred_name, dataset_name,
                                        thresholds=None, display_visuals=True,
                                        save_file=True, aggerate_target=False,
                                        display_print=True,
                                        suppress_runtime_errors=True,
                                        aggregate_target_feature=True,
                                        selected_features=None, extra_tables=True,
                                        statistical_analysis_on_aggregates=True):
        """
        Compares the actual target value to the predicted value and performs
        feature analysis on the correctly predicted data.

        Args:
            X: np.matrix or lists of lists
                Feature matrix.

            y: list or np.array
                Target data vector.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name: str
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals: bool
                Boolean value of whether or not to display visualizations.

            save_file: bool
                Boolean value of whether or not to save the file.

            aggerate_target: bool
                Additionally aggregate the correctly predicted data by each
                target value.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            suppress_runtime_errors: bool
                If set to True, suppress any runtime errors raised while
                generating graphs so the program can keep running.

            aggregate_target_feature: bool
                Aggregate the data of the target feature if the data is
                non-continuous data.
                Note:
                    In the future I will have this also working with
                    continuous data.

            selected_features: collection object of features
                Will only focus on these selected features and will ignore
                the other given features.

            extra_tables: bool
                When handling two types of features, if set to True this will
                generate any extra tables that might be helpful.
                Note:
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

            statistical_analysis_on_aggregates: bool
                If set to True then the function
                'statistical_analysis_on_aggregates' will run; which
                aggregates the data of the target feature either by discrete
                values or by binning/labeling continuous data.
        """
        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        model_predictions = self.__get_model_prediction(pred_name, X, thresholds=thresholds)

        if sum(model_predictions != y) == len(y):
            print("Your model predicted nothing correctly for this dataset! No correct analysis can be done!")
            print("Also sorry for your model...zero correct? Dam...")
        else:
            print("\n\n" + "*" * 10 + "Generating graphs for when the model predicted correctly" + "*" * 10 + "\n")

            # Generate correctly predicted dataframe
            correct_df = pd.DataFrame.from_records(X[model_predictions == y])
            correct_df.columns = self.__feature_order
            correct_df[self.__target_feature] = y[model_predictions == y]

            # Directory path
            create_dir_structure(self.folder_path,
                                 sub_dir + "/Correctly Predicted Data/All Correct Data")
            output_path = f"{self.folder_path}/{sub_dir}/Correctly Predicted Data"

            tmp_df_features = copy.deepcopy(self.__df_features)

            data_encoder = DataEncoder(create_file=False)
            data_encoder.revert_dummies(correct_df, tmp_df_features,
                                        qualitative_features=list(self.__df_features.get_dummy_encoded_features().keys()))
            data_encoder.decode_data(correct_df, tmp_df_features,
                                     apply_value_representation=False)
            data_encoder.apply_value_representation(correct_df, tmp_df_features)
            del data_encoder

            # Create feature analysis
            feature_analysis = FeatureAnalysis(tmp_df_features,
                                               overwrite_full_path=output_path + "/All Correct Data")
            feature_analysis.perform_analysis(correct_df,
                                              dataset_name=dataset_name,
                                              target_features=[self.__target_feature],
                                              save_file=save_file,
                                              selected_features=selected_features,
                                              suppress_runtime_errors=suppress_runtime_errors,
                                              display_print=display_print,
                                              display_visuals=display_visuals,
                                              dataframe_snapshot=False,
                                              aggregate_target_feature=aggregate_target_feature,
                                              extra_tables=extra_tables,
                                              statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)

            # Aggregate target by predicted and actual (equal for correct data)
            if aggerate_target:
                targets = set(y)
                for pred_target in targets:
                    create_dir_structure(output_path,
                                         f"/Actual and Predicted:{pred_target}")

                    # Create predicted vs actual dataframe
                    tmp_df = copy.deepcopy(correct_df[correct_df[self.__target_feature] == pred_target])

                    if tmp_df.shape[0]:
                        # Create feature analysis directory structure with given graphics
                        feature_analysis = FeatureAnalysis(tmp_df_features,
                                                           overwrite_full_path=f"{output_path}/Actual and Predicted:{pred_target}")
                        feature_analysis.perform_analysis(tmp_df,
                                                          dataset_name=dataset_name,
                                                          target_features=[self.__target_feature],
                                                          save_file=save_file,
                                                          selected_features=selected_features,
                                                          suppress_runtime_errors=suppress_runtime_errors,
                                                          display_print=display_print,
                                                          display_visuals=display_visuals,
                                                          dataframe_snapshot=False,
                                                          extra_tables=extra_tables,
                                                          statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)

    def classification_error_analysis(self, X, y, pred_name, dataset_name,
                                      thresholds=None, display_visuals=True,
                                      save_file=True, aggerate_target=False,
                                      display_print=True,
                                      suppress_runtime_errors=True,
                                      aggregate_target_feature=True,
                                      selected_features=None, extra_tables=True,
                                      statistical_analysis_on_aggregates=True):
        """
        Compares the actual target value to the predicted value and performs
        feature analysis on the incorrectly predicted data.

        Args:
            X: np.matrix or lists of lists
                Feature matrix.

            y: list or np.array
                Target data vector.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name: str
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals: bool
                Boolean value of whether or not to display visualizations.

            save_file: bool
                Boolean value of whether or not to save the file.

            aggerate_target: bool
                Additionally aggregate the incorrectly predicted data by each
                predicted/actual target pair.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            suppress_runtime_errors: bool
                If set to True, suppress any runtime errors raised while
                generating graphs so the program can keep running.

            aggregate_target_feature: bool
                Aggregate the data of the target feature if the data is
                non-continuous data.
                Note:
                    In the future I will have this also working with
                    continuous data.

            selected_features: collection object of features
                Will only focus on these selected features and will ignore
                the other given features.

            extra_tables: bool
                When handling two types of features, if set to True this will
                generate any extra tables that might be helpful.
                Note:
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

            statistical_analysis_on_aggregates: bool
                If set to True then the function
                'statistical_analysis_on_aggregates' will run; which
                aggregates the data of the target feature either by discrete
                values or by binning/labeling continuous data.
        """
        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        model_predictions = self.__get_model_prediction(pred_name, X, thresholds=thresholds)

        if sum(model_predictions == y) == len(y):
            print("Your model predicted everything correctly for this dataset! No error analysis needed!")
        else:
            print("\n\n" + "*" * 10 + "Generating graphs for when the model predicted incorrectly" + "*" * 10 + "\n")

            # Generate error dataframe
            error_df = pd.DataFrame.from_records(X[model_predictions != y])
            error_df.columns = self.__feature_order
            error_df[self.__target_feature] = y[model_predictions != y]
            error_df.reset_index(drop=True, inplace=True)

            # Directory path
            create_dir_structure(self.folder_path,
                                 sub_dir + "/Incorrectly Predicted Data/All Incorrect Data")
            output_path = f"{self.folder_path}/{sub_dir}/Incorrectly Predicted Data"

            tmp_df_features = copy.deepcopy(self.__df_features)

            data_encoder = DataEncoder(create_file=False)
            data_encoder.revert_dummies(error_df, tmp_df_features,
                                        qualitative_features=list(self.__df_features.get_dummy_encoded_features().keys()))
            data_encoder.decode_data(error_df, tmp_df_features,
                                     apply_value_representation=False)
            data_encoder.apply_value_representation(error_df, tmp_df_features)
            del data_encoder

            # Create feature analysis
            feature_analysis = FeatureAnalysis(tmp_df_features,
                                               overwrite_full_path=output_path + "/All Incorrect Data")
            feature_analysis.perform_analysis(error_df,
                                              dataset_name=dataset_name,
                                              target_features=[self.__target_feature],
                                              save_file=save_file,
                                              selected_features=selected_features,
                                              suppress_runtime_errors=suppress_runtime_errors,
                                              display_print=display_print,
                                              display_visuals=display_visuals,
                                              dataframe_snapshot=False,
                                              aggregate_target_feature=aggregate_target_feature,
                                              extra_tables=extra_tables,
                                              statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)

            # Aggregate target by predicted and actual
            if aggerate_target:
                targets = set(y)
                prediction_feature = self.__target_feature + "_MODEL_PREDICTIONS_"
                error_df[prediction_feature] = model_predictions[model_predictions != y]

                for actual_target in targets:
                    for pred_target in targets:
                        if pred_target != actual_target:
                            create_dir_structure(output_path,
                                                 f"/Predicted:{pred_target} Actual: {actual_target}")

                            # Create predicted vs actual dataframe
                            tmp_df = copy.deepcopy(error_df[(error_df[self.__target_feature] == actual_target) &
                                                            (error_df[prediction_feature] == pred_target)])
                            tmp_df.drop(columns=[prediction_feature], inplace=True)

                            if tmp_df.shape[0]:
                                # Create feature analysis directory structure with given graphics
                                feature_analysis = FeatureAnalysis(tmp_df_features,
                                                                   overwrite_full_path=f"{output_path}/Predicted:{pred_target} Actual: {actual_target}")
                                feature_analysis.perform_analysis(tmp_df,
                                                                  dataset_name=dataset_name,
                                                                  target_features=[self.__target_feature],
                                                                  save_file=save_file,
                                                                  selected_features=selected_features,
                                                                  suppress_runtime_errors=suppress_runtime_errors,
                                                                  display_print=display_print,
                                                                  display_visuals=display_visuals,
                                                                  dataframe_snapshot=False,
                                                                  extra_tables=extra_tables,
                                                                  statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)

    def classification_report(self, X, y, pred_name, dataset_name, thresholds=None,
                              display_visuals=True, save_file=True):
        """
        Creates a report of all targets' metric evaluations based on the
        model's prediction output, using sklearn's classification report.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

            display_visuals:
                Display tables.

            save_file:
                Boolean value of whether or not to save the file.
        """
        filename = f'Classification Report {dataset_name} on {self.__model_name}'

        sub_dir = self.__create_sub_dir_with_thresholds(pred_name, dataset_name, thresholds)

        # Create dataframe report
        report_df = pd.DataFrame(classification_report(y,
                                                       self.__get_model_prediction(pred_name, X, thresholds),
                                                       output_dict=True))

        # ---
        if display_visuals:
            if self.__notebook_mode:
                display(report_df)
            else:
                print(report_df)

        if save_file:
            # Output dataframe as png
            df_to_image(report_df, self.folder_path, sub_dir, filename,
                        col_width=20, show_index=True, format_float_pos=4)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X, dataset_name + "/_Extras")

    def graph_model_importances(self, feature_order, feature_importances,
                                display_visuals=True):
        """
        Graph the given model's feature importances.

        Args:
            feature_order: list
                Feature names in the proper order to re-create the pandas
                dataframe.

            feature_importances: list
                List of floats that represent each corresponding feature's
                importance.

            display_visuals: bool
                Visualize graph if needed.
        """
        feature_importances, feature_order = zip(*sorted(zip(feature_importances,
                                                             feature_order),
                                                         reverse=True))
        feature_order = list(feature_order)

        plt.figure(figsize=(20, 10))

        palette = "PuBu"

        # Color ranking
        rank_list = np.argsort(-np.array(feature_importances)).argsort()
        pal = sns.color_palette(palette, len(feature_importances))
        palette = np.array(pal[::-1])[rank_list]

        plt.clf()
        plt.title("Feature Importances")
        ax = sns.barplot(x=feature_importances, y=feature_order,
                         palette=palette, order=feature_order)

        self.save_plot("Feature Importances", "_Extras")

        pickle_object_to_file(dict(zip(feature_order, feature_importances)),
                              self.folder_path + "/_Extras",
                              "Feature Importances")

        if self.__notebook_mode and display_visuals:
            plt.show()

        plt.close("all")

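    # Illustrative sketch (not part of the library): the constructor calls this
    # automatically when the model exposes 'feature_importances_', but it can
    # also be invoked directly. `analysis`, `model`, and `df` are hypothetical.
    #
    #   analysis.graph_model_importances(list(df.columns.drop("Survived")),
    #                                    model.feature_importances_,
    #                                    display_visuals=True)
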
    def __get_model_prediction(self, pred_name, X, thresholds=None):
        """
        Finds the model's predicted labels.

        Args:
            X: np.matrix or np.array
                Feature matrix.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

        Returns:
            Returns the predicted values for a given matrix. Handles prediction
            function 'types' Predictions and Probabilities. Helps streamline
            the entire process of evaluating classes.
        """
        # Check if function name exists in the dictionary
        if pred_name not in self.__pred_funcs_types:
            raise KeyError("The function name has not been recorded!")

        # Must be a prediction function
        if self.__pred_funcs_types[pred_name] == "Predictions":
            return self.__pred_funcs_dict[pred_name](X)

        # Output must be continuous; Probabilities
        elif self.__pred_funcs_types[pred_name] == "Probabilities":

            # Validate thresholds
            if thresholds:
                if isinstance(thresholds, list) or isinstance(thresholds, np.ndarray):
                    if len(thresholds) != len(self.__target_values):
                        raise UnsatisfiedRequirments("Length of thresholds must match the same length as the associated classes.")
                else:
                    raise UnsatisfiedRequirments("Threshold object is not a list or numpy array!")

            # Get model probability output
            model_output = self.__get_model_probas(pred_name, X)

            # No thresholds found
            if not thresholds:
                return np.asarray([self.__target_values[np.argmax(proba)]
                                   for proba in model_output])
            # Apply thresholds to model's probability output
            else:
                model_output = model_output - thresholds
                return np.asarray([self.__target_values[np.argmax(proba)]
                                   for proba in model_output])
        else:
            raise ValueError(f"Unknown type '{self.__pred_funcs_types[pred_name]}' was found!")

    def __get_model_probas(self, pred_name, X):
        """
        Attempts to get the probabilities from the prediction function.

        Args:
            X:
                Feature matrix.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

        Raises:
            If probabilities aren't possible with the given function then an
            error is raised.

        Returns:
            Returns a series of values between 0 and 1 that represent the
            model's confidence.
        """
        if self.__pred_funcs_types[pred_name] == "Probabilities":
            model_output = self.__pred_funcs_dict[pred_name](X)

            # ---
            if isinstance(model_output, list):
                model_output = np.asarray(model_output)

            return model_output
        else:
            raise ProbasNotPossible

    def __create_sub_dir_with_thresholds(self, pred_name, dataset_name, thresholds):
        """
        Iterates through the directory structure looking at each text file
        containing a string representation of the given thresholds; keeps
        comparing the passed value of 'thresholds' to the text file.

        Args:
            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

        Returns:
            Looks at the root of the starting directory and at each
            '_Thresholds.txt' file to determine whether the files can be
            output to that directory. The content of the file must match the
            content of the list/numpy array 'thresholds'.
        """
        sub_dir = f'{dataset_name}/{pred_name}'

        # Only generate extra folder structure if the function type is Probabilities
        if self.__pred_funcs_types[pred_name] == "Probabilities":

            # ------
            if not thresholds:
                sub_dir = f'{sub_dir}/No Thresholds'
            else:
                i = 0
                sub_dir = f'{sub_dir}/Thresholds'
                tmp_sub_dir = copy.deepcopy(sub_dir)
                while True:
                    threshold_dir = self.folder_path

                    if i > 0:
                        tmp_sub_dir = (sub_dir + f' {i}')

                    threshold_dir += tmp_sub_dir

                    # If a file exists with the same thresholds, then use this directory
                    if os.path.exists(threshold_dir):
                        if self.__compare_thresholds_to_saved_thresholds(threshold_dir,
                                                                         thresholds):
                            sub_dir = tmp_sub_dir
                            break
                    # Create new directory
                    else:
                        os.makedirs(threshold_dir)
                        write_object_text_to_file(thresholds,
                                                  threshold_dir,
                                                  "_Thresholds")
                        sub_dir = tmp_sub_dir
                        break

                    # Iterate for directory name change
                    i += 1

        return sub_dir

    def __compare_thresholds_to_saved_thresholds(self, directory_path, thresholds):
        """
        Compares the thresholds object to a thresholds text file found in the
        directory; returns True if the file exists and the object's values
        match.

        Args:
            directory_path:
                Path to the given folder where the "_Thresholds.txt" file
                resides.

            thresholds:
                If the model outputs a probability list/numpy array then we
                apply thresholds to the output of the model. For classification
                only; will not affect the direct output of the probabilities.

        Returns:
            Returns True if the file exists and the object's values match up.
        """
        file_directory = correct_directory_path(directory_path)

        if os.path.exists(file_directory):
            # Extract file contents and convert to a list object
            file = open(file_directory + "_Thresholds.txt", "r")
            line = file.read()
            converted_list = line.split("=")[-1].strip().strip('][').split(', ')
            converted_list = [float(val) for val in converted_list]
            file.close()

            if thresholds == converted_list:
                return True
            else:
                return False
        else:
            return False