Source code for eflow.model_analysis.regression_analysis

from eflow.utils.sys_utils import *
from eflow.utils.pandas_utils import df_to_image
from eflow._hidden.parent_objects import ModelAnalysis
from eflow.data_analysis import FeatureAnalysis

# from sklearn.metrics import max_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

import numpy as np
import warnings
import copy
import pandas as pd
from IPython.display import display

__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"

class RegressionAnalysis(ModelAnalysis):
    """
    Analyzes a regression model's results based on the prediction
    function(s) passed to it. Creates graphs and tables to be saved in
    the directory structure.
    """

    def __init__(self,
                 dataset_name,
                 model,
                 model_name,
                 feature_order,
                 target_feature,
                 pred_funcs_dict,
                 df_features,
                 project_sub_dir="Regression Analysis",
                 overwrite_full_path=None,
                 save_model=True,
                 notebook_mode=False):
        """
        Args:
            dataset_name:
                The dataset's name; this will create a sub-directory in which
                the generated graphs will be nested.

            model:
                A fitted supervised machine learning model.

            model_name:
                The name of the model in string form.

            feature_order: collection object
                Feature names in the proper order to re-create the pandas
                dataframe.

            target_feature:
                The name of the feature the model predicts.

            pred_funcs_dict:
                A dict mapping a name to the function definition of a model
                prediction method. (Can handle either a return of
                probabilities or a single value.)

                Init Example:
                    pred_funcs = dict()
                    pred_funcs["Predictions"] = model.predict
                    pred_funcs["Probabilities"] = model.probas

            df_features:
                DataFrameTypes object; organizes feature types into groups.

            project_sub_dir:
                Creates a parent or "project" folder in which all
                sub-directories will be nested.

            overwrite_full_path:
                Overwrites the path to the parent folder.

            save_model:
                Determines whether or not to pickle the model to the
                directory structure.

            notebook_mode:
                Determines whether or not to use IPython display calls.
        """

        # Init parent object
        ModelAnalysis.__init__(self,
                               f'{dataset_name}/{project_sub_dir}/Target Feature: {target_feature}/{model_name}',
                               overwrite_full_path)

        # Init objects without pass by reference

        # Remove target feature from feature order when trying to re-create
        # the dataframe
        self.__target_feature = copy.deepcopy(target_feature)
        self.__feature_order = copy.deepcopy(feature_order)

        if self.__target_feature in self.__feature_order:
            self.__feature_order.remove(self.__target_feature)

        self.__model = copy.deepcopy(model)
        self.__model_name = copy.deepcopy(model_name)
        self.__pred_funcs_dict = copy.deepcopy(pred_funcs_dict)
        self.__df_features = copy.deepcopy(df_features)
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Determines if perform_analysis was called
        self.__called_from_perform = False

        # Attempt to save machine learning model
        try:
            if save_model:
                pickle_object_to_file(self.__model,
                                      self.folder_path,
                                      f'{self.__model_name}')
        except:
            pass

        # ---
        create_dir_structure(self.folder_path,
                             "_Extras")

        # Save features and/or df_features object
        df_features.create_json_file_representation(self.folder_path + "_Extras",
                                                    "df_features.json")

    def get_predictions_names(self):
        return self.__pred_funcs_dict.keys()
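    # Usage sketch (illustrative only; 'fitted_model', 'feature_names', and
    # 'df_features' are hypothetical objects supplied by the caller, not part
    # of this module):
    #
    #   pred_funcs = dict()
    #   pred_funcs["Predictions"] = fitted_model.predict
    #
    #   analysis = RegressionAnalysis(dataset_name="My Dataset",
    #                                 model=fitted_model,
    #                                 model_name="Linear Regression",
    #                                 feature_order=feature_names,
    #                                 target_feature="Price",
    #                                 pred_funcs_dict=pred_funcs,
    #                                 df_features=df_features)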
    def perform_analysis(self,
                         X,
                         y,
                         dataset_name,
                         regression_error_analysis=False,
                         regression_correct_analysis=False,
                         ignore_metrics=[],
                         custom_metrics_dict=dict(),
                         display_visuals=True,
                         mse_score=None):
        """
        Runs all available analysis functions on the model's predicted data.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                the generated graphs will be nested.

            regression_error_analysis: bool
                Perform feature analysis on data that was incorrectly
                predicted.

            regression_correct_analysis: bool
                Perform feature analysis on data that was correctly
                predicted.

            ignore_metrics:
                Specify the default metrics to not apply to the regression
                data_analysis.
                    * Explained Variance Score
                    * Mean Absolute Error
                    * Mean Squared Error
                    * Mean Squared Log Error
                    * Median Absolute Error
                    * R2 Score

            custom_metrics_dict:
                Pass the name of metric(s) with the function definition(s) in
                a dictionary.

            display_visuals:
                Controls the visual display of the error data_analysis if it
                is able to run.

            mse_score: float
                Per-row mean squared error threshold used to split correctly
                and incorrectly predicted rows for the error/correct
                analysis.

        Returns:
            Performs all regression functionality with the provided feature
            data and target data.
                * regression_metrics
                * regression_error_analysis
                * regression_correct_analysis
        """
        try:

            self.__called_from_perform = True

            self.generate_matrix_meta_data(X,
                                           dataset_name + "/_Extras")

            print("\n\n" + "---" * 10 + f'{dataset_name}' + "---" * 10)

            for pred_name in self.__pred_funcs_dict.keys():

                self.regression_metrics(X,
                                        y,
                                        pred_name,
                                        dataset_name,
                                        display_visuals=display_visuals,
                                        ignore_metrics=ignore_metrics,
                                        custom_metrics_dict=custom_metrics_dict)

                if regression_error_analysis:
                    self.regression_error_analysis(X,
                                                   y,
                                                   pred_name,
                                                   dataset_name,
                                                   mse_score=mse_score,
                                                   display_print=False,
                                                   display_visuals=display_visuals)

                if regression_correct_analysis:
                    self.regression_correct_analysis(X,
                                                     y,
                                                     pred_name,
                                                     dataset_name,
                                                     mse_score=mse_score,
                                                     display_print=False,
                                                     display_visuals=display_visuals)

        finally:
            self.__called_from_perform = False
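    # Usage sketch (illustrative only; 'analysis', 'X_test', and 'y_test' are
    # hypothetical objects from the caller's code):
    #
    #   analysis.perform_analysis(X_test,
    #                             y_test,
    #                             dataset_name="Test Data",
    #                             regression_error_analysis=True,
    #                             mse_score=10.0)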
    def regression_metrics(self,
                           X,
                           y,
                           pred_name,
                           dataset_name,
                           display_visuals=True,
                           save_file=True,
                           title="",
                           custom_metrics_dict=dict(),
                           ignore_metrics=[],
                           multioutput=[None, "uniform_average", "variance_weighted"]):
        """
        Creates a dataframe based on the prediction metrics of the feature
        matrix and target vector.

        Args:
            X:
                Feature matrix.

            y:
                Target data vector.

            pred_name:
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name:
                The dataset's name; this will create a sub-directory in which
                the generated graphs will be nested.

            display_visuals:
                Display tables.

            save_file:
                Determines whether or not to save the generated document.

            title:
                Adds to the column 'Metric Score'.

            custom_metrics_dict:
                Pass the name of metric(s) and the function definition(s) in
                a dictionary.

            ignore_metrics:
                Specify the default metrics to not apply to the regression
                data_analysis.
                    * Explained Variance Score
                    * Mean Absolute Error
                    * Mean Squared Error
                    * Mean Squared Log Error
                    * Median Absolute Error
                    * R2 Score

            multioutput:
                Determines how scores for multi-output targets are
                aggregated when the metric supports the 'multioutput'
                keyword.
                    * None
                    * uniform_average
                    * variance_weighted

        Returns:
            Displays and/or saves a dataframe of the metric scores.
        """

        filename = f'Metric Evaluation on {dataset_name} on {self.__model_name}'
        sub_dir = f'{dataset_name}/{pred_name}'

        if not isinstance(multioutput, list):
            multioutput = [multioutput]

        # Default metric names and their functions
        metric_functions = dict()
        metric_functions["Explained Variance Score"] = explained_variance_score
        # metric_functions["Max Error"] = max_error
        metric_functions["Mean Absolute Error"] = mean_absolute_error
        metric_functions["Mean Squared Error"] = mean_squared_error
        metric_functions["Mean Squared Log Error"] = mean_squared_log_error
        metric_functions["Median Absolute Error"] = median_absolute_error
        metric_functions["R2 Score"] = r2_score

        warnings.filterwarnings('ignore')

        # Ignore default metrics if needed
        for remove_metric in ignore_metrics:
            if remove_metric in metric_functions:
                del metric_functions[remove_metric]

        # Add in custom metrics
        if len(custom_metrics_dict.keys()):
            metric_functions.update(custom_metrics_dict)

        # Evaluate model on metrics
        evaluation_report = dict()
        for metric_name in metric_functions:
            for multi in multioutput:

                model_predictions = self.__get_model_prediction(pred_name,
                                                                X)
                try:
                    if multi:
                        evaluation_report[f'{metric_name}({multi})'] = \
                            metric_functions[metric_name](y_true=y,
                                                          y_pred=model_predictions,
                                                          multioutput=multi)
                    else:
                        if metric_name not in evaluation_report.keys():
                            evaluation_report[f'{metric_name}'] = \
                                metric_functions[metric_name](y_true=y,
                                                              y_pred=model_predictions,
                                                              multioutput=multi)
                except TypeError:
                    if metric_name not in evaluation_report.keys():
                        evaluation_report[metric_name] = metric_functions[
                            metric_name](y, model_predictions)

        warnings.filterwarnings('default')

        if title and len(title) > 0:
            index_name = f"Metric Scores ({title})"
        else:
            index_name = "Metric Scores"

        # ---
        evaluation_report = pd.DataFrame({index_name:
                                              [f'{metric_score:.4f}'
                                               for metric_score
                                               in evaluation_report.values()]},
                                         index=list(evaluation_report.keys()))

        if display_visuals:
            if self.__notebook_mode:
                display(evaluation_report)
            else:
                print(evaluation_report)

        if save_file:
            df_to_image(evaluation_report,
                        self.folder_path,
                        sub_dir,
                        convert_to_filename(filename),
                        col_width=20,
                        show_index=True,
                        format_float_pos=4)

            if not self.__called_from_perform:
                self.generate_matrix_meta_data(X,
                                               dataset_name + "/_Extras")
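    # Illustrative sketch of 'custom_metrics_dict' (the RMSE helper below is
    # hypothetical, not part of eflow). Any callable taking (y_true, y_pred)
    # works: metrics that reject the 'multioutput' keyword fall back to the
    # positional call in the TypeError handler above.
    #
    #   def root_mean_squared_error(y_true, y_pred):
    #       return np.sqrt(mean_squared_error(y_true, y_pred))
    #
    #   analysis.regression_metrics(X_test,
    #                               y_test,
    #                               pred_name="Predictions",
    #                               dataset_name="Test Data",
    #                               custom_metrics_dict={"RMSE": root_mean_squared_error})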
    def regression_correct_analysis(self,
                                    X,
                                    y,
                                    pred_name,
                                    dataset_name,
                                    mse_score,
                                    display_visuals=True,
                                    save_file=True,
                                    display_print=True,
                                    suppress_runtime_errors=True,
                                    aggregate_target_feature=True,
                                    selected_features=None,
                                    extra_tables=True,
                                    statistical_analysis_on_aggregates=True):
        """
        Compares the actual target value to the predicted value and performs
        feature analysis on the rows the model predicted well (per-row mean
        squared error below 'mse_score').

        Args:
            X: np.matrix or lists of lists
                Feature matrix.

            y: collection object
                Target data vector.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name: str
                The dataset's name; this will create a sub-directory in which
                the generated graphs will be nested.

            mse_score: float
                Per-row mean squared error threshold; rows whose error falls
                below this value are treated as correctly predicted.

            display_visuals: bool
                Boolean value to whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            save_file: bool
                Boolean value to whether or not to save the file.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            aggregate_target_feature: bool
                Aggregate the data of the target feature if the data is
                non-continuous data.

                Note
                    In the future I will have this also working with
                    continuous data.

            selected_features: collection object of features
                Will only focus on these selected features and will ignore
                the other given features.

            extra_tables: bool
                When handling two types of features, if set to true this will
                generate any extra tables that might be helpful.

                Note -
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

            statistical_analysis_on_aggregates: bool
                If set to true then the function
                'statistical_analysis_on_aggregates' will run; which
                aggregates the data of the target feature either by discrete
                values or by binning/labeling continuous data.
        """

        model_predictions = self.__get_model_prediction(pred_name,
                                                        X)
        sub_dir = f'{dataset_name}/{pred_name}'

        if sum(model_predictions != y) == len(y):
            print("Your model predicted nothing correctly for this dataset! "
                  "No correct analysis can be performed!")
            print("Also sorry for your model...zero correct? Damn...")
        else:
            print("\n\n" + "*" * 10 +
                  "Generating graphs for when the model predicted correctly" +
                  "*" * 10 + "\n")

            # Per-row mean squared error for each prediction/target pair
            all_mse_scores = []
            for i, pred in enumerate(model_predictions):
                all_mse_scores.append(mean_squared_error([pred], [y[i]]))

            # Generate correctly-predicted dataframe
            bool_list = np.array(all_mse_scores) < mse_score
            correct_df = pd.DataFrame.from_records(X[bool_list])
            correct_df.columns = self.__feature_order
            correct_df[self.__target_feature] = y[bool_list]

            # Directory path
            create_dir_structure(self.folder_path,
                                 sub_dir + f"/MSE score less than {mse_score}")
            output_path = f"{self.folder_path}/{sub_dir}/MSE score less than {mse_score}"

            # Create feature analysis
            feature_analysis = FeatureAnalysis(self.__df_features,
                                               overwrite_full_path=output_path)
            feature_analysis.perform_analysis(correct_df,
                                              dataset_name=dataset_name,
                                              target_features=[self.__target_feature],
                                              save_file=save_file,
                                              selected_features=selected_features,
                                              suppress_runtime_errors=suppress_runtime_errors,
                                              display_print=display_print,
                                              display_visuals=display_visuals,
                                              dataframe_snapshot=False,
                                              aggregate_target_feature=aggregate_target_feature,
                                              extra_tables=extra_tables,
                                              statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)
    def regression_error_analysis(self,
                                  X,
                                  y,
                                  pred_name,
                                  dataset_name,
                                  mse_score,
                                  display_visuals=True,
                                  save_file=True,
                                  display_print=True,
                                  suppress_runtime_errors=True,
                                  aggregate_target_feature=True,
                                  selected_features=None,
                                  extra_tables=True,
                                  statistical_analysis_on_aggregates=True):
        """
        Compares the actual target value to the predicted value and performs
        feature analysis on the rows the model predicted poorly (per-row mean
        squared error above 'mse_score').

        Args:
            X: np.matrix or lists of lists
                Feature matrix.

            y: collection object
                Target data vector.

            pred_name: str
                The name of the prediction function in question stored in
                'self.__pred_funcs_dict'.

            dataset_name: str
                The dataset's name; this will create a sub-directory in which
                the generated graphs will be nested.

            mse_score: float
                Per-row mean squared error threshold; rows whose error rises
                above this value are treated as incorrectly predicted.

            display_visuals: bool
                Boolean value to whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            save_file: bool
                Boolean value to whether or not to save the file.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            aggregate_target_feature: bool
                Aggregate the data of the target feature if the data is
                non-continuous data.

                Note
                    In the future I will have this also working with
                    continuous data.

            selected_features: collection object of features
                Will only focus on these selected features and will ignore
                the other given features.

            extra_tables: bool
                When handling two types of features, if set to true this will
                generate any extra tables that might be helpful.

                Note -
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

            statistical_analysis_on_aggregates: bool
                If set to true then the function
                'statistical_analysis_on_aggregates' will run; which
                aggregates the data of the target feature either by discrete
                values or by binning/labeling continuous data.
        """

        # sub_dir = self.__create_sub_dir_with_thresholds(pred_name,
        #                                                 dataset_name,
        #                                                 thresholds)

        model_predictions = self.__get_model_prediction(pred_name,
                                                        X)
        sub_dir = f'{dataset_name}/{pred_name}'

        if sum(model_predictions == y) == len(y):
            print("Your model predicted everything correctly for this "
                  "dataset! No error analysis needed!")
        else:
            print("\n\n" + "*" * 10 +
                  "Generating graphs for when the model predicted incorrectly" +
                  "*" * 10 + "\n")

            # Per-row mean squared error for each prediction/target pair
            all_mse_scores = []
            for i, pred in enumerate(model_predictions):
                all_mse_scores.append(mean_squared_error([pred], [y[i]]))

            # Generate error dataframe
            bool_list = np.array(all_mse_scores) > mse_score
            error_df = pd.DataFrame.from_records(X[bool_list])
            error_df.columns = self.__feature_order
            error_df[self.__target_feature] = y[bool_list]

            # Directory path
            create_dir_structure(self.folder_path,
                                 sub_dir + f"/MSE score greater than {mse_score}")
            output_path = f"{self.folder_path}/{sub_dir}/MSE score greater than {mse_score}"

            # Create feature analysis
            feature_analysis = FeatureAnalysis(self.__df_features,
                                               overwrite_full_path=output_path)
            feature_analysis.perform_analysis(error_df,
                                              dataset_name=dataset_name,
                                              target_features=[self.__target_feature],
                                              save_file=save_file,
                                              selected_features=selected_features,
                                              suppress_runtime_errors=suppress_runtime_errors,
                                              display_print=display_print,
                                              display_visuals=display_visuals,
                                              dataframe_snapshot=False,
                                              aggregate_target_feature=aggregate_target_feature,
                                              extra_tables=extra_tables,
                                              statistical_analysis_on_aggregates=statistical_analysis_on_aggregates)
    def __get_model_prediction(self,
                               pred_name,
                               X):
        """
        Looks up the prediction function by name and applies it to the
        feature matrix.
        """
        if pred_name in self.__pred_funcs_dict.keys():
            return self.__pred_funcs_dict[pred_name](X)
        else:
            raise KeyError(f"No prediction function named '{pred_name}' was found.")