# Source code for eflow.data_analysis.null_analysis

from eflow._hidden.constants import GRAPH_DEFAULTS
from eflow._hidden.custom_exceptions import SnapshotMismatchError
from eflow._hidden.general_objects import DataFrameSnapshot
from eflow._hidden.parent_objects import DataAnalysis
from eflow.data_analysis.feature_analysis import FeatureAnalysis
from eflow.utils.pandas_utils import missing_values_table, generate_meta_data
import copy
from IPython.display import display

import pandas as pd
import missingno as msno
from matplotlib import pyplot as plt
import warnings


# Module-level metadata: authorship, copyright, license and maintainer contact.
__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"


class NullAnalysis(DataAnalysis):
    """
        Analyzes a pandas dataframe for null data; creates visuals like
        graphs and tables.
    """

    def __init__(self,
                 df_features,
                 dataset_sub_dir="",
                 dataset_name="Default Dataset Name",
                 overwrite_full_path=None,
                 notebook_mode=False):
        """
        Args:
            df_features:
                Feature description object for the dataset. A deep copy is
                stored on this instance and the original object is handed to
                the internal FeatureAnalysis helper.

            dataset_sub_dir: string
                Appends to the absolute directory of the output folder.

            dataset_name: string
                Creates a parent or "project" folder in which all
                sub-directories will be inner nested.

            overwrite_full_path: string
                Overwrites the path to the parent folder.

            notebook_mode: bool
                If in a python notebook display visualizations in the
                notebook.
        """
        DataAnalysis.__init__(self,
                              f'{dataset_sub_dir}/{dataset_name}',
                              overwrite_full_path)

        self.__df_features = copy.deepcopy(df_features)
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Only True while 'perform_analysis' is driving the other public
        # methods; they use it to skip re-checking the dataframe.
        self.__called_from_perform = False

        # The original passed an undefined name ('project_sub_dir') as the
        # value here, which raised NameError on every construction; the
        # sub-directory this constructor received is forwarded instead.
        # NOTE(review): the keyword name 'project_sub_dir' is kept from the
        # original call -- confirm it matches FeatureAnalysis.__init__.
        self.__feature_analysis = FeatureAnalysis(
            df_features,
            dataset_name=dataset_name,
            project_sub_dir=dataset_sub_dir,
            notebook_mode=notebook_mode)
[docs] def perform_analysis(self, df, dataset_name, display_visuals=True, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, display_print=True, null_features_only=False): """ Perform all public methods of the NullAnalysis object. Except for feature_analysis_of_null_data. Args: df: pd.Dataframe Pandas Dataframe object. dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. save_file: bool Boolean value to whether or not to save the file. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. 
null_features_only: bool Dataframe will pass on null features for the visualizations """ try: self.__called_from_perform = False if df is not None: # All functionality is meaningless without getting past the # following check; exit function if not self.__check_dataframe(df): print("Exiting perform data_analysis function call") return None if dataframe_snapshot: df_snapshot = DataFrameSnapshot() df_snapshot.check_create_snapshot(df, self.__df_features, directory_path=self.folder_path, sub_dir=f"{dataset_name}/_Extras") generate_meta_data(df, self.folder_path, f"{dataset_name}" + "/_Extras") # Set to true to represent the function call was made with perform self.__called_from_perform = True if display_visuals: print("\n\n") # -------------------------------------- self.missing_values_table(df, dataset_name, display_visuals=display_visuals, save_file=save_file, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, display_print=display_print) if display_visuals: print("\n\n") # -------------------------------------- self.plot_null_bar_graph(df, dataset_name, null_features_only=null_features_only, display_visuals=display_visuals, save_file=save_file, suppress_runtime_errors=suppress_runtime_errors, display_print=display_print) if display_visuals: print("\n\n") # -------------------------------------- self.plot_null_matrix_graph(df, dataset_name, null_features_only=null_features_only, display_visuals=display_visuals, save_file=save_file, suppress_runtime_errors=suppress_runtime_errors, display_print=display_print) if display_visuals: print("\n\n") # -------------------------------------- self.plot_null_heatmap_graph(df, dataset_name, display_visuals=display_visuals, save_file=save_file, suppress_runtime_errors=suppress_runtime_errors, display_print=display_print) if display_visuals: print("\n\n") # -------------------------------------- self.plot_null_dendrogram_graph(df, dataset_name, null_features_only=null_features_only, 
display_visuals=display_visuals, save_file=save_file, suppress_runtime_errors=suppress_runtime_errors, display_print=display_print) finally: self.__called_from_perform = False
[docs] def feature_analysis_of_null_data(self, df, dataset_name, target_features=None, display_visuals=True, display_print=True, save_file=True, suppress_runtime_errors=True, aggregate_target_feature=True, selected_features=None, extra_tables=True, statistical_analysis_on_aggregates=True, nan_features=[]): """ Performs all public methods that generate visualizations/insights that feature analysis uses on an aggregation of null data in a feature. Note: Pretty much my personal lazy button for running the entire object without specifying any method in particular. Args: df: pd.Dataframe Pandas dataframe object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. target_features: collection of string or None A feature name that both exists in the init df_features and the passed dataframe. Note If init to 'None' then df_features will try to extract out the target feature. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. save_file: bool Boolean value to whether or not to save the file. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. extra_tables: bool When handling two types of features if set to true this will generate any extra tables that might be helpful. Note - These graphics may create duplicates if you already applied an aggregation in 'perform_analysis' statistical_analysis_on_aggregates: bool If set to true then the function 'statistical_analysis_on_aggregates' will run; which aggregates the data of the target feature either by discrete values or by binning/labeling continuous data. aggregate_target_feature: bool Aggregate the data of the target feature if the data is non-continuous data. Note In the future I will have this also working with continuous data. 
selected_features: collection object of features Will only focus on these selected feature's and will ignore the other given features. nan_features: collection of strings Features names that must contain nan data to aggregate on. Raises: If an empty dataframe is passed to this function or if the same dataframe is passed to it raise error. """ target_features = set(target_features) for nan_feature_name in nan_features: new_target_features = copy.deepcopy(target_features) if nan_feature_name in new_target_features: new_target_features.discard(nan_feature_name) # No null data ignore feature if df[df[nan_feature_name].isna()].shape[0] == 0: print(f"No nan data found for {nan_feature_name}") continue if display_print: print(f"Feature Analysis on data where {nan_feature_name} = NaN") self.__feature_analysis.perform_analysis( df[df[nan_feature_name].isna()].drop(columns=[nan_feature_name]), dataset_name=dataset_name + "/Feature Analysis of Null Data/" + nan_feature_name + " = NaN", target_features=new_target_features, display_visuals=display_visuals, display_print=display_print, save_file=save_file, dataframe_snapshot=False, suppress_runtime_errors=suppress_runtime_errors, aggregate_target_feature=aggregate_target_feature, statistical_analysis_on_aggregates=statistical_analysis_on_aggregates, selected_features=selected_features, extra_tables=extra_tables)
[docs] def plot_null_matrix_graph(self, df, dataset_name, display_visuals=True, display_print=True, filename=None, sub_dir=None, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, null_features_only=False, filter=None, n=0, p=0, sort=None, figsize=GRAPH_DEFAULTS.NULL_FIGSIZE, width_ratios=(15, 1), color=(.027, .184, .373), fontsize=16, labels=None, sparkline=True, inline=False, freq=None): """ Desc (Taken from missingno): A matrix visualization of the nullity of the given DataFrame then pushes the image to output folder. Args: df: pd.Dataframe Pandas dataframe object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. save_file: bool Boolean value to whether or not to save the file. filename: string If set to 'None' will default to a pre-defined string; unless it is set to an actual filename. sub_dir: string Specify the sub directory to append to the pre-defined folder path. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. null_features_only: bool Dataframe will pass on null features for the visualizations Please read the offical documentation at for more about the parameters: Link: https://github.com/ResidentMario/missingno Note: Changed the default color of the bar graph because I thought it was ugly. 
""" # All credit to the following author for making the 'missingno' package # https://github.com/ResidentMario/missingno try: if not self.__called_from_perform: if not self.__check_dataframe(df): if display_print: print("Couldn't create missing values table because" " there is no missing data to display!") return None null_sorted_features, null_features = self.__sort_features_by_nulls(df) if null_features_only: selected_features = null_features else: selected_features = null_sorted_features if display_print: print("Generating graph for null matrix graph...") plt.close("all") msno.matrix(df[selected_features], filter=filter, n=n, p=p, sort=sort, figsize=figsize, width_ratios=width_ratios, color=color, fontsize=fontsize, labels=labels, sparkline=sparkline, inline=inline, freq=freq) if not filename: filename = "Missing data matrix graph" if save_file: if not sub_dir: sub_dir = f"{dataset_name}/Graphics" if self.__called_from_perform: dataframe_snapshot = False self.save_plot(df=df, df_features=self.__df_features, filename=filename, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, meta_data=not self.__called_from_perform) if self.__notebook_mode and display_visuals: plt.show() plt.close("all") except SnapshotMismatchError as e: raise e except Exception as e: plt.close('all') if suppress_runtime_errors: warnings.warn( f"Plot null matrix raised an error:\n{str(e)}", RuntimeWarning) else: raise e
[docs] def plot_null_bar_graph(self, df, dataset_name, display_visuals=True, filename=None, sub_dir=None, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, display_print=True, null_features_only=False, figsize=GRAPH_DEFAULTS.NULL_FIGSIZE, fontsize=16, labels=None, log=False, color=GRAPH_DEFAULTS.NULL_COLOR, inline=False, filter=False, n=0, p=0, sort=None): """ Desc (Taken from missingno): A bar graph visualization of the nullity of the given DataFrame then pushes the image to output folder. Args: df: pd.Dataframe Pandas dataframe object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. filename: string If set to 'None' will default to a pre-defined string; unless it is set to an actual filename. save_file: bool Boolean value to whether or not to save the file. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. null_features_only: bool Dataframe will pass on null features for the visualizations Please read the offical documentation for more about the parameters: Link - https://github.com/ResidentMario/missingno Note - Changed the default color of the bar graph because I thought it was ugly. 
""" # Credit to the following author for making the 'missingno' package # https://github.com/ResidentMario/missingno try: if not self.__called_from_perform: if not self.__check_dataframe(df): if display_print: print("Couldn't create missing values table because" " there is no missing data to display!") return None null_sorted_features, null_features = self.__sort_features_by_nulls(df) if null_features_only: selected_features = null_features else: selected_features = null_sorted_features if display_print: print("Generating graph for null bar graph...") plt.close("all") ax = msno.bar(df[selected_features], figsize=figsize, log=log, fontsize=fontsize, labels=labels, color=color, inline=inline, filter=filter, n=n, p=p, sort=sort) # Annotation props = dict(boxstyle='round', facecolor="#FFFFFF", alpha=0) ax.text(0.05, 1.13, f"Clean data is {df.shape[0]} entries", transform=ax.transAxes, fontsize=10, size=17, verticalalignment='top', bbox=props) # Sets filename with a default name if not filename: filename = "Missing data bar graph" if save_file: if not sub_dir: sub_dir = f"{dataset_name}/Graphics" if self.__called_from_perform: dataframe_snapshot = False self.save_plot(df=df, df_features=self.__df_features, filename=filename, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, meta_data=not self.__called_from_perform) if self.__notebook_mode and display_visuals: plt.show() plt.close("all") except SnapshotMismatchError as e: raise e except Exception as e: plt.close('all') if suppress_runtime_errors: warnings.warn( f"Plot null bar graph raised an error:\n{str(e)}", RuntimeWarning) else: raise e
[docs] def plot_null_heatmap_graph(self, df, dataset_name, display_visuals=True, filename=None, sub_dir=None, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, display_print=True, inline=False, filter=None, n=0, p=0, sort=None, figsize=GRAPH_DEFAULTS.NULL_FIGSIZE, fontsize=16, labels=True, cmap='RdBu', vmin=-1, vmax=1, cbar=True): """ Desc (Taken from missingno): Presents a `seaborn` heatmap visualization of nullity correlation in the given DataFrame. Args: df: pd.Dataframe Pandas dataframe object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. filename: string If set to 'None' will default to a pre-defined string; unless it is set to an actual filename. save_file: bool Boolean value to whether or not to save the file. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. Please read the offical documentation for more about the parameters: Link: https://github.com/ResidentMario/missingno Note: Changed the default color of the bar graph because I thought it was ugly. 
""" # All credit to the following author for making the 'missingno' package # https://github.com/ResidentMario/missingno try: if not self.__called_from_perform: # Compares the json file snapshot to passed dataframe's snapshot if not self.__check_dataframe(df): if display_print: print("Couldn't create missing values table because" " there is no missing data to display!") return None if display_print: print("Generating graph for null heatmap...") # ----- plt.close("all") ax = msno.heatmap(df, inline=inline, filter=filter, n=n, p=p, sort=sort, figsize=figsize, fontsize=fontsize, labels=labels, cmap=cmap, vmin=vmin, vmax=vmax, cbar=cbar) # bottom, top = ax.get_ylim() # ax.set_ylim(bottom + 0.5, top - 0.5) # Sets filename with a default name if not filename: filename = "Missing data heatmap graph" if save_file: if not sub_dir: sub_dir = f"{dataset_name}/Graphics" if self.__called_from_perform: dataframe_snapshot = False self.save_plot(df=df, df_features=self.__df_features, filename=filename, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, meta_data=not self.__called_from_perform) if self.__notebook_mode and display_visuals: plt.show() plt.close("all") except SnapshotMismatchError as e: raise e except Exception as e: plt.close('all') if suppress_runtime_errors: warnings.warn( f"Plot null heatmap raised an error:\n{str(e)}", RuntimeWarning) else: raise e
[docs] def plot_null_dendrogram_graph(self, df, dataset_name, display_visuals=True, filename=None, sub_dir=None, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, display_print=True, null_features_only=False, method='average', filter=None, n=0, p=0, orientation=None, figsize=GRAPH_DEFAULTS.NULL_FIGSIZE, fontsize=16, inline=False): # All credit to the following author for making the 'missingno' package # https://github.com/ResidentMario/missingno """ Desc (Taken from missingno): Fits a `scipy` hierarchical clustering algorithm to the given DataFrame's variables and visualizes the results as a `scipy` dendrogram. Args: df: Pandas dataframe object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. filename: string If set to 'None' will default to a pre-defined string; unless it is set to an actual filename. save_file: bool Boolean value to whether or not to save the file. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. 
null_features_only: bool Dataframe will pass on only null features for the visualizations Please read the offical documentation for more about the parameters: Link: https://github.com/ResidentMario/missingno """ try: if not self.__called_from_perform: if not self.__check_dataframe(df): if display_print: print("Couldn't create missing values table because" " there is no missing data to display!") return None null_sorted_features, null_features = self.__sort_features_by_nulls(df) if null_features_only: selected_features = null_features else: selected_features = null_sorted_features if display_print: print("Generating graph for null dendrogram graph...") plt.close("all") msno.dendrogram(df[selected_features], method=method, filter=filter, n=n, p=p, orientation=orientation, figsize=figsize, fontsize=fontsize, inline=inline) # Sets filename with a default name if not filename: filename = f"Missing data dendrogram graph {method}" if save_file: if not sub_dir: sub_dir = f"{dataset_name}/Graphics" if self.__called_from_perform: dataframe_snapshot = False self.save_plot(df=df, df_features=self.__df_features, filename=filename, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, meta_data=not self.__called_from_perform) if self.__notebook_mode and display_visuals: plt.show() plt.close("all") except SnapshotMismatchError as e: raise e except Exception as e: plt.close('all') if suppress_runtime_errors: warnings.warn( f"Plot null dendrogram raised an error:\n{str(e)}", RuntimeWarning) else: raise e
[docs] def missing_values_table(self, df, dataset_name, display_visuals=True, filename=None, sub_dir=None, save_file=True, dataframe_snapshot=True, suppress_runtime_errors=True, display_print=True): """ Creates/Saves a Pandas DataFrame object giving the percentage of the null data for the original DataFrame columns. Args: df: pd.Dataframe Pandas DataFrame object dataset_name: string The dataset's name; this will create a sub-directory in which your generated graph will be inner-nested in. display_visuals: bool Boolean value to whether or not to display visualizations. display_print: bool Determines whether or not to print function's embedded print statements. filename: string If set to 'None' will default to a pre-defined string; unless it is set to an actual filename. save_file: bool Boolean value to whether or not to save the file. dataframe_snapshot: bool Boolean value to determine whether or not generate and compare a snapshot of the dataframe in the dataset's directory structure. Helps ensure that data generated in that directory is correctly associated to a dataframe. suppress_runtime_errors: bool If set to true; when generating any graphs will suppress any runtime errors so the program can keep running. 
""" try: if not self.__called_from_perform: if not self.__check_dataframe(df): if display_print: print("Couldn't create missing values table because" " there is no missing data to display!") return None if display_print: print("Creating missing values table...") if not self.__called_from_perform: self.__check_dataframe(df) mis_val_table_ren_columns = missing_values_table(df) if display_print: print(f"Your selected dataframe has {str(df.shape[1])} columns.\n" f"It has {str(mis_val_table_ren_columns.shape[0])} columns missing data.\n") if self.__notebook_mode: if display_visuals: display(mis_val_table_ren_columns) else: if display_visuals: print(mis_val_table_ren_columns) # Sets filename with a default name if not filename: filename = "Missing Data Table" # --- if save_file: if not sub_dir: sub_dir = f"{dataset_name}/Tables" if self.__called_from_perform: dataframe_snapshot = False self.save_table_as_plot(df=df, df_features=self.__df_features, filename=filename, show_index=True, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, table=mis_val_table_ren_columns) except SnapshotMismatchError as e: raise e except Exception as e: plt.close('all') if suppress_runtime_errors: warnings.warn( f"Missing data table raised an error:\n{str(e)}", RuntimeWarning) else: raise e
def __check_dataframe(self, df): """ Args: df: pd.Dataframe Pandas Dataframe object. Returns: Returns backs a bool to determine whether or not to null analysis method should work with it. Note: I only made this function in case I needed to do more error checks in the future. """ passed_check = True if not df.isnull().values.any() or df.shape[0] == 0: passed_check = False return passed_check def __sort_features_by_nulls(self, df): """ Sorts a dataframe by data containing the most nulls to least nulls. Args: df: pd.Dataframe Pandas Dataframe object. Returns: Returns back the sorted order of features and the features that contain null. """ # Perform sort of nulls features = df.isnull().sum().index.tolist() null_values = df.isnull().sum().values.tolist() null_values, null_sorted_features = zip(*sorted(zip(null_values, features))) # Get list and reverse sequence null_values = list(null_values) null_sorted_features = list(null_sorted_features) null_sorted_features.reverse() null_values.reverse() # ------------------------------------- # Iterate until through feature values until no nulls feature is found for feature_index, value in enumerate(null_values): if value == 0: break null_features = null_sorted_features[0:feature_index] return null_sorted_features, null_features