from eflow._hidden.constants import GRAPH_DEFAULTS
from eflow._hidden.custom_exceptions import SnapshotMismatchError
from eflow._hidden.general_objects import DataFrameSnapshot
from eflow._hidden.parent_objects import DataAnalysis
from eflow.data_analysis.feature_analysis import FeatureAnalysis
from eflow.utils.pandas_utils import missing_values_table, generate_meta_data
import copy
from IPython.display import display
import pandas as pd
import missingno as msno
from matplotlib import pyplot as plt
import warnings
__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"
[docs]class NullAnalysis(DataAnalysis):
"""
Analyzes a pandas DataFrame object for null data and creates visuals
such as graphs and tables.
"""
def __init__(self,
             df_features,
             dataset_sub_dir="",
             dataset_name="Default Dataset Name",
             overwrite_full_path=None,
             notebook_mode=False):
    """
    Initialise the output directory structure and the nested feature
    analysis object used for analysing rows that contain null data.

    Args:
        df_features:
            Feature description object for the dataset. A deep copy is
            stored on the instance and the original is handed to
            ``FeatureAnalysis``.

        dataset_sub_dir: string
            Appends to the absolute directory of the output folder.

        dataset_name: string
            Creates a parent or "project" folder in which all
            sub-directories will be inner nested.

        overwrite_full_path: string
            Overwrites the path to the parent folder.

        notebook_mode: bool
            If in a python notebook display visualizations in the notebook.
    """
    DataAnalysis.__init__(self,
                          f'{dataset_sub_dir}/{dataset_name}',
                          overwrite_full_path)

    self.__df_features = copy.deepcopy(df_features)
    self.__notebook_mode = copy.deepcopy(notebook_mode)

    # Set when a "perform" style driver has already validated the
    # dataframe, so the individual methods can skip re-checking it.
    self.__called_from_perform = False

    # The sub-directory value must come from this constructor's own
    # 'dataset_sub_dir' argument; the name 'project_sub_dir' does not exist
    # in this scope and raised a NameError on every construction.
    # NOTE(review): the keyword names accepted by FeatureAnalysis are not
    # visible from this file -- confirm 'project_sub_dir' is still correct.
    self.__feature_analysis = FeatureAnalysis(df_features,
                                              dataset_name=dataset_name,
                                              project_sub_dir=dataset_sub_dir,
                                              notebook_mode=notebook_mode)
def feature_analysis_of_null_data(self,
                                  df,
                                  dataset_name,
                                  target_features=None,
                                  display_visuals=True,
                                  display_print=True,
                                  save_file=True,
                                  suppress_runtime_errors=True,
                                  aggregate_target_feature=True,
                                  selected_features=None,
                                  extra_tables=True,
                                  statistical_analysis_on_aggregates=True,
                                  nan_features=None):
    """
    Runs the nested feature analysis once per feature in 'nan_features',
    each time on only the rows of 'df' where that feature is null (with
    the feature's own column dropped).

    Args:
        df: pd.Dataframe
            Pandas dataframe object

        dataset_name: string
            The dataset's name; used as the leading part of the
            sub-directory each analysis is written under
            ("<dataset_name>/Feature Analysis of Null Data/<feature> = NaN").

        target_features: collection of string or None
            Target feature names. The feature currently being treated as
            the null feature is removed from the collection before it is
            forwarded. 'None' is forwarded unchanged.

            Note
                NOTE(review): 'None' is presumably resolved to the default
                target by FeatureAnalysis.perform_analysis -- confirm.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        save_file: bool
            Boolean value to whether or not to save the file.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any
            runtime errors so the program can keep running.

        aggregate_target_feature: bool
            Forwarded to 'perform_analysis'.

        selected_features: collection object of features
            Forwarded to 'perform_analysis'; restricts the analysis to
            these features.

        extra_tables: bool
            Forwarded to 'perform_analysis'.

        statistical_analysis_on_aggregates: bool
            Forwarded to 'perform_analysis'.

        nan_features: collection of strings or None
            Feature names that must contain nan data to aggregate on.
            'None' or an empty collection means there is nothing to do.

    Raises:
        KeyError if a name in 'nan_features' is not a column of 'df'.
    """
    # 'None' is the documented default, so it must not be fed to set().
    base_targets = None if target_features is None else set(target_features)

    for nan_feature_name in (nan_features or ()):

        # Filter once and reuse for both the emptiness test and the analysis.
        null_rows = df[df[nan_feature_name].isna()]

        # No null data ignore feature
        if null_rows.shape[0] == 0:
            if display_print:
                print(f"No nan data found for {nan_feature_name}")
            continue

        if display_print:
            print(f"Feature Analysis on data where {nan_feature_name} = NaN")

        # The null feature's column is dropped below, so it can no longer
        # serve as a target; build a fresh set so iterations stay independent.
        if base_targets is None:
            new_target_features = None
        else:
            new_target_features = base_targets - {nan_feature_name}

        self.__feature_analysis.perform_analysis(
            null_rows.drop(columns=[nan_feature_name]),
            dataset_name=dataset_name + "/Feature Analysis of Null Data/" + nan_feature_name + " = NaN",
            target_features=new_target_features,
            display_visuals=display_visuals,
            display_print=display_print,
            save_file=save_file,
            dataframe_snapshot=False,
            suppress_runtime_errors=suppress_runtime_errors,
            aggregate_target_feature=aggregate_target_feature,
            statistical_analysis_on_aggregates=statistical_analysis_on_aggregates,
            selected_features=selected_features,
            extra_tables=extra_tables)
def plot_null_matrix_graph(self,
                           df,
                           dataset_name,
                           display_visuals=True,
                           display_print=True,
                           filename=None,
                           sub_dir=None,
                           save_file=True,
                           dataframe_snapshot=True,
                           suppress_runtime_errors=True,
                           null_features_only=False,
                           filter=None,
                           n=0,
                           p=0,
                           sort=None,
                           figsize=GRAPH_DEFAULTS.NULL_FIGSIZE,
                           width_ratios=(15, 1),
                           color=(.027, .184, .373),
                           fontsize=16,
                           labels=None,
                           sparkline=True,
                           inline=False,
                           freq=None):
    """
    Desc (Taken from missingno):
        A matrix visualization of the nullity of the given DataFrame then
        pushes the image to output folder.

    Args:
        df: pd.Dataframe
            Pandas dataframe object

        dataset_name: string
            The dataset's name; this will create a sub-directory in which
            your generated graph will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations
            (only shown when the object is in notebook mode).

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to "Missing data matrix graph".

        sub_dir: string
            Sub directory to save under; defaults to
            "<dataset_name>/Graphics".

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare
            a snapshot of the dataframe in the dataset's directory
            structure. Forced to False when called from perform.

        suppress_runtime_errors: bool
            If set to true; any error raised while generating the graph is
            turned into a RuntimeWarning so the program can keep running.

        null_features_only: bool
            If true only features that contain nulls are plotted;
            otherwise all features, ordered from most to fewest nulls.

        filter, n, p, sort, figsize, width_ratios, color, fontsize,
        labels, sparkline, inline, freq:
            Forwarded unchanged to 'msno.matrix'. Please read the official
            documentation for more about these parameters:
            https://github.com/ResidentMario/missingno
            NOTE(review): newer missingno releases may no longer accept
            'inline' -- confirm against the pinned version.

    Returns:
        None

    Raises:
        SnapshotMismatchError is always re-raised; any other exception is
        re-raised only when 'suppress_runtime_errors' is False.

    Note:
        The default color of the graph differs from missingno's own
        default by the author's choice.
    """
    # All credit to the following author for making the 'missingno' package
    # https://github.com/ResidentMario/missingno
    try:
        # Only validate the dataframe when this method is called directly.
        if not self.__called_from_perform and not self.__check_dataframe(df):
            if display_print:
                print("Couldn't create null matrix graph because"
                      " there is no missing data to display!")
            return None

        null_sorted_features, null_features = self.__sort_features_by_nulls(df)
        selected_features = (null_features if null_features_only
                             else null_sorted_features)

        if display_print:
            print("Generating graph for null matrix graph...")

        # Start from a clean slate so stale figures are never saved.
        plt.close("all")
        msno.matrix(df[selected_features],
                    filter=filter,
                    n=n,
                    p=p,
                    sort=sort,
                    figsize=figsize,
                    width_ratios=width_ratios,
                    color=color,
                    fontsize=fontsize,
                    labels=labels,
                    sparkline=sparkline,
                    inline=inline,
                    freq=freq)

        # Sets filename with a default name
        if not filename:
            filename = "Missing data matrix graph"

        if save_file:
            if not sub_dir:
                sub_dir = f"{dataset_name}/Graphics"

            # The perform driver handles snapshots itself.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

        plt.close("all")

    except SnapshotMismatchError:
        # A snapshot mismatch must never be suppressed.
        raise

    except Exception as e:
        plt.close('all')
        if suppress_runtime_errors:
            warnings.warn(
                f"Plot null matrix raised an error:\n{str(e)}",
                RuntimeWarning)
        else:
            raise
def plot_null_bar_graph(self,
                        df,
                        dataset_name,
                        display_visuals=True,
                        filename=None,
                        sub_dir=None,
                        save_file=True,
                        dataframe_snapshot=True,
                        suppress_runtime_errors=True,
                        display_print=True,
                        null_features_only=False,
                        figsize=GRAPH_DEFAULTS.NULL_FIGSIZE,
                        fontsize=16,
                        labels=None,
                        log=False,
                        color=GRAPH_DEFAULTS.NULL_COLOR,
                        inline=False,
                        filter=False,
                        n=0,
                        p=0,
                        sort=None):
    """
    Desc (Taken from missingno):
        A bar graph visualization of the nullity of the given DataFrame
        then pushes the image to output folder.

    Args:
        df: pd.Dataframe
            Pandas dataframe object

        dataset_name: string
            The dataset's name; this will create a sub-directory in which
            your generated graph will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations
            (only shown when the object is in notebook mode).

        filename: string
            If set to 'None' will default to "Missing data bar graph".

        sub_dir: string
            Sub directory to save under; defaults to
            "<dataset_name>/Graphics".

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare
            a snapshot of the dataframe in the dataset's directory
            structure. Forced to False when called from perform.

        suppress_runtime_errors: bool
            If set to true; any error raised while generating the graph is
            turned into a RuntimeWarning so the program can keep running.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        null_features_only: bool
            If true only features that contain nulls are plotted;
            otherwise all features, ordered from most to fewest nulls.

        figsize, fontsize, labels, log, color, inline, filter, n, p, sort:
            Forwarded unchanged to 'msno.bar'. Please read the official
            documentation for more about these parameters:
            https://github.com/ResidentMario/missingno
            NOTE(review): newer missingno releases may no longer accept
            'inline' -- confirm against the pinned version.

    Returns:
        None

    Raises:
        SnapshotMismatchError is always re-raised; any other exception is
        re-raised only when 'suppress_runtime_errors' is False.

    Note -
        The default color of the bar graph comes from GRAPH_DEFAULTS
        rather than missingno's own default.
    """
    # Credit to the following author for making the 'missingno' package
    # https://github.com/ResidentMario/missingno
    try:
        # Only validate the dataframe when this method is called directly.
        if not self.__called_from_perform and not self.__check_dataframe(df):
            if display_print:
                print("Couldn't create null bar graph because"
                      " there is no missing data to display!")
            return None

        null_sorted_features, null_features = self.__sort_features_by_nulls(df)
        selected_features = (null_features if null_features_only
                             else null_sorted_features)

        if display_print:
            print("Generating graph for null bar graph...")

        # Start from a clean slate so stale figures are never saved.
        plt.close("all")
        ax = msno.bar(df[selected_features],
                      figsize=figsize,
                      log=log,
                      fontsize=fontsize,
                      labels=labels,
                      color=color,
                      inline=inline,
                      filter=filter,
                      n=n,
                      p=p,
                      sort=sort)

        # Annotation: transparent box holding the total row count.
        props = dict(boxstyle='round',
                     facecolor="#FFFFFF",
                     alpha=0)

        # 'fontsize' and 'size' are aliases of the same Text property;
        # passing both makes current Matplotlib raise a TypeError. Only
        # one is given now, using 17 (the value of the later keyword).
        # NOTE(review): confirm 17 rather than 10 is the intended size.
        ax.text(0.05,
                1.13,
                f"Clean data is {df.shape[0]} entries",
                transform=ax.transAxes,
                fontsize=17,
                verticalalignment='top',
                bbox=props)

        # Sets filename with a default name
        if not filename:
            filename = "Missing data bar graph"

        if save_file:
            if not sub_dir:
                sub_dir = f"{dataset_name}/Graphics"

            # The perform driver handles snapshots itself.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

        plt.close("all")

    except SnapshotMismatchError:
        # A snapshot mismatch must never be suppressed.
        raise

    except Exception as e:
        plt.close('all')
        if suppress_runtime_errors:
            warnings.warn(
                f"Plot null bar graph raised an error:\n{str(e)}",
                RuntimeWarning)
        else:
            raise
def plot_null_heatmap_graph(self,
                            df,
                            dataset_name,
                            display_visuals=True,
                            filename=None,
                            sub_dir=None,
                            save_file=True,
                            dataframe_snapshot=True,
                            suppress_runtime_errors=True,
                            display_print=True,
                            inline=False,
                            filter=None,
                            n=0,
                            p=0,
                            sort=None,
                            figsize=GRAPH_DEFAULTS.NULL_FIGSIZE,
                            fontsize=16,
                            labels=True,
                            cmap='RdBu',
                            vmin=-1,
                            vmax=1,
                            cbar=True):
    """
    Desc (Taken from missingno):
        Presents a `seaborn` heatmap visualization of nullity correlation
        in the given DataFrame.

    Args:
        df: pd.Dataframe
            Pandas dataframe object; the whole dataframe is plotted.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which
            your generated graph will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations
            (only shown when the object is in notebook mode).

        filename: string
            If set to 'None' will default to "Missing data heatmap graph".

        sub_dir: string
            Sub directory to save under; defaults to
            "<dataset_name>/Graphics".

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare
            a snapshot of the dataframe in the dataset's directory
            structure. Forced to False when called from perform.

        suppress_runtime_errors: bool
            If set to true; any error raised while generating the graph is
            turned into a RuntimeWarning so the program can keep running.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        inline, filter, n, p, sort, figsize, fontsize, labels, cmap,
        vmin, vmax, cbar:
            Forwarded unchanged to 'msno.heatmap'. Please read the
            official documentation for more about these parameters:
            https://github.com/ResidentMario/missingno
            NOTE(review): newer missingno releases may no longer accept
            'inline' -- confirm against the pinned version.

    Returns:
        None

    Raises:
        SnapshotMismatchError is always re-raised; any other exception is
        re-raised only when 'suppress_runtime_errors' is False.
    """
    # All credit to the following author for making the 'missingno' package
    # https://github.com/ResidentMario/missingno
    try:
        # Only validate the dataframe when this method is called directly.
        if not self.__called_from_perform and not self.__check_dataframe(df):
            if display_print:
                print("Couldn't create null heatmap graph because"
                      " there is no missing data to display!")
            return None

        if display_print:
            print("Generating graph for null heatmap...")

        # Start from a clean slate so stale figures are never saved.
        plt.close("all")
        msno.heatmap(df,
                     inline=inline,
                     filter=filter,
                     n=n,
                     p=p,
                     sort=sort,
                     figsize=figsize,
                     fontsize=fontsize,
                     labels=labels,
                     cmap=cmap,
                     vmin=vmin,
                     vmax=vmax,
                     cbar=cbar)

        # Sets filename with a default name
        if not filename:
            filename = "Missing data heatmap graph"

        if save_file:
            if not sub_dir:
                sub_dir = f"{dataset_name}/Graphics"

            # The perform driver handles snapshots itself.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

        plt.close("all")

    except SnapshotMismatchError:
        # A snapshot mismatch must never be suppressed.
        raise

    except Exception as e:
        plt.close('all')
        if suppress_runtime_errors:
            warnings.warn(
                f"Plot null heatmap raised an error:\n{str(e)}",
                RuntimeWarning)
        else:
            raise
def plot_null_dendrogram_graph(self,
                               df,
                               dataset_name,
                               display_visuals=True,
                               filename=None,
                               sub_dir=None,
                               save_file=True,
                               dataframe_snapshot=True,
                               suppress_runtime_errors=True,
                               display_print=True,
                               null_features_only=False,
                               method='average',
                               filter=None,
                               n=0,
                               p=0,
                               orientation=None,
                               figsize=GRAPH_DEFAULTS.NULL_FIGSIZE,
                               fontsize=16,
                               inline=False):
    """
    Desc (Taken from missingno):
        Fits a `scipy` hierarchical clustering algorithm to the given
        DataFrame's variables and visualizes the results as
        a `scipy` dendrogram.

    Args:
        df: pd.Dataframe
            Pandas dataframe object

        dataset_name: string
            The dataset's name; this will create a sub-directory in which
            your generated graph will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations
            (only shown when the object is in notebook mode).

        filename: string
            If set to 'None' will default to
            "Missing data dendrogram graph <method>".

        sub_dir: string
            Sub directory to save under; defaults to
            "<dataset_name>/Graphics".

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare
            a snapshot of the dataframe in the dataset's directory
            structure. Forced to False when called from perform.

        suppress_runtime_errors: bool
            If set to true; any error raised while generating the graph is
            turned into a RuntimeWarning so the program can keep running.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        null_features_only: bool
            If true only features that contain nulls are plotted;
            otherwise all features, ordered from most to fewest nulls.

        method, filter, n, p, orientation, figsize, fontsize, inline:
            Forwarded unchanged to 'msno.dendrogram'. Please read the
            official documentation for more about these parameters:
            https://github.com/ResidentMario/missingno
            NOTE(review): newer missingno releases may no longer accept
            'inline' -- confirm against the pinned version.

    Returns:
        None

    Raises:
        SnapshotMismatchError is always re-raised; any other exception is
        re-raised only when 'suppress_runtime_errors' is False.
    """
    # All credit to the following author for making the 'missingno' package
    # https://github.com/ResidentMario/missingno
    try:
        # Only validate the dataframe when this method is called directly.
        if not self.__called_from_perform and not self.__check_dataframe(df):
            if display_print:
                print("Couldn't create null dendrogram graph because"
                      " there is no missing data to display!")
            return None

        null_sorted_features, null_features = self.__sort_features_by_nulls(df)
        selected_features = (null_features if null_features_only
                             else null_sorted_features)

        if display_print:
            print("Generating graph for null dendrogram graph...")

        # Start from a clean slate so stale figures are never saved.
        plt.close("all")
        msno.dendrogram(df[selected_features],
                        method=method,
                        filter=filter,
                        n=n,
                        p=p,
                        orientation=orientation,
                        figsize=figsize,
                        fontsize=fontsize,
                        inline=inline)

        # Sets filename with a default name
        if not filename:
            filename = f"Missing data dendrogram graph {method}"

        if save_file:
            if not sub_dir:
                sub_dir = f"{dataset_name}/Graphics"

            # The perform driver handles snapshots itself.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

        plt.close("all")

    except SnapshotMismatchError:
        # A snapshot mismatch must never be suppressed.
        raise

    except Exception as e:
        plt.close('all')
        if suppress_runtime_errors:
            warnings.warn(
                f"Plot null dendrogram raised an error:\n{str(e)}",
                RuntimeWarning)
        else:
            raise
def missing_values_table(self,
                         df,
                         dataset_name,
                         display_visuals=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         display_print=True):
    """
    Creates/Saves a Pandas DataFrame object giving the percentage of
    the null data for the original DataFrame columns.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object

        dataset_name: string
            The dataset's name; this will create a sub-directory in which
            your generated table will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display the table; it is
            rendered with 'display' in notebook mode and printed otherwise.

        filename: string
            If set to 'None' will default to "Missing Data Table".

        sub_dir: string
            Sub directory to save under; defaults to
            "<dataset_name>/Tables".

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare
            a snapshot of the dataframe in the dataset's directory
            structure. Forced to False when called from perform.

        suppress_runtime_errors: bool
            If set to true; any error raised while generating the table is
            turned into a RuntimeWarning so the program can keep running.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

    Returns:
        None

    Raises:
        SnapshotMismatchError is always re-raised; any other exception is
        re-raised only when 'suppress_runtime_errors' is False.
    """
    try:
        # Only validate the dataframe when this method is called directly.
        # (A second, result-discarding call to the same check used to
        # follow the print below; it had no effect and was removed.)
        if not self.__called_from_perform and not self.__check_dataframe(df):
            if display_print:
                print("Couldn't create missing values table because"
                      " there is no missing data to display!")
            return None

        if display_print:
            print("Creating missing values table...")

        # Resolves to the module-level helper imported from pandas_utils,
        # not to this method (class attributes are not in function scope).
        mis_val_table_ren_columns = missing_values_table(df)

        if display_print:
            print(f"Your selected dataframe has {str(df.shape[1])} columns.\n"
                  f"It has {str(mis_val_table_ren_columns.shape[0])} columns missing data.\n")

        if display_visuals:
            if self.__notebook_mode:
                display(mis_val_table_ren_columns)
            else:
                print(mis_val_table_ren_columns)

        # Sets filename with a default name
        if not filename:
            filename = "Missing Data Table"

        if save_file:
            if not sub_dir:
                sub_dir = f"{dataset_name}/Tables"

            # The perform driver handles snapshots itself.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_table_as_plot(df=df,
                                    df_features=self.__df_features,
                                    filename=filename,
                                    show_index=True,
                                    sub_dir=sub_dir,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    table=mis_val_table_ren_columns)

    except SnapshotMismatchError:
        # A snapshot mismatch must never be suppressed.
        raise

    except Exception as e:
        plt.close('all')
        if suppress_runtime_errors:
            warnings.warn(
                f"Missing data table raised an error:\n{str(e)}",
                RuntimeWarning)
        else:
            raise
def __check_dataframe(self,
                      df):
    """
    Decide whether a dataframe is worth running null analysis on.

    Args:
        df: pd.Dataframe
            Pandas Dataframe object.

    Returns:
        True when the dataframe has at least one row and contains at least
        one null value; False otherwise.

    Note:
        Kept as its own method so further sanity checks can be added
        in the future.
    """
    contains_nulls = bool(df.isnull().values.any())
    has_rows = df.shape[0] != 0
    return contains_nulls and has_rows
def __sort_features_by_nulls(self,
                             df):
    """
    Orders the dataframe's features from most nulls to fewest nulls.

    Args:
        df: pd.Dataframe
            Pandas Dataframe object.

    Returns:
        A tuple '(null_sorted_features, null_features)': every feature
        name ordered by descending null count, and the leading subset of
        those names whose null count is greater than zero. Both lists are
        empty for a dataframe without columns.
    """
    # Count once; the same Series supplies both the names and the counts.
    null_counts = df.isnull().sum()

    # Sort ascending on (count, name) and then reverse, so ties between
    # equal counts keep the same (descending name) order as before.
    ranked = sorted(zip(null_counts.values.tolist(),
                        null_counts.index.tolist()))
    ranked.reverse()

    null_sorted_features = [feature for _, feature in ranked]

    # Selecting by count instead of slicing at the first zero keeps the
    # last feature when *every* feature has nulls, and copes with an
    # empty column list.
    null_features = [feature for count, feature in ranked if count > 0]

    return null_sorted_features, null_features