Source code for eflow.data_analysis.feature_analysis

from eflow._hidden.parent_objects import FileOutput
from eflow._hidden.general_objects import DataFrameSnapshot
from eflow.utils.pandas_utils import descr_table,value_counts_table
from eflow._hidden.custom_exceptions import UnsatisfiedRequirments, SnapshotMismatchError
from eflow._hidden.constants import GRAPH_DEFAULTS
from eflow._hidden.parent_objects import DataAnalysis
from eflow.utils.pandas_utils import check_if_feature_exists, generate_meta_data, generate_entropy_table, feature_correlation_table, average_feature_correlation_table, df_auto_binning
from eflow.utils.sys_utils import dict_to_json_file, pickle_object_to_file, create_dir_structure

import warnings
import random
import numpy as np
from matplotlib import pyplot as plt
import copy
from IPython.display import display
import seaborn as sns
import pandas as pd
from scipy import stats

__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"


class FeatureAnalysis(DataAnalysis):
    """
    Analyzes the feature data of a pandas DataFrame object.
    (Ignores null data when displaying data and creates 2D graphics with
    two features. 3D graphics with three features may be added in the
    future.)
    """

    def __init__(self,
                 df_features,
                 dataset_sub_dir="",
                 dataset_name="",
                 overwrite_full_path=None,
                 notebook_mode=False):
        """
        Args:
            df_features: DataFrameTypes object from eflow.

            dataset_sub_dir: string
                Appends to the absolute directory of the output folder.

            dataset_name: string
                Creates a parent or "project" folder in which all
                sub-directories will be inner nested.

            overwrite_full_path: string, None
                Overwrites the path to the parent folder.

            notebook_mode: bool
                If in a python notebook, display visualizations in the
                notebook.
        """
        DataAnalysis.__init__(self,
                              f'{dataset_name}/{dataset_sub_dir}',
                              overwrite_full_path)

        self.__df_features = copy.deepcopy(df_features)
        self.__notebook_mode = copy.deepcopy(notebook_mode)

        # Tracks whether 'perform_analysis' made the call, so we know if
        # the dataframe needs to be re-checked.
        self.__called_from_perform = False
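A minimal construction sketch (not part of the original source). The import path for `DataFrameTypes` and the CSV file are assumptions for illustration; `FeatureAnalysis` itself lives in this module.

    import pandas as pd
    from eflow.data_analysis.feature_analysis import FeatureAnalysis
    # Assumed location of DataFrameTypes; adjust to your eflow version.
    from eflow.foundation import DataFrameTypes

    df = pd.read_csv("titanic.csv")        # hypothetical dataset
    df_features = DataFrameTypes(df)       # infers feature types from df
    feature_analysis = FeatureAnalysis(df_features,
                                       dataset_name="Titanic Data",
                                       notebook_mode=True)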
    def perform_analysis(self,
                         df,
                         dataset_name,
                         target_features=None,
                         display_visuals=True,
                         display_print=True,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         aggregate_target_feature=True,
                         selected_features=None,
                         extra_tables=True,
                         statistical_analysis_on_aggregates=True):
        """
        Performs all public methods that generate visualizations/insights
        about the data.

        Note:
            Pretty much my personal lazy button for running the entire
            object without specifying any method in particular.

        Args:
            df: pd.Dataframe
                Pandas dataframe object.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            target_features: collection of strings or None
                Feature names that exist in both the init df_features and
                the passed dataframe.
                Note:
                    If set to 'None', df_features will try to extract
                    the target feature.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                The given size of the plot.

            aggregate_target_feature: bool
                Aggregate the data of the target feature if the data is
                non-continuous.
                Note:
                    In the future this will also work with continuous data.

            selected_features: collection object of features
                Will only focus on these selected features and ignore the
                other given features.

            extra_tables: bool
                When handling two types of features, if set to true this
                will generate any extra tables that might be helpful.
                Note:
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

            statistical_analysis_on_aggregates: bool
                If set to true, the function
                'statistical_analysis_on_aggregates' will run; it aggregates
                the data of the target feature either by discrete values or
                by binning/labeling continuous data.

        Raises:
            Raises an error if an empty dataframe is passed to this function
            or if the given dataframe doesn't match the saved snapshot.
        """
        try:
            self.__called_from_perform = False

            # Raise an error on an empty or all-null dataframe
            if df.shape[0] == 0 or np.sum(np.sum(df.isnull()).values) == df.shape[0]:
                raise UnsatisfiedRequirments("Dataframe must contain valid data and "
                                             "not be empty or filled with nulls!")

            # Compare the dataframe json file's snapshot to the given
            # dataframe's snapshot
            if dataframe_snapshot:
                df_snapshot = DataFrameSnapshot()
                df_snapshot.check_create_snapshot(df,
                                                  self.__df_features,
                                                  directory_path=self.folder_path,
                                                  sub_dir=f"{dataset_name}/_Extras")

            generate_meta_data(df,
                               self.folder_path,
                               f"{dataset_name}" + "/_Extras")

            generate_entropy_table(df,
                                   self.__df_features,
                                   self.folder_path,
                                   f"{dataset_name}" + "/_Extras/Statistics")

            corr_df = feature_correlation_table(df)

            # Ensure that the correlation dataframe has numerical values
            if len(corr_df):
                self.save_table_as_plot(df=df,
                                        table=corr_df,
                                        show_index=True,
                                        format_float_pos=7,
                                        df_features=self.__df_features,
                                        filename="Correlation Table",
                                        sub_dir=f"{dataset_name}" + "/_Extras/Statistics",
                                        dataframe_snapshot=False,
                                        suppress_runtime_errors=suppress_runtime_errors,
                                        meta_data=False)

                corr_df = average_feature_correlation_table(df)

                self.save_table_as_plot(df=df,
                                        table=corr_df,
                                        show_index=True,
                                        format_float_pos=7,
                                        df_features=self.__df_features,
                                        filename="Average Correlation Table",
                                        sub_dir=f"{dataset_name}" + "/_Extras/Statistics",
                                        dataframe_snapshot=False,
                                        suppress_runtime_errors=suppress_runtime_errors,
                                        meta_data=False)

                # Init color ranking for the plot
                # Ref: http://tinyurl.com/ydgjtmty
                plt.figure(figsize=(13, 10))
                pal = sns.color_palette("GnBu_d",
                                        len(corr_df["Average Correlations"]))
                rank = np.array(corr_df["Average Correlations"]).argsort().argsort()
                ax = sns.barplot(y=corr_df.index.tolist(),
                                 x=corr_df["Average Correlations"],
                                 palette=np.array(pal[::-1])[rank])
                plt.xticks(rotation=0, fontsize=15)
                plt.yticks(fontsize=15)
                # Correlation averages run along the x-axis; feature names
                # along the y-axis.
                plt.xlabel("Correlation Average", fontsize=20, labelpad=20)
                plt.ylabel("Features", fontsize=20, labelpad=20)
                plt.title("Average Feature Correlation", fontsize=15)

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename="Average Correlation Rank Graph",
                               sub_dir=f"{dataset_name}" + "/_Extras/Statistics",
                               dataframe_snapshot=False,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=False)
                plt.close("all")

                del corr_df

            # Set to true to signal that the call was made from perform
            self.__called_from_perform = True

            if isinstance(target_features, str):
                target_features = {target_features}

            if not target_features:
                target_features = {None}

            if isinstance(target_features, list):
                target_features = set(target_features)

            # Iterate through all target features
            for target_feature in target_features:

                # Iterate through all dataframe features
                for feature_name in df.columns:

                    # Only compare selected features if the user specified features
                    if selected_features and feature_name not in selected_features and feature_name != target_feature:
                        continue

                    # Ignore the feature if it is found to be purely null
                    if feature_name in self.__df_features.null_only_features():
                        continue

                    # Ignore datetime features
                    if feature_name in self.__df_features.datetime_features():
                        continue

                    self.analyze_feature(df,
                                         feature_name,
                                         dataset_name,
                                         target_feature=target_feature,
                                         display_visuals=display_visuals,
                                         save_file=save_file,
                                         dataframe_snapshot=dataframe_snapshot,
                                         suppress_runtime_errors=suppress_runtime_errors,
                                         figsize=figsize,
                                         display_print=display_print,
                                         extra_tables=extra_tables)

                    # Aggregate data if the target feature exists
                    if target_feature and feature_name == target_feature and aggregate_target_feature:

                        # -----
                        if target_feature in self.__df_features.non_numerical_features() or \
                                target_feature in self.__df_features.bool_features():

                            target_feature_values = df[target_feature].value_counts(sort=False).index.to_list()

                            # Begin aggregation
                            for target_feature_val in target_feature_values:

                                repr_target_feature_val = target_feature_val

                                # Convert to the best bool representation of the value
                                if target_feature in self.__df_features.bool_features():
                                    try:
                                        repr_target_feature_val = bool(int(repr_target_feature_val))
                                    except ValueError:
                                        continue
                                    except TypeError:
                                        continue

                                # Iterate through all features to generate new
                                # graphs for the aggregation
                                for f_name in df.columns:

                                    if selected_features and f_name not in selected_features and f_name != target_feature:
                                        continue

                                    if f_name == target_feature:
                                        continue

                                    if display_print:
                                        if repr_target_feature_val:
                                            print(f"Target feature {target_feature} set to {target_feature_val}; also known as {repr_target_feature_val}.")
                                        else:
                                            print(f"Target feature {target_feature} set to {target_feature_val}.")

                                    try:
                                        self.analyze_feature(df[df[target_feature] == target_feature_val],
                                                             f_name,
                                                             dataset_name,
                                                             target_feature=target_feature,
                                                             display_visuals=display_visuals,
                                                             save_file=save_file,
                                                             dataframe_snapshot=dataframe_snapshot,
                                                             suppress_runtime_errors=suppress_runtime_errors,
                                                             figsize=figsize,
                                                             display_print=display_print,
                                                             sub_dir=f"{dataset_name}/{target_feature}/Where {target_feature} = {repr_target_feature_val}/{f_name}",
                                                             extra_tables=False)
                                    except Exception as e:
                                        print(f"Error found on feature {f_name}: {e}")

            # Report any features that were missed
            missed_features = set(df.columns) ^ self.__df_features.all_features()

            if len(missed_features) != 0 and display_print:
                print("Some features were not analyzed by perform_analysis!")
                for feature_name in missed_features:
                    print(f"\t\tFeature: {feature_name}")

            if statistical_analysis_on_aggregates and target_feature:
                self.statistical_analysis_on_aggregates(df,
                                                        target_features,
                                                        dataset_name,
                                                        dataframe_snapshot=False)

        # Ensure that 'called from perform' is turned off
        finally:
            self.__called_from_perform = False
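A usage sketch for the method above (not from the original source; `feature_analysis`, `df`, and the column names are the hypothetical objects from the earlier sketch): one call walks every feature and aggregates against a target.

    feature_analysis.perform_analysis(df,
                                      dataset_name="Titanic Data",
                                      target_features=["Survived"],
                                      selected_features=["Survived", "Sex",
                                                         "Age", "Pclass"],
                                      suppress_runtime_errors=True)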
    def analyze_feature(self,
                        df,
                        feature_name,
                        dataset_name,
                        target_feature=None,
                        display_visuals=True,
                        display_print=True,
                        sub_dir=None,
                        save_file=True,
                        dataframe_snapshot=True,
                        suppress_runtime_errors=True,
                        figsize=GRAPH_DEFAULTS.FIGSIZE,
                        extra_tables=True):
        """
        Generates all graphics for the given feature and its relationship to
        the target feature.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            target_feature: string
                Will create graphics involving this feature with the main
                feature 'feature_name'.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Saves file if set to True; doesn't if set to False.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            extra_tables: bool
                When handling two types of features, if set to true this
                will generate any extra tables that might be helpful.
                Note:
                    These graphics may create duplicates if you already
                    applied an aggregation in 'perform_analysis'.

        Raises:
            Raises an error if the json file's snapshot of the given
            dataframe doesn't match the given dataframe.
        """
        # -----
        check_if_feature_exists(df, feature_name)

        colors = self.__get_feature_colors(df, feature_name)

        # Display colors
        if colors and display_print:
            print(f"Colors:\n{colors}\n")

        # Check that the features exist in df_features and, by extension,
        # the dataframe
        if target_feature:
            if target_feature not in self.__df_features.all_features():
                raise UnsatisfiedRequirments("Target feature does not exist in pre-defined "
                                             "df_features!")

            if target_feature not in df.columns:
                raise UnsatisfiedRequirments("Target feature does not exist in "
                                             "the dataframe!")

        if feature_name not in self.__df_features.all_features():
            raise UnsatisfiedRequirments("Feature name does not exist in pre-defined "
                                         "df_features!")

        if feature_name not in df.columns:
            raise UnsatisfiedRequirments("Feature name does not exist in "
                                         "the dataframe!")

        # Generate the sub directory structure for plots involving two features
        two_dim_sub_dir = None
        if sub_dir:
            two_dim_sub_dir = sub_dir
        else:
            if target_feature:
                two_dim_sub_dir = f"{dataset_name}/{target_feature}/Two feature analysis/{target_feature} by {feature_name}"

        # -----
        if feature_name in self.__df_features.non_numerical_features() or feature_name in self.__df_features.bool_features():

            # Pie graphs should only have five or fewer slices.
            # (The function can handle ample more than this; just stylistically.)
            if len(df[feature_name].value_counts().index) <= 5:
                self.plot_pie_graph(df,
                                    feature_name,
                                    dataset_name=dataset_name,
                                    display_visuals=display_visuals,
                                    sub_dir=sub_dir,
                                    save_file=save_file,
                                    pallete=colors,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    figsize=figsize,
                                    display_print=display_print)

            # Count plot without colors
            self.plot_count_graph(df,
                                  feature_name,
                                  dataset_name=dataset_name,
                                  display_visuals=display_visuals,
                                  sub_dir=sub_dir,
                                  save_file=save_file,
                                  dataframe_snapshot=dataframe_snapshot,
                                  suppress_runtime_errors=suppress_runtime_errors,
                                  display_print=display_print)

            # Count plot with colors
            if colors:
                self.plot_count_graph(df,
                                      feature_name,
                                      dataset_name=dataset_name,
                                      display_visuals=display_visuals,
                                      sub_dir=sub_dir,
                                      save_file=save_file,
                                      palette=colors,
                                      dataframe_snapshot=dataframe_snapshot,
                                      suppress_runtime_errors=suppress_runtime_errors,
                                      figsize=figsize,
                                      display_print=display_print)

            # Generate value counts table
            self.value_counts_table(df,
                                    feature_name,
                                    dataset_name=dataset_name,
                                    display_visuals=display_visuals,
                                    sub_dir=sub_dir,
                                    save_file=save_file,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    display_print=display_print)

        # -----
        elif feature_name in self.__df_features.continuous_numerical_features():

            # Plot distance plot graph
            self.plot_distance_graph(df,
                                     feature_name,
                                     dataset_name=dataset_name,
                                     display_visuals=display_visuals,
                                     sub_dir=sub_dir,
                                     save_file=save_file,
                                     dataframe_snapshot=dataframe_snapshot,
                                     suppress_runtime_errors=suppress_runtime_errors,
                                     figsize=figsize,
                                     display_print=display_print)

            # Create description table
            self.descr_table(df,
                             feature_name,
                             dataset_name=dataset_name,
                             display_visuals=display_visuals,
                             sub_dir=sub_dir,
                             save_file=save_file,
                             dataframe_snapshot=dataframe_snapshot,
                             suppress_runtime_errors=suppress_runtime_errors,
                             display_print=display_print)

        if target_feature and feature_name != target_feature:

            # Simplified conditional check for finding the type relationship
            # between the two features
            num_features = []
            non_num_features = []

            if target_feature in self.__df_features.continuous_numerical_features():
                num_features.append(target_feature)
            elif target_feature in self.__df_features.datetime_features():
                pass
            elif target_feature not in self.__df_features.continuous_numerical_features():
                non_num_features.append(target_feature)

            if feature_name in self.__df_features.continuous_numerical_features():
                num_features.append(feature_name)
            elif feature_name in self.__df_features.datetime_features():
                pass
            elif feature_name not in self.__df_features.continuous_numerical_features():
                non_num_features.append(feature_name)

            # Two different types of features (numerical and non-numerical)
            if len(num_features) == 1 and len(non_num_features) == 1:

                # Extract the feature names into better named variables for sanity
                numerical_feature = num_features.pop()
                non_numerical_feature = non_num_features.pop()

                # Generate violin plot
                self.plot_violin_graph(df,
                                       non_numerical_feature,
                                       dataset_name=dataset_name,
                                       other_feature_name=numerical_feature,
                                       display_visuals=display_visuals,
                                       sub_dir=two_dim_sub_dir,
                                       save_file=save_file,
                                       palette=colors,
                                       dataframe_snapshot=dataframe_snapshot,
                                       suppress_runtime_errors=suppress_runtime_errors,
                                       figsize=figsize,
                                       display_print=display_print)

                # Generate ridge graph
                self.plot_ridge_graph(df,
                                      non_numerical_feature,
                                      dataset_name=dataset_name,
                                      other_feature_name=numerical_feature,
                                      display_visuals=display_visuals,
                                      sub_dir=two_dim_sub_dir,
                                      save_file=save_file,
                                      dataframe_snapshot=dataframe_snapshot,
                                      palette=colors,
                                      suppress_runtime_errors=suppress_runtime_errors,
                                      figsize=figsize,
                                      display_print=display_print)

                # Generate tables based on the aggregation of the
                # non-numerical feature
                if extra_tables:
                    for val in df[non_numerical_feature].unique():
                        if display_print:
                            print(f"Where {non_numerical_feature} = {val}")

                        # Create a new sub dir based on the aggregation
                        two_dim_desc_sub_dir = copy.deepcopy(two_dim_sub_dir)
                        if not two_dim_desc_sub_dir:
                            two_dim_desc_sub_dir = ""
                        two_dim_desc_sub_dir += "/" + str(val)

                        # Create a new dataframe on the aggregated value and
                        # check for nans
                        tmp_df = df[df[non_numerical_feature] == val]
                        if np.sum(tmp_df[numerical_feature].isnull()) != tmp_df.shape[0]:
                            self.descr_table(df=tmp_df,
                                             feature_name=numerical_feature,
                                             dataset_name=dataset_name,
                                             display_visuals=display_visuals,
                                             display_print=display_print,
                                             sub_dir=two_dim_desc_sub_dir,
                                             dataframe_snapshot=False)
                        if display_print:
                            print("\n")

                        del tmp_df

            elif len(non_num_features) == 2:

                # Generate tables based on the aggregation of the
                # non-numerical feature
                if extra_tables:
                    for val in df[feature_name].dropna().unique():
                        if display_print:
                            print(f"Where {feature_name} = {val}")

                        # Create a new sub dir based on the aggregation
                        two_dim_desc_sub_dir = copy.deepcopy(two_dim_sub_dir)
                        if not two_dim_desc_sub_dir:
                            two_dim_desc_sub_dir = ""
                        two_dim_desc_sub_dir += "/" + str(val)

                        # Create a new dataframe on the aggregated value and
                        # check for nans
                        tmp_df = df[df[feature_name] == val]
                        if np.sum(tmp_df[target_feature].isnull()) != tmp_df.shape[0]:
                            self.value_counts_table(df=df[df[feature_name] == val],
                                                    feature_name=target_feature,
                                                    dataset_name=dataset_name,
                                                    display_visuals=display_visuals,
                                                    display_print=display_print,
                                                    sub_dir=two_dim_desc_sub_dir,
                                                    dataframe_snapshot=False)
                        if display_print:
                            print("\n")

                self.group_by_feature_value_count_table(df,
                                                        feature_name,
                                                        dataset_name=dataset_name,
                                                        other_feature_name=target_feature,
                                                        display_visuals=display_visuals,
                                                        sub_dir=two_dim_sub_dir,
                                                        save_file=save_file,
                                                        dataframe_snapshot=dataframe_snapshot,
                                                        suppress_runtime_errors=suppress_runtime_errors,
                                                        display_print=display_print)

                self.plot_multi_bar_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          stacked=False)

                self.plot_multi_bar_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          stacked=True)

            elif len(num_features) == 2:

                # Generate jointplot graph with scatter and kde
                self.plot_jointplot_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          color=colors,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print)

                # Generate jointplot graph with kde
                self.plot_jointplot_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          color=colors,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          kind="kde")

        if display_print:
            print("\n\n")
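A sketch of analyzing a single feature against a target (not from the original source; column names are hypothetical):

    feature_analysis.analyze_feature(df,
                                     "Age",
                                     dataset_name="Titanic Data",
                                     target_feature="Survived",
                                     extra_tables=False)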
    def plot_distance_graph(self,
                            df,
                            feature_name,
                            dataset_name,
                            display_visuals=True,
                            display_print=True,
                            filename=None,
                            sub_dir=None,
                            save_file=True,
                            dataframe_snapshot=True,
                            suppress_runtime_errors=True,
                            figsize=GRAPH_DEFAULTS.FIGSIZE,
                            bins=None,
                            norm_hist=True,
                            hist=True,
                            kde=True,
                            colors=None,
                            fit=None,
                            fit_kws=None):
        """
        Display a distance plot and save the graph in the correct directory.

        Args:
            df: pd.Dataframe
                Pandas dataframe object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                Name to give the file.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                The given size of the plot.

            bins: int
                Specification of hist bins, or None to use the
                Freedman-Diaconis rule.

            norm_hist: bool
                If True, the histogram height shows a density rather than a
                count. This is implied if a KDE or fitted density is plotted.

            hist: bool
                Whether to plot a (normed) histogram.

            kde: bool
                Whether to plot a gaussian kernel density estimate.

            colors: matplotlib color
                Color to plot everything but the fitted curve in.

            fit: functional method
                An object with a fit method, returning a tuple that can be
                passed to a pdf method as positional arguments following a
                grid of values to evaluate the pdf on.

            fit_kws: dict, optional
                Keyword arguments for the underlying plotting functions.

        Credit to seaborn's author:
            Michael Waskom
            Git username: mwaskom
            Doc Link: http://tinyurl.com/ycco2hok

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            # Error check on null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Distance plot graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if display_print:
                print(f"Generating graph for distance plot on {feature_name}")

            feature_values = pd.to_numeric(df[feature_name].dropna(),
                                           errors='coerce').dropna()

            if not len(feature_values):
                raise ValueError(
                    f"The given feature {feature_name} doesn't seem to convert to a numeric vector.")

            # Closes up any past graph info
            plt.close('all')

            # Set foundation graph info
            sns.set(style="whitegrid")
            plt.figure(figsize=figsize)
            plt.title("Distance Plot: " + feature_name)

            # Create seaborn graph
            sns.distplot(feature_values,
                         bins=bins,
                         hist=hist,
                         kde=kde,
                         fit=fit,
                         fit_kws=fit_kws,
                         color=colors,
                         norm_hist=norm_hist)

            # Pass a default name if needed
            if not filename:
                filename = f"Distance plot graph on {feature_name}"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

            if self.__notebook_mode and display_visuals:
                plt.show()

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Distance plot graph threw an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
    def plot_violin_graph(self,
                          df,
                          feature_name,
                          dataset_name,
                          other_feature_name,
                          display_visuals=True,
                          display_print=True,
                          filename=None,
                          sub_dir=None,
                          save_file=True,
                          dataframe_snapshot=True,
                          suppress_runtime_errors=True,
                          figsize=GRAPH_DEFAULTS.FIGSIZE,
                          order=None,
                          cut=2,
                          scale='area',
                          gridsize=100,
                          width=0.8,
                          palette=None,
                          saturation=0.75):
        """
        Display a violin plot and save the graph in the correct directory.

        Args:
            df: pd.Dataframe
                Pandas dataframe object.

            feature_name: string
                Specified feature column name (plotted along x).

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            other_feature_name: string
                Specified feature column name (plotted along y).

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                Name to give the file.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Size of the given plot.

            order: list of strings
                Order to plot the categorical levels in; otherwise the levels
                are inferred from the data objects.

            cut: float
                Distance, in units of bandwidth size, to extend the density
                past the extreme datapoints. Set to 0 to limit the violin
                range to the range of the observed data (i.e., to have the
                same effect as trim=True in ggplot).

            scale: string {area, count, width}
                The method used to scale the width of each violin. If 'area',
                each violin will have the same area. If 'count', the width of
                the violins will be scaled by the number of observations in
                that bin. If 'width', each violin will have the same width.

            gridsize: int
                Number of points in the discrete grid used to compute the
                kernel density estimate.

            width: float
                Width of a full element when not using hue nesting, or width
                of all the elements for one level of the major grouping
                variable.

            palette: dict or string
                Colors to use for the different levels of the hue variable.
                Should be something that can be interpreted by
                color_palette(), or a dictionary mapping hue levels to
                matplotlib colors.

            saturation: float
                Proportion of the original saturation to draw colors at.
                Large patches often look better with slightly desaturated
                colors, but set this to 1 if you want the plot colors to
                perfectly match the input color spec.

        Credit to seaborn's author:
            Michael Waskom
            Git username: mwaskom
            Doc link: http://tinyurl.com/y3hxxzgv

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            if other_feature_name:
                check_if_feature_exists(df, other_feature_name)

            # Error check and create the title/part of the default file name
            found_features = []
            feature_title = ""
            for feature in (feature_name, other_feature_name):
                if feature:
                    if np.sum(df[feature].isnull()) == df.shape[0]:
                        raise UnsatisfiedRequirments(
                            "Violin plot graph couldn't be generated because " +
                            f"there is only missing data to display in {feature}!")

                    found_features.append(feature)
                    if len(found_features) == 1:
                        feature_title = f"{feature}"
                    else:
                        feature_title += f" by {feature}"

            if not len(found_features):
                raise UnsatisfiedRequirments("Both x and y features are of type 'None'. "
                                             "Please pass at least one feature.")

            del found_features

            if display_print:
                print("Generating violin graph on " + feature_title)

            # Closes up any past graph info
            plt.close('all')

            # Set plot structure
            plt.figure(figsize=figsize)
            plt.title("Violin Plot: " + feature_title)

            feature_values = pd.to_numeric(df[other_feature_name],
                                           errors='coerce').dropna()

            if not len(feature_values):
                raise ValueError("The y feature must contain numerical values.")

            x_values = copy.deepcopy(df[feature_name].dropna())

            # Sort the x values together with the y values
            x_values, feature_values = self.__sort_two_lists(x_values,
                                                             feature_values)

            warnings.filterwarnings("ignore")

            sns.violinplot(x=x_values,
                           y=feature_values,
                           order=order,
                           cut=cut,
                           scale=scale,
                           gridsize=gridsize,
                           width=width,
                           palette=palette,
                           saturation=saturation)

            warnings.filterwarnings("default")

            # Pass a default name if needed
            if not filename:
                filename = f"Violin plot graph on {feature_title}."

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

            if self.__notebook_mode and display_visuals:
                plt.show()

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            warnings.filterwarnings("default")
            if suppress_runtime_errors:
                warnings.warn(
                    f"Violin plot graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
            warnings.filterwarnings("default")
    def plot_count_graph(self,
                         df,
                         feature_name,
                         dataset_name,
                         display_visuals=True,
                         display_print=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         flip_axis=False,
                         palette="PuBu"):
        """
        Display a barplot with color ranking from a feature's value counts
        from the seaborn library and save the graph in the correct directory
        structure.

        Args:
            df: pd.Dataframe
                Pandas dataframe object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                Name to give the file.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Size for the given plot.

            flip_axis: bool
                Flip the plotting axis from x to y if set to 'True'.

            palette: dict or string
                String representation of a seaborn color palette to use for
                count ranking.

        Credit to seaborn's author:
            Michael Waskom
            Git username: mwaskom
            Link: http://tinyurl.com/y4pzrgcf

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            # Error check on null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Count plot graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if display_print:
                print(f"Count plot graph on {feature_name}")

            # Closes up any past graph info
            plt.close('all')

            # Set graph info
            plt.figure(figsize=figsize)
            sns.set(style="whitegrid")

            value_counts = df[feature_name].dropna().value_counts(sort=True)

            feature_values, counts = value_counts.index, value_counts.values

            del value_counts

            # Find and rank values based on counts for the color variation
            # of the graph
            if not palette:
                palette = "PuBu"

            if isinstance(palette, str):
                rank_list = np.argsort(-np.array(counts)).argsort()
                pal = sns.color_palette(palette, len(counts))
                palette = np.array(pal[::-1])[rank_list]

            plt.clf()

            # Represent 0/1 (or "0"/"1") values of bool features as bools
            if feature_name in self.__df_features.bool_features():
                converted_values = []
                for val in feature_values:
                    try:
                        val = float(val)
                    except (ValueError, TypeError):
                        pass
                    converted_values.append(val)
                feature_values = [bool(val) if val == 0 or val == 1 else val
                                  for val in converted_values]

            # Flip the graph for visual flare
            if flip_axis:
                ax = sns.barplot(x=counts,
                                 y=feature_values,
                                 palette=palette,
                                 order=feature_values)
            else:
                ax = sns.barplot(x=feature_values,
                                 y=counts,
                                 palette=palette,
                                 order=feature_values)

            # Labels for the numerical count of each bar
            for p in ax.patches:
                height = p.get_height()
                ax.text(p.get_x() + p.get_width() / 2.,
                        height + 3,
                        '{:1}'.format(height),
                        ha="center")

            plt.title("Category Count Plot: " + feature_name)

            # Pass a default name if needed
            if not filename:
                filename = f"Count plot graph on {feature_name}"
                if isinstance(palette, np.ndarray):
                    filename += " with count color ranking."

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

            if self.__notebook_mode and display_visuals:
                plt.show()

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Plot count graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
    def plot_pie_graph(self,
                       df,
                       feature_name,
                       dataset_name,
                       display_visuals=True,
                       display_print=True,
                       filename=None,
                       sub_dir=None,
                       save_file=True,
                       dataframe_snapshot=True,
                       suppress_runtime_errors=True,
                       figsize=GRAPH_DEFAULTS.FIGSIZE,
                       pallete=None):
        """
        Display a pie graph and save the graph in the correct directory.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None' will default to a pre-defined string;
                otherwise it is used as the actual filename.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Size of the plot.

            pallete: dict or string
                Dictionary mapping all feature values to hex color values.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Pie graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if display_print:
                print(f"Pie graph on {feature_name}")

            # Closes up any past graph info
            plt.close('all')

            # Find value counts
            value_counts = df[feature_name].dropna().value_counts(sort=False)
            feature_values = value_counts.index.tolist()
            value_count_list = value_counts.values.tolist()

            color_list = None

            plt.figure(figsize=figsize)

            # Sort the feature values together with their counts
            feature_values, value_count_list = self.__sort_two_lists(feature_values,
                                                                     value_count_list)

            if isinstance(pallete, dict):
                color_list = []
                for value in tuple(feature_values):
                    try:
                        color_list.append(pallete[value])
                    except KeyError:
                        raise KeyError(f"The given value '{value}' in feature '{feature_name}'" +
                                       " was not found in the passed color dict.")

            # Explode the largest slice of the pie graph
            explode_array = [0] * len(feature_values)
            explode_array[np.array(value_count_list).argmax()] = .03

            # Plot pie graph
            plt.pie(
                tuple(value_count_list),
                labels=tuple(feature_values),
                shadow=False,
                colors=color_list,
                explode=tuple(explode_array),
                startangle=90,
                autopct='%1.1f%%',
            )

            # Set foundation graph info
            plt.gcf()
            plt.title("Pie Chart: " + feature_name)
            plt.legend(fancybox=True, facecolor='w')

            # Set foundation
            plt.axis('equal')

            # Pass a default name if needed
            if not filename:
                filename = f"Pie graph on {feature_name}"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

            if self.__notebook_mode and display_visuals:
                plt.show()

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Pie graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
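A sketch using the `pallete` dict form (note the parameter's spelling in this API); every value of the feature must appear as a key or a KeyError is raised. The values and hex colors are hypothetical:

    feature_analysis.plot_pie_graph(df,
                                    "Sex",
                                    dataset_name="Titanic Data",
                                    pallete={"male": "#4c72b0",
                                             "female": "#dd8452"})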
    def plot_ridge_graph(self,
                         df,
                         feature_name,
                         dataset_name,
                         other_feature_name,
                         display_visuals=True,
                         display_print=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         palette=None):
        """
        Display a ridge plot and save the graph in the correct directory.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            other_feature_name: string
                Feature to compare to.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None' will default to a pre-defined string;
                otherwise it is used as the actual filename.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Tuple object to represent the plot/image's size.

            palette: dict or string
                Dictionary mapping all feature values to hex color values.

        Note:
            A large part of this was taken from:
            http://tinyurl.com/tuou2cn

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            # -----
            check_if_feature_exists(df, other_feature_name)

            # Error check on null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Ridge plot graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Ridge plot graph couldn't be generated because " +
                    f"there is only missing data to display in {other_feature_name}!")

            if display_print:
                print(f"Ridge plot graph on {feature_name} by {other_feature_name}.")

            sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

            # Temporarily turn off chained assignment warnings
            chained_assignment = pd.options.mode.chained_assignment
            pd.options.mode.chained_assignment = None

            tmp_df = copy.deepcopy(df[[feature_name, other_feature_name]])
            tmp_df[other_feature_name] = pd.to_numeric(tmp_df[other_feature_name],
                                                       errors='coerce')
            tmp_df.dropna(inplace=True)

            # Remove any values that only return a single value back
            for val in tmp_df[feature_name].dropna().unique():

                feature_value_counts = tmp_df[other_feature_name][
                    tmp_df[feature_name] == val].dropna().value_counts()

                count_length = len(feature_value_counts.values)

                if len(feature_value_counts.index.to_list()) <= 1 or count_length == 0:
                    tmp_df = tmp_df[tmp_df[feature_name] != val]
                elif count_length == 1 and feature_value_counts.values[0] == 1:
                    tmp_df = tmp_df[tmp_df[feature_name] != val]

            pd.options.mode.chained_assignment = chained_assignment
            del chained_assignment

            # Suppress any warnings that seaborn's backend raises
            warnings.filterwarnings("ignore")

            sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

            if not palette:
                palette = sns.cubehelix_palette(10, rot=-.20, light=.7)

            # Initialize the FacetGrid object
            g = sns.FacetGrid(tmp_df,
                              row=feature_name,
                              hue=feature_name,
                              aspect=15,
                              height=.4,
                              palette=palette)

            # Draw the densities in a few steps
            g.map(sns.kdeplot, other_feature_name,
                  clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
            g.map(sns.kdeplot, other_feature_name,
                  clip_on=False, color="w", lw=2, bw=.2)
            g.map(plt.axhline, y=0, lw=2, clip_on=False)

            # Define and use a simple function to label the plot in axes
            # coordinates
            def label(x, color, label):
                ax = plt.gca()
                ax.text(-.1, .2, label, fontweight="bold", color=color,
                        ha="left", va="center", transform=ax.transAxes)

            g.map(label, other_feature_name)

            # Set the subplots to overlap
            g.fig.subplots_adjust(hspace=-.25)

            # Remove axes details that don't play well with overlap
            g.set_titles("")
            g.set(yticks=[])
            g.despine(bottom=True, left=True)

            g.fig.set_size_inches(figsize[0], figsize[1], forward=True)
            g.fig.suptitle(f'{feature_name} by {other_feature_name}')

            warnings.filterwarnings("default")

            # Pass a default name if needed
            if not filename:
                filename = f"Ridge plot graph on {feature_name} by {other_feature_name}"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                if self.__notebook_mode and display_visuals:
                    plt.show()

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            warnings.filterwarnings("default")
            if suppress_runtime_errors:
                warnings.warn(
                    f"Plot ridge graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
            warnings.filterwarnings("default")
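A sketch layering the distribution of a numerical feature per category (hypothetical columns):

    feature_analysis.plot_ridge_graph(df,
                                      "Pclass",
                                      dataset_name="Titanic Data",
                                      other_feature_name="Fare")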
    def plot_multi_bar_graph(self,
                             df,
                             feature_name,
                             dataset_name,
                             other_feature_name,
                             display_visuals=True,
                             display_print=True,
                             filename=None,
                             sub_dir=None,
                             save_file=True,
                             dataframe_snapshot=True,
                             suppress_runtime_errors=True,
                             figsize=GRAPH_DEFAULTS.FIGSIZE,
                             colors=None,
                             stacked=False):
        """
        Display a multi bar graph and save the graph in the correct
        directory.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            other_feature_name: string
                Feature to compare to.

            display_visuals: bool
                Whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None' will default to a pre-defined string;
                otherwise it is used as the actual filename.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Whether or not to save the file.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Size of the plot.

            colors: dict or string
                Dictionary mapping all feature values to hex color values.

            stacked: bool
                Determines if the multi bar graph should be stacked or not.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:
            # -----
            check_if_feature_exists(df, feature_name)

            # -----
            check_if_feature_exists(df, other_feature_name)

            # Error check on nan data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Multi bar graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Multi bar graph couldn't be generated because " +
                    f"there is only missing data to display in {other_feature_name}!")

            if display_print:
                print(f"Multi bar graph on {feature_name} by {other_feature_name}")

            # Closes up any past graph info
            plt.close('all')

            # Pull the pre-defined colors of the feature's values if possible
            if not colors:
                try:
                    colors = [self.__df_features.get_feature_colors(feature_name)[val]
                              for val in list(df.groupby([other_feature_name,
                                                          feature_name]).size().unstack().columns)]
                except TypeError:
                    pass
                except KeyError:
                    pass

            g = df.groupby([other_feature_name, feature_name]).size().unstack().plot(
                kind='bar',
                stacked=stacked,
                color=colors,
                figsize=figsize)

            g.legend(loc='upper center',
                     bbox_to_anchor=(1.07, 1),
                     shadow=True,
                     ncol=1)

            sns.set(style="whitegrid")

            plt.title(f"Multi bar graph on {feature_name} by {other_feature_name}")

            # Pass a default name if needed
            if not filename:
                if stacked:
                    filename = f"Multi bar graph stacked on {feature_name} by {other_feature_name}"
                else:
                    filename = f"Multi bar graph on {feature_name} by {other_feature_name}"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:
                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

            if self.__notebook_mode and display_visuals:
                plt.show()

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Multi bar graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
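A sketch generating both the grouped and stacked variants (hypothetical columns):

    for stacked in (False, True):
        feature_analysis.plot_multi_bar_graph(df,
                                              "Pclass",
                                              dataset_name="Titanic Data",
                                              other_feature_name="Survived",
                                              stacked=stacked)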
    def statistical_analysis_on_aggregates(self,
                                           df,
                                           target_features,
                                           dataset_name,
                                           dataframe_snapshot=True):
        """
        Aggregates the data of the target feature either by discrete values
        or by binning/labeling continuous data.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            target_features: list of strings
                Specified target features.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be inner-nested.

            dataframe_snapshot: bool
                Determines whether or not to generate and compare a snapshot
                of the dataframe in the dataset's directory structure. Helps
                ensure that data generated in that directory is correctly
                associated to a dataframe.

        Note:
            This function has a lot going on and is in its infancy, so I am
            purposely not giving it 'suppress_runtime_errors' so people will
            find problems with it and report them to me.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        if not self.__called_from_perform and dataframe_snapshot:
            df_snapshot = DataFrameSnapshot()
            df_snapshot.check_create_snapshot(df,
                                              self.__df_features,
                                              directory_path=self.folder_path + f"/{dataset_name}",
                                              sub_dir=f"{dataset_name}/_Extras")

        feature_stats_dict = dict()

        # Convert a given string to a list
        if isinstance(target_features, str):
            target_features = [target_features]

        # Iterate through all target features
        for feature_name in target_features:

            if feature_name:
                check_if_feature_exists(df, feature_name)
            else:
                continue

            # Generate bins and labels for continuous numerical data
            bins = None
            labels = None
            if feature_name in self.__df_features.continuous_numerical_features():
                bins, labels = df_auto_binning(df,
                                               self.__df_features,
                                               feature_name)

            # Labels and bins will act as feature values for aggregation
            if labels:
                feature_values = copy.deepcopy(labels)
            else:
                feature_values = list(df[feature_name].sort_values(
                    ascending=True).dropna().unique())

            # Label to compare against all the data without any aggregations
            feature_values.append("All")

            # Copy feature values to remove one given value from the feature
            # list at a time
            other_feature_values = copy.deepcopy(feature_values)

            # Create target feature dict
            feature_stats_dict[feature_name] = dict()
            feature_val_count = -1

            feature_pvalues = dict()

            # Store all pvalues found for every feature
            for column in df.columns:
                if feature_name == column:
                    continue
                feature_pvalues[column] = dict()

            # Iterate through remaining features
            for main_feature_val in feature_values:

                # Ignore "All" for the main feature val since it doesn't
                # actually exist
                if main_feature_val == "All":
                    continue

                # Don't want repeats or to compare the same subsets
                other_feature_values.remove(main_feature_val)
                feature_val_count += 1

                # Create series objects based on the main feature val compared
                # to the other_feature_val
                other_feature_val_count = 0
                for other_feature_val in other_feature_values:

                    relationship = f"{main_feature_val} -> {other_feature_val}"
                    feature_stats_dict[feature_name][relationship] = dict()

                    # Generate series objects based on bins/discrete values
                    other_feature_val_count += 1
                    for iterate_feature_name in df.columns:

                        # -----
                        if iterate_feature_name == feature_name:
                            continue

                        # Create bool array for the series object (left)
                        if labels:
                            bool_array = (df[feature_name] <= bins[feature_val_count + 1]) & \
                                         (df[feature_name] > bins[feature_val_count])
                        else:
                            bool_array = df[feature_name] == main_feature_val

                        tmp_series_a = df[bool_array][iterate_feature_name].dropna()
                        del bool_array

                        # Create bool array for the series object (right)
                        if other_feature_val == "All":
                            bool_array = [True for _ in range(0, df.shape[0])]
                        else:
                            if labels:
                                bool_array = (df[feature_name] <= bins[feature_val_count + other_feature_val_count + 1]) & \
                                             (df[feature_name] > bins[feature_val_count + other_feature_val_count])
                            else:
                                bool_array = df[feature_name] == other_feature_val

                        tmp_series_b = df[bool_array][iterate_feature_name].dropna()
                        del bool_array

                        # Extract the pvalue/statistic based on the series data
                        if len(tmp_series_a) == 0 or len(tmp_series_b) == 0:
                            pvalue = "NaN"
                            statistic = "NaN"
                        else:
                            ks_2samp = stats.ks_2samp(tmp_series_a,
                                                      tmp_series_b)
                            pvalue = float(ks_2samp.pvalue)
                            statistic = float(ks_2samp.statistic)

                        # Init pvalue/statistic to proper values
                        feature_stats_dict[feature_name][relationship][iterate_feature_name] = dict()
                        feature_stats_dict[feature_name][relationship][iterate_feature_name]["Kolmogorov-Smirnov statistic"] = dict()
                        feature_stats_dict[feature_name][relationship][iterate_feature_name]["Kolmogorov-Smirnov statistic"]["P-Value"] = pvalue
                        feature_stats_dict[feature_name][relationship][iterate_feature_name]["Kolmogorov-Smirnov statistic"]["Statistic"] = statistic

                        # Don't add to the list if the pvalue is missing
                        if pvalue == "NaN":
                            continue

                        # Init dict/list if it doesn't exist
                        if "Kolmogorov-Smirnov statistic" not in feature_pvalues[iterate_feature_name]:
                            feature_pvalues[iterate_feature_name]["Kolmogorov-Smirnov statistic"] = dict()
                            feature_pvalues[iterate_feature_name]["Kolmogorov-Smirnov statistic"]["All pvalues"] = []

                        # Append the new pvalue
                        feature_pvalues[iterate_feature_name]["Kolmogorov-Smirnov statistic"]["All pvalues"].append(pvalue)

            # Generate summary data of pvalues
            for column in df.columns:
                if column == feature_name or len(feature_pvalues[column].keys()) == 0:
                    continue
                else:
                    if column in feature_pvalues:
                        feature_pvalues[column]["Kolmogorov-Smirnov statistic"]["All pvalues"].sort()

                        # Only create a summary if the series has a length
                        # of at least 2
                        if len(feature_pvalues[column]["Kolmogorov-Smirnov statistic"]["All pvalues"]) >= 2:
                            feature_pvalues[column]["Kolmogorov-Smirnov statistic"]["Pvalues Summary"] = descr_table(
                                pd.DataFrame({column: feature_pvalues[column]["Kolmogorov-Smirnov statistic"]["All pvalues"]}),
                                column).to_dict()[column]
                        # Init to an empty dict
                        else:
                            feature_pvalues[column]["Kolmogorov-Smirnov statistic"]["Pvalues Summary"] = {}

            feature_stats_dict[feature_name]["P-Values"] = feature_pvalues
            # End target feature loop

        # Generate directories
        create_dir_structure(self.folder_path + dataset_name,
                             "_Extras/Statistics/Accept Null Hypothesis")
        create_dir_structure(self.folder_path + dataset_name,
                             "_Extras/Statistics/Reject Null Hypothesis")

        # Create json file
        dict_to_json_file(feature_stats_dict,
                          self.folder_path + dataset_name + "/_Extras/Statistics",
                          "Statistics on target features")

        stat_methods_dict = dict()
        for main_feature, relationship_dict in feature_stats_dict.items():
            for _, stats_on_features in relationship_dict.items():
                for iterate_feature_name, stats_method_dict in stats_on_features.items():
                    for stats_method, stats_dict in stats_method_dict.items():
                        if "All pvalues" in stats_dict:
                            if stats_method not in stat_methods_dict:
                                stat_methods_dict[stats_method] = pd.DataFrame()

                            stats_dict = copy.deepcopy(stats_dict)
                            for k, v in stats_dict.items():
                                if v == "NaN":
                                    stats_dict[k] = [np.nan]
                                else:
                                    stats_dict[k] = [v]

                            if len(stats_dict['Pvalues Summary'][0]) > 0:
                                tmp_stats_df = pd.DataFrame.from_dict(stats_dict["Pvalues Summary"])[["mean", "std", "var"]]
                                tmp_stats_df.index = [f"{main_feature} compared to {iterate_feature_name}"]
                                stat_methods_dict[stats_method] = stat_methods_dict[stats_method].append(tmp_stats_df,
                                                                                                         ignore_index=False)

        for stats_method in stat_methods_dict:
            if stat_methods_dict[stats_method].shape[0]:
                stat_methods_dict[stats_method].sort_values(by=["mean", "std", "var"],
                                                            ascending=True,
                                                            inplace=True)

        pickle_object_to_file(stat_methods_dict,
                              self.folder_path + dataset_name + "/_Extras/Statistics",
                              "Stat methods of features dataframes",
                              remove_file_extension=False)

        # Generate multiple json files based on the following pvalues
        for accept_null_plvalue in [.01, .05, .1, .101, .2, .3, .4, .5,
                                    .6, .7, .8, .9, 1]:
            json_dict = copy.deepcopy(feature_stats_dict)
            tmp_dict = copy.deepcopy(feature_stats_dict)

            for main_feature, relationship_dict in tmp_dict.items():
                for relationship_string, stats_on_features in relationship_dict.items():
                    for iterate_feature_name, stats_method_dict in stats_on_features.items():
                        for stats_method, stats_dict in stats_method_dict.items():

                            # Not a relationship string; re-access the pvalue
                            # list and summary
                            if relationship_string == "P-Values":

                                if "P-Values" not in json_dict[main_feature] or \
                                        iterate_feature_name not in json_dict[main_feature]["P-Values"]:
                                    break

                                filter_pvalues = np.asarray(
                                    json_dict[main_feature]["P-Values"][iterate_feature_name][stats_method]["All pvalues"])

                                if accept_null_plvalue <= .1:
                                    filter_pvalues = filter_pvalues[filter_pvalues <= accept_null_plvalue]
                                else:
                                    filter_pvalues = filter_pvalues[filter_pvalues >= accept_null_plvalue]

                                json_dict[main_feature]["P-Values"][iterate_feature_name][stats_method]["All pvalues"] = list(filter_pvalues)

                                if len(filter_pvalues) >= 2:
                                    json_dict[main_feature]["P-Values"][iterate_feature_name][stats_method]["Pvalues Summary"] = \
                                        descr_table(
                                            pd.DataFrame({iterate_feature_name: list(filter_pvalues)}),
                                            iterate_feature_name).to_dict()[iterate_feature_name]
                                else:
                                    json_dict[main_feature]["P-Values"][iterate_feature_name][stats_method]["Pvalues Summary"] = {}

                                break

                            pvalue = stats_dict["P-Value"]

                            if accept_null_plvalue <= .1:
                                if pvalue == "NaN" or pvalue > accept_null_plvalue:
                                    del json_dict[main_feature][relationship_string][iterate_feature_name]
                            else:
                                if pvalue == "NaN" or pvalue < accept_null_plvalue:
                                    del json_dict[main_feature][relationship_string][iterate_feature_name]

            # Push to the accept or reject null hypothesis folder
            if accept_null_plvalue <= .1:
                dict_to_json_file(json_dict,
                                  self.folder_path + dataset_name + "/_Extras/Statistics/Accept Null Hypothesis",
                                  f"Accept Null Hypothesis on target features where pvalue <= {accept_null_plvalue}",
                                  remove_file_extension=False)
            else:
                dict_to_json_file(json_dict,
                                  self.folder_path + dataset_name + "/_Extras/Statistics/Reject Null Hypothesis",
                                  f"Reject Null Hypothesis on target features where pvalue >= {accept_null_plvalue}",
                                  remove_file_extension=False)
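At its core, the method above compares pairs of target-feature subsets with a two-sample Kolmogorov-Smirnov test on each remaining column. A standalone sketch of one such comparison (hypothetical columns):

    from scipy import stats

    survived = df[df["Survived"] == 1]["Fare"].dropna()
    perished = df[df["Survived"] == 0]["Fare"].dropna()

    result = stats.ks_2samp(survived, perished)
    # A small p-value rejects the null hypothesis that both subsets
    # of "Fare" were drawn from the same distribution.
    print(result.statistic, result.pvalue)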
[docs] def plot_jointplot_graph(self,
                                df,
                                feature_name,
                                dataset_name,
                                other_feature_name,
                                display_visuals=True,
                                display_print=True,
                                filename=None,
                                sub_dir=None,
                                save_file=True,
                                dataframe_snapshot=True,
                                suppress_runtime_errors=True,
                                figsize=GRAPH_DEFAULTS.FIGSIZE,
                                color=None,
                                kind="scatter and kde",
                                ratio=5):
        """
        Display a jointplot graph of two features and save the graph in the
        correct directory.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            other_feature_name: string
                Feature to compare to.

            display_visuals: bool
                Boolean value determining whether or not to display
                visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None', a pre-defined filename is used; otherwise
                the given filename is used.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Boolean value determining whether or not to save the file.

            dataframe_snapshot: bool
                Boolean value determining whether or not to generate and
                compare a snapshot of the dataframe in the dataset's directory
                structure. Helps ensure that data generated in that directory
                is correctly associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

            figsize: tuple
                Tuple object to represent the plot/image's size. Because
                jointplot only accepts a single height value for the figure,
                the smaller of the two values is used.

            color: string
                Seaborn/matplotlib color/hex color for representing the graph.

            kind: string
                Kind of plot to draw: "scatter", "reg", "resid", "kde", "hex",
                or "scatter and kde".

            ratio: int
                Ratio of joint axes height to marginal axes height.
                (Determines distplot-like plots' dimensions.)

        Credit to seaborn's author:
        Michael Waskom
        Git username: mwaskom
        Link: http://tinyurl.com/v9pxsoy

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
""" try: # ----- check_if_feature_exists(df, feature_name) check_if_feature_exists(df, other_feature_name) # Error check on null data if np.sum(df[feature_name].isnull()) == df.shape[0]: raise UnsatisfiedRequirments( "Jointplot plot graph couldn't be generated because " + f"there is only missing data to display in {feature_name}!") if np.sum(df[other_feature_name].isnull()) == df.shape[0]: raise UnsatisfiedRequirments( "Jointplot plot graph couldn't be generated because " + f"there is only missing data to display in {other_feature_name}!") if display_print: print(f"Generating jointplot graph on {feature_name} by {other_feature_name}") # Closes up any past graph info plt.close('all') if figsize[0] < figsize[1]: height = figsize[0] else: height = figsize[1] tmp_df = copy.deepcopy(df[[feature_name,other_feature_name]]) tmp_df.dropna() if not kind: kind = "scatter" warnings.filterwarnings("ignore") if kind == "scatter and kde": g = sns.jointplot(feature_name, other_feature_name, data=tmp_df, kind="scatter", color=color, ratio=ratio, height=height).plot_joint(sns.kdeplot, zorder=0, n_levels=6) else: g = sns.jointplot(feature_name, other_feature_name, data=tmp_df, kind=kind, color=color, ratio=ratio, height=height) warnings.filterwarnings("default") plt.subplots_adjust(top=0.93) g.fig.suptitle("Jointplot: " + f"{feature_name} by {other_feature_name}") # Pass a default name if needed if not filename: filename = f"Jointplot plot graph for {feature_name} by {other_feature_name} using {kind}" # Create string sub directory path if not sub_dir: sub_dir = f"{dataset_name}/{feature_name}" # ----- if save_file: if self.__called_from_perform: dataframe_snapshot = False self.save_plot(df=df, df_features=self.__df_features, filename=filename, sub_dir=sub_dir, dataframe_snapshot=dataframe_snapshot, suppress_runtime_errors=suppress_runtime_errors, meta_data=not self.__called_from_perform) if self.__notebook_mode and display_visuals: plt.show() except SnapshotMismatchError as e: raise e except Exception as e: warnings.filterwarnings("default") if suppress_runtime_errors: warnings.warn( f"Joinplot plot graph an error on feature '{feature_name}':\n{str(e)}", RuntimeWarning) else: raise e finally: plt.close('all') warnings.filterwarnings("default")
[docs] def value_counts_table(self,
                              df,
                              feature_name,
                              dataset_name,
                              display_visuals=True,
                              display_print=True,
                              filename=None,
                              sub_dir=None,
                              save_file=True,
                              dataframe_snapshot=True,
                              suppress_runtime_errors=True):
        """
        Creates/Saves a value counts table of the given feature's data.

        Note
            Creates a png of the table.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            display_visuals: bool
                Boolean value determining whether or not to display
                visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None', a pre-defined filename is used; otherwise
                the given filename is used.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Saves file if set to True; doesn't if set to False.

            dataframe_snapshot: bool
                Boolean value determining whether or not to generate and
                compare a snapshot of the dataframe in the dataset's directory
                structure. Helps ensure that data generated in that directory
                is correctly associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:

            # -----
            check_if_feature_exists(df, feature_name)

            # Check if feature has only null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Value counts table couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if display_print:
                print(f"Creating value counts table for feature {feature_name}.")

            # -----
            val_counts_df = value_counts_table(df, feature_name)

            if self.__notebook_mode:
                if display_visuals:
                    display(val_counts_df)
            else:
                if display_visuals:
                    print(val_counts_df)

            # Pass a default name if needed
            if not filename:
                filename = f"{feature_name} Value Counts Table"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            if save_file:

                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_table_as_plot(df=df,
                                        df_features=self.__df_features,
                                        filename=filename,
                                        sub_dir=sub_dir,
                                        dataframe_snapshot=dataframe_snapshot,
                                        suppress_runtime_errors=suppress_runtime_errors,
                                        table=val_counts_df,
                                        show_index=True,
                                        meta_data=not self.__called_from_perform)

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Value counts table raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
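    # --- Illustrative sketch (not part of the original module) ---
    # The imported 'value_counts_table' helper ultimately rests on pandas'
    # value_counts. A rough standalone equivalent of such a table (the
    # column names here are guesses, not the helper's actual output) is:
    #
    #     counts = df[feature_name].value_counts()
    #     table = counts.to_frame(name="Counts")
    #     table["Percentage"] = (counts / counts.sum() * 100).round(2)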
[docs] def descr_table(self,
                       df,
                       feature_name,
                       dataset_name,
                       display_visuals=True,
                       display_print=True,
                       filename=None,
                       sub_dir=None,
                       save_file=True,
                       dataframe_snapshot=True,
                       suppress_runtime_errors=True):
        """
        Creates/Saves a pandas dataframe of a feature's numerical data:
        standard deviation, mean, quartiles, median, variance, etc.

        Note
            Creates a png of the table.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            display_visuals: bool
                Boolean value determining whether or not to display
                visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None', a pre-defined filename is used; otherwise
                the given filename is used.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Saves file if set to True; doesn't if set to False.

            dataframe_snapshot: bool
                Boolean value determining whether or not to generate and
                compare a snapshot of the dataframe in the dataset's directory
                structure. Helps ensure that data generated in that directory
                is correctly associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:

            # -----
            check_if_feature_exists(df, feature_name)

            # Check if the feature has only null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Descr table couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if display_print:
                print(f"Creating data description table for {feature_name}")

            desc_df = descr_table(df, feature_name, to_numeric=True)

            if self.__notebook_mode:
                if display_visuals:
                    display(desc_df)
            else:
                if display_visuals:
                    print(desc_df)

            # Pass a default name if needed
            if not filename:
                filename = f"{feature_name} Description Table"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            if save_file:

                if self.__called_from_perform:
                    dataframe_snapshot = False

                self.save_table_as_plot(df=df,
                                        df_features=self.__df_features,
                                        filename=filename,
                                        sub_dir=sub_dir,
                                        dataframe_snapshot=dataframe_snapshot,
                                        suppress_runtime_errors=suppress_runtime_errors,
                                        table=desc_df,
                                        meta_data=not self.__called_from_perform,
                                        show_index=True)

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Descr table raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
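    # --- Illustrative sketch (not part of the original module) ---
    # At its core, 'descr_table' is a descriptive-statistics summary of one
    # numeric column; pandas alone gives a comparable breakdown:
    #
    #     desc = df[feature_name].dropna().describe()
    #     # count, mean, std, min, 25%, 50%, 75%, max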
[docs] def group_by_feature_value_count_table(self,
                                              df,
                                              feature_name,
                                              dataset_name,
                                              other_feature_name,
                                              display_visuals=True,
                                              display_print=True,
                                              filename=None,
                                              sub_dir=None,
                                              save_file=True,
                                              dataframe_snapshot=True,
                                              suppress_runtime_errors=True):
        """
        Creates/Saves a pandas dataframe that counts each pairing of values
        between the two given features.

        Note
            Creates a png of the table.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which
                your generated graph will be nested.

            other_feature_name: string
                Feature to compare to.

            display_visuals: bool
                Boolean value determining whether or not to display
                visualizations.

            display_print: bool
                Determines whether or not to print the function's embedded
                print statements.

            filename: string
                If set to 'None', a pre-defined filename is used; otherwise
                the given filename is used.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder
                path.

            save_file: bool
                Saves file if set to True; doesn't if set to False.

            dataframe_snapshot: bool
                Boolean value determining whether or not to generate and
                compare a snapshot of the dataframe in the dataset's directory
                structure. Helps ensure that data generated in that directory
                is correctly associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true, any runtime errors raised while generating
                graphs are suppressed so the program can keep running.

        Raises:
            Raises an error if the feature data is filled with only nulls or
            if the json file's snapshot of the given dataframe doesn't match
            the given dataframe.
        """
        try:

            # -----
            check_if_feature_exists(df, feature_name)

            # Check if either feature has only null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Group by table couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Group by table couldn't be generated because " +
                    f"there is only missing data to display in {other_feature_name}!")

            if display_print:
                print(f"Creating group by {feature_name} and {other_feature_name} table")

            tmp_df = copy.deepcopy(df[[feature_name, other_feature_name]])
            tmp_df = tmp_df.groupby([feature_name, other_feature_name]).size().to_frame()
            tmp_df.columns = ["Counts"]

            if self.__notebook_mode:
                if display_visuals:
                    display(tmp_df)
            else:
                if display_visuals:
                    print(tmp_df)

            # Pass a default name if needed
            if not filename:
                filename = f"Group by {feature_name} and {other_feature_name} Table"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            tmp_df.sort_values(by=["Counts"],
                               ascending=False,
                               inplace=True)

            if save_file:

                if self.__called_from_perform:
                    dataframe_snapshot = False

                # Pass the source dataframe for snapshot/meta data, matching
                # the sibling methods; the grouped table goes in via 'table'.
                self.save_table_as_plot(df=df,
                                        df_features=self.__df_features,
                                        filename=filename,
                                        sub_dir=sub_dir,
                                        dataframe_snapshot=dataframe_snapshot,
                                        suppress_runtime_errors=suppress_runtime_errors,
                                        table=tmp_df,
                                        show_index=True,
                                        meta_data=not self.__called_from_perform)

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            if suppress_runtime_errors:
                warnings.warn(
                    f"Group by table raised an error on feature '{feature_name}' by '{other_feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
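    # --- Illustrative sketch (not part of the original module) ---
    # The table above is the standard pandas groupby/size pattern; the same
    # counts can be produced directly:
    #
    #     pair_counts = (df.groupby([feature_name, other_feature_name])
    #                      .size()
    #                      .to_frame("Counts")
    #                      .sort_values(by="Counts", ascending=False))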
    def __get_feature_colors(self,
                             df,
                             feature_name):
        """
        Creates a dict object of all possible feature values with their
        associated colors.

        Note
            Any unknown feature values that aren't declared by df_features are
            given a default color from the constants section of the project.
            Supports up to 20 default colors; once that list is exhausted,
            'colors' is reset to None.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

        Returns:
            Gives back a dictionary object of all possible feature values
            with their associated colors.
        """
        colors = self.__df_features.get_feature_colors(feature_name)
        feature_value_representation = self.__df_features.get_feature_value_representation()

        if colors:
            if isinstance(colors, dict):
                feature_values = df[feature_name].value_counts(
                    sort=False).keys().to_list()

                decoder = self.__df_features.get_label_decoder()

                # Add color feature value for the decoder's values
                if feature_name in decoder.keys():
                    for cat, val in decoder[feature_name].items():

                        if cat in colors.keys():
                            hex_code = colors[cat]
                            colors[decoder[feature_name][cat]] = hex_code

                        elif val in colors.keys():
                            hex_code = colors[val]
                            colors[cat] = hex_code

                # Add color feature value for different value representations
                if feature_name in feature_value_representation.keys():
                    for val in feature_value_representation[feature_name].keys():
                        if val in colors.keys():
                            hex_code = colors[val]
                            colors[feature_value_representation[feature_name][val]] = hex_code

                i = 0
                for value in feature_values:
                    if value not in colors.keys():
                        colors[value] = GRAPH_DEFAULTS.DEFINED_LIST_OF_RANDOM_COLORS[i]
                        i += 1

                        if i == len(GRAPH_DEFAULTS.DEFINED_LIST_OF_RANDOM_COLORS):
                            # Ran out of default colors; stop before the next
                            # iteration dereferences None
                            colors = None
                            break

        return colors

    def __sort_two_lists(self,
                         sort_values,
                         other_list):
        """
        Sorts two collections by the first collection passed in.

        Args:
            sort_values: collection
                Values to be sorted by.

            other_list: collection
                Values that get sorted based on 'sort_values'.

        Returns:
            Returns back both collections as lists, sorted by 'sort_values'.
        """
        tmp = list(zip(*sorted(zip(other_list, sort_values),
                               key=lambda x: x[1])))
        return list(tmp[1]), list(tmp[0])
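    # --- Illustrative sketch (not part of the original module) ---
    # __sort_two_lists pairs the two collections, sorts the pairs by the
    # sort value, then unzips. For example:
    #
    #     values = [30, 10, 20]
    #     labels = ["c", "a", "b"]
    #     paired = sorted(zip(labels, values), key=lambda pair: pair[1])
    #     labels_sorted, values_sorted = zip(*paired)
    #     # values_sorted -> (10, 20, 30); labels_sorted -> ("a", "b", "c")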