from eflow._hidden.parent_objects import FileOutput
from eflow._hidden.general_objects import DataFrameSnapshot
from eflow.utils.pandas_utils import descr_table,value_counts_table
from eflow._hidden.custom_exceptions import UnsatisfiedRequirments, SnapshotMismatchError
from eflow._hidden.constants import GRAPH_DEFAULTS
from eflow._hidden.parent_objects import DataAnalysis
from eflow.utils.pandas_utils import check_if_feature_exists, generate_meta_data, generate_entropy_table, feature_correlation_table, average_feature_correlation_table
from eflow.utils.sys_utils import dict_to_json_file, pickle_object_to_file, create_dir_structure
import warnings
import random
import numpy as np
from matplotlib import pyplot as plt
import copy
from IPython.display import display
from eflow.utils.pandas_utils import df_auto_binning
import seaborn as sns
import pandas as pd
from scipy import stats
__author__ = "Eric Cacciavillani"
__copyright__ = "Copyright 2019, eFlow"
__credits__ = ["Eric Cacciavillani"]
__license__ = "MIT"
__maintainer__ = "EricCacciavillani"
__email__ = "eric.cacciavillani@gmail.com"
[docs]class FeatureAnalysis(DataAnalysis):
"""
Analyzes the feature data of a pandas Dataframe object.
(Ignores null data for displaying data and creates 2d graphics with 2 features.
In the future I might add 3d graphics with 3 features.)
"""
def __init__(self,
df_features,
dataset_sub_dir="",
dataset_name="",
overwrite_full_path=None,
notebook_mode=False):
"""
Args:
df_features: DataFrameTypes object from eflow.
DataFrameTypes object.
project_sub_dir: string
Appends to the absolute directory of the output folder
dataset_name: string
Creates a parent or "project" folder in which all sub-directories
will be inner nested.
overwrite_full_path: string, None
Overwrites the path to the parent folder.
notebook_mode: bool
If in a python notebook display visualizations in the notebook.
"""
DataAnalysis.__init__(self,
f'{dataset_name}/{dataset_sub_dir}',
overwrite_full_path)
self.__df_features = copy.deepcopy(df_features)
self.__notebook_mode = copy.deepcopy(notebook_mode)
# Determines if the perform was called to see if we need to re-check
# the dataframe.
self.__called_from_perform = False
    def analyze_feature(self,
                        df,
                        feature_name,
                        dataset_name,
                        target_feature=None,
                        display_visuals=True,
                        display_print=True,
                        sub_dir=None,
                        save_file=True,
                        dataframe_snapshot=True,
                        suppress_runtime_errors=True,
                        figsize=GRAPH_DEFAULTS.FIGSIZE,
                        extra_tables=True):
        """
        Generate's all graphic's for that given feature and the relationship
        to the target feature.

        Dispatches on the feature's type (non-numerical/bool vs continuous
        numerical, per the pre-defined df_features object) and, when a
        target feature is given, on the type pairing of the two features.

        Args:
            df: pd.Dataframe
                Pandas DataFrame object

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which your
                generated graph will be inner-nested in.

            target_feature: string
                Will create graphics involving this feature with the main
                feature 'feature_name'.

            display_visuals: bool
                Boolean value to whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print function's embedded print
                statements.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder path.

            save_file: bool
                Saves file if set to True; doesn't if set to False.

            dataframe_snapshot: bool
                Boolean value to determine whether or not generate and compare a
                snapshot of the dataframe in the dataset's directory structure.
                Helps ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true; when generating any graphs will suppress any runtime
                errors so the program can keep running.

            figsize: tuple
                The given size of each generated plot.

            extra_tables: bool
                When handling two types of features if set to true this will
                generate any extra tables that might be helpful.
                Note -
                    These graphics may create duplicates if you already applied
                    an aggregation in 'perform_analysis'

        Raises:
            Raises error if the json file's snapshot of the given dataframe doesn't
            match the given dataframe.
        """
        # Ensure the feature exists in the dataframe itself.
        check_if_feature_exists(df,
                                feature_name)

        # Optional per-value color mapping for this feature (may be None).
        colors = self.__get_feature_colors(df,
                                           feature_name)

        # Display colors
        if colors and display_print:
            print(f"Colors:\n{colors}\n")

        # Check if feature exist in df_features and by extension the dataframe
        if target_feature:
            if target_feature not in self.__df_features.all_features():
                raise UnsatisfiedRequirments("Target feature does not exist in pre-defined "
                                             "df_features!")

            if target_feature not in df.columns:
                raise UnsatisfiedRequirments("Target feature does not exist in "
                                             "the dataframe!")

        if feature_name not in self.__df_features.all_features():
            raise UnsatisfiedRequirments(
                "Feature name does not exist in pre-defined "
                "df_features!")

        if feature_name not in df.columns:
            raise UnsatisfiedRequirments("Feature name does not exist in "
                                         "the dataframe!")

        # Generate sub directory structure for plots involving two features
        two_dim_sub_dir = None
        if sub_dir:
            two_dim_sub_dir = sub_dir
        else:
            if target_feature:
                two_dim_sub_dir = f"{dataset_name}/{target_feature}/Two feature analysis/{target_feature} by {feature_name}"

        # ----- Single-feature graphics, dispatched on feature type.
        if feature_name in self.__df_features.non_numerical_features() or feature_name in self.__df_features.bool_features():

            # Pie graph's should only have less than or equal to six.
            # (The function can handle ample more than this; just stylistically)
            # NOTE(review): the guard below actually allows at most 5 unique
            # values, not six as stated above — confirm intended threshold.
            if len(df[feature_name].value_counts().index) <= 5:
                self.plot_pie_graph(df,
                                    feature_name,
                                    dataset_name=dataset_name,
                                    display_visuals=display_visuals,
                                    sub_dir=sub_dir,
                                    save_file=save_file,
                                    pallete=colors,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    figsize=figsize,
                                    display_print=display_print)

            # Count plot without colors
            self.plot_count_graph(df,
                                  feature_name,
                                  dataset_name=dataset_name,
                                  display_visuals=display_visuals,
                                  sub_dir=sub_dir,
                                  save_file=save_file,
                                  dataframe_snapshot=dataframe_snapshot,
                                  suppress_runtime_errors=suppress_runtime_errors,
                                  display_print=display_print)

            # Count plot with colors
            if colors:
                self.plot_count_graph(df,
                                      feature_name,
                                      dataset_name=dataset_name,
                                      display_visuals=display_visuals,
                                      sub_dir=sub_dir,
                                      save_file=save_file,
                                      palette=colors,
                                      dataframe_snapshot=dataframe_snapshot,
                                      suppress_runtime_errors=suppress_runtime_errors,
                                      figsize=figsize,
                                      display_print=display_print)

            # Generate value counts table
            self.value_counts_table(df,
                                    feature_name,
                                    dataset_name=dataset_name,
                                    display_visuals=display_visuals,
                                    sub_dir=sub_dir,
                                    save_file=save_file,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    display_print=display_print)

        # ----- Continuous numerical feature: distribution plot + describe table.
        elif feature_name in self.__df_features.continuous_numerical_features():

            # Plot distance plot graph
            self.plot_distance_graph(df,
                                     feature_name,
                                     dataset_name=dataset_name,
                                     display_visuals=display_visuals,
                                     sub_dir=sub_dir,
                                     save_file=save_file,
                                     dataframe_snapshot=dataframe_snapshot,
                                     suppress_runtime_errors=suppress_runtime_errors,
                                     figsize=figsize,
                                     display_print=display_print)

            # Create description table
            self.descr_table(df,
                             feature_name,
                             dataset_name=dataset_name,
                             display_visuals=display_visuals,
                             sub_dir=sub_dir,
                             save_file=save_file,
                             dataframe_snapshot=dataframe_snapshot,
                             suppress_runtime_errors=suppress_runtime_errors,
                             display_print=display_print)

        # ----- Two-feature graphics against the target feature.
        if target_feature and feature_name != target_feature:

            # Simplified conditional check for finding type relationship between the two features
            # Datetime features fall through both buckets (no 2D plots for them).
            num_features = []
            non_num_features = []
            if target_feature in self.__df_features.continuous_numerical_features():
                num_features.append(target_feature)
            elif target_feature in self.__df_features.datetime_features():
                pass
            elif target_feature not in self.__df_features.continuous_numerical_features():
                non_num_features.append(target_feature)

            if feature_name in self.__df_features.continuous_numerical_features():
                num_features.append(feature_name)
            elif feature_name in self.__df_features.datetime_features():
                pass
            elif feature_name not in self.__df_features.continuous_numerical_features():
                non_num_features.append(feature_name)

            # Two different types of features (numerical and non-numerical)
            if len(num_features) == 1 and len(non_num_features) == 1:

                # Extract out feature name's to better named variables for sanity
                numerical_feature = num_features.pop()
                non_numerical_feature = non_num_features.pop()

                # Generate violin
                self.plot_violin_graph(df,
                                       non_numerical_feature,
                                       dataset_name=dataset_name,
                                       other_feature_name=numerical_feature,
                                       display_visuals=display_visuals,
                                       sub_dir=two_dim_sub_dir,
                                       save_file=save_file,
                                       palette=colors,
                                       dataframe_snapshot=dataframe_snapshot,
                                       suppress_runtime_errors=suppress_runtime_errors,
                                       figsize=figsize,
                                       display_print=display_print)

                # Generate ridge graph
                self.plot_ridge_graph(df,
                                      non_numerical_feature,
                                      dataset_name=dataset_name,
                                      other_feature_name=numerical_feature,
                                      display_visuals=display_visuals,
                                      sub_dir=two_dim_sub_dir,
                                      save_file=save_file,
                                      dataframe_snapshot=dataframe_snapshot,
                                      palette=colors,
                                      suppress_runtime_errors=suppress_runtime_errors,
                                      figsize=figsize,
                                      display_print=display_print)

                # Generate tables based on the aggregation of the non-numerical feature
                if extra_tables:
                    # One describe table per unique value of the
                    # non-numerical feature, nested in its own sub dir.
                    for val in df[non_numerical_feature].unique():

                        if display_print:
                            print(f"Where {non_numerical_feature} = {val}")

                        # Create new sub dir based on the aggregation
                        two_dim_desc_sub_dir = copy.deepcopy(
                            two_dim_sub_dir)
                        if not two_dim_desc_sub_dir:
                            two_dim_desc_sub_dir = ""

                        two_dim_desc_sub_dir += "/" + str(val)

                        # Create new dataframe on aggregated value and check for nans
                        tmp_df = df[df[non_numerical_feature] == val]
                        # Only emit the table if the slice isn't all-null.
                        if np.sum(tmp_df[numerical_feature].isnull()) != \
                                tmp_df.shape[0]:
                            # Snapshot is off: tmp_df is a derived slice,
                            # not the dataframe the snapshot was taken of.
                            self.descr_table(df=tmp_df,
                                             feature_name=numerical_feature,
                                             dataset_name=dataset_name,
                                             display_visuals=display_visuals,
                                             display_print=display_print,
                                             sub_dir=two_dim_desc_sub_dir,
                                             dataframe_snapshot=False)

                        if display_print:
                            print("\n")

                        del tmp_df

            elif len(non_num_features) == 2:

                # Generate tables based on the aggregation of the non-numerical feature
                if extra_tables:
                    for val in df[feature_name].dropna().unique():

                        if display_print:
                            print(f"Where {feature_name} = {val}")

                        # Create new sub dir based on the aggregation
                        two_dim_desc_sub_dir = copy.deepcopy(
                            two_dim_sub_dir)
                        if not two_dim_desc_sub_dir:
                            two_dim_desc_sub_dir = ""

                        two_dim_desc_sub_dir += "/" + str(val)

                        # Create new dataframe on aggregated value and check for nans
                        tmp_df = df[df[feature_name] == val]
                        if np.sum(tmp_df[target_feature].isnull()) != \
                                tmp_df.shape[0]:
                            # NOTE(review): this re-filters the dataframe even
                            # though 'tmp_df' already holds the same slice.
                            self.value_counts_table(df=df[df[feature_name] == val],
                                                    feature_name=target_feature,
                                                    dataset_name=dataset_name,
                                                    display_visuals=display_visuals,
                                                    display_print=display_print,
                                                    sub_dir=two_dim_desc_sub_dir,
                                                    dataframe_snapshot=False)

                        if display_print:
                            print("\n")

                self.group_by_feature_value_count_table(df,
                                                        feature_name,
                                                        dataset_name=dataset_name,
                                                        other_feature_name=target_feature,
                                                        display_visuals=display_visuals,
                                                        sub_dir=two_dim_sub_dir,
                                                        save_file=save_file,
                                                        dataframe_snapshot=dataframe_snapshot,
                                                        suppress_runtime_errors=suppress_runtime_errors,
                                                        display_print=display_print)

                # Side-by-side bars...
                self.plot_multi_bar_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          stacked=False)

                # ...and the stacked variant of the same graph.
                self.plot_multi_bar_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          stacked=True)

            elif len(num_features) == 2:

                # Generate jointplot graph with scatter and kde
                self.plot_jointplot_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          color=colors,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print)

                # Generate jointplot graph with kde
                self.plot_jointplot_graph(df,
                                          feature_name,
                                          dataset_name=dataset_name,
                                          other_feature_name=target_feature,
                                          display_visuals=display_visuals,
                                          sub_dir=two_dim_sub_dir,
                                          save_file=save_file,
                                          dataframe_snapshot=dataframe_snapshot,
                                          color=colors,
                                          suppress_runtime_errors=suppress_runtime_errors,
                                          figsize=figsize,
                                          display_print=display_print,
                                          kind="kde")

        if display_print:
            print("\n\n")
[docs] def plot_distance_graph(self,
df,
feature_name,
dataset_name,
display_visuals=True,
display_print=True,
filename=None,
sub_dir=None,
save_file=True,
dataframe_snapshot=True,
suppress_runtime_errors=True,
figsize=GRAPH_DEFAULTS.FIGSIZE,
bins=None,
norm_hist=True,
hist=True,
kde=True,
colors=None,
fit=None,
fit_kws=None):
"""
Display a distance plot and save the graph in the correct directory.
Args:
df: pd.Dataframe
Pandas dataframe object
feature_name: string
Specified feature column name.
dataset_name: string
The dataset's name; this will create a sub-directory in which your
generated graph will be inner-nested in.
display_visuals: bool
Boolean value to whether or not to display visualizations.
display_print: bool
Determines whether or not to print function's embedded print
statements.
filename: string
Name to give the file.
sub_dir: string
Specify the sub directory to append to the pre-defined folder path.
save_file: bool
Boolean value to whether or not to save the file.
dataframe_snapshot: bool
Boolean value to determine whether or not generate and compare a
snapshot of the dataframe in the dataset's directory structure.
Helps ensure that data generated in that directory is correctly
associated to a dataframe.
suppress_runtime_errors: bool
If set to true; when generating any graphs will suppress any runtime
errors so the program can keep running.
figsize: tuple
The given size of the plot.
bins: int
Specification of hist bins, or None to use Freedman-Diaconis rule.
norm_hist: bool
If True, the histogram height shows a density rather than a count. This is implied if a KDE or fitted density is plotted.
hist: bool
Whether to plot a (normed) histogram.
kde: bool
Whether to plot a gaussian kernel density estimate.
colors : matplotlib color
Color to plot everything but the fitted curve in.
fit: functional method
An object with fit method, returning a tuple that can be passed
to a pdf method a positional arguments following an grid of
values to evaluate the pdf on.
fit_kws : dictionaries, optional
Keyword arguments for underlying plotting functions.
Credit to seaborn's author:
Michael Waskom
Git username: mwaskom
Doc Link: http://tinyurl.com/ycco2hok
Raises:
Raises error if the feature data is filled with only nulls or if
the json file's snapshot of the given dataframe doesn't match the
given dataframe.
"""
try:
# -----
check_if_feature_exists(df,
feature_name)
# Error check
if np.sum(df[feature_name].isnull()) == df.shape[0]:
raise UnsatisfiedRequirments(
"Distance plot graph couldn't be generated because " +
f"there is only missing data to display in {feature_name}!")
if display_print:
print(f"Generating graph for distance plot on {feature_name}")
feature_values = pd.to_numeric(df[feature_name].dropna(),
errors='coerce').dropna()
if not len(feature_values):
raise ValueError(
f"The given feature {feature_name} doesn't seem to convert to a numeric vector.")
# Closes up any past graph info
plt.close('all')
# Set foundation graph info
sns.set(style="whitegrid")
plt.figure(figsize=figsize)
plt.title("Distance Plot: " + feature_name)
# Create seaborn graph
sns.distplot(feature_values,
bins=bins,
hist=hist,
kde=kde,
fit=fit,
fit_kws=fit_kws,
color=colors,
norm_hist=norm_hist)
# Pass a default name if needed
if not filename:
filename = f"Distance plot graph on {feature_name}"
# Create string sub directory path
if not sub_dir:
sub_dir = f"{dataset_name}/{feature_name}"
# -----
if save_file:
if self.__called_from_perform:
dataframe_snapshot = False
self.save_plot(df=df,
df_features=self.__df_features,
filename=filename,
sub_dir=sub_dir,
dataframe_snapshot=dataframe_snapshot,
suppress_runtime_errors=suppress_runtime_errors,
meta_data=not self.__called_from_perform)
if self.__notebook_mode and display_visuals:
plt.show()
except SnapshotMismatchError as e:
raise e
except Exception as e:
if suppress_runtime_errors:
warnings.warn(
f"Distance plot graph throw an error on feature '{feature_name}':\n{str(e)}",
RuntimeWarning)
else:
raise e
finally:
plt.close('all')
[docs] def plot_violin_graph(self,
df,
feature_name,
dataset_name,
other_feature_name,
display_visuals=True,
display_print=True,
filename=None,
sub_dir=None,
save_file=True,
dataframe_snapshot=True,
suppress_runtime_errors=True,
figsize=GRAPH_DEFAULTS.FIGSIZE,
order=None,
cut=2,
scale='area',
gridsize=100,
width=0.8,
palette=None,
saturation=0.75):
"""
Display a violin plot and save the graph in the correct directory.
Args:
df: pd.Dataframe
Pandas dataframe object
feature_name: string
Specified feature column name to compare to y.
dataset_name: string
The dataset's name; this will create a sub-directory in which your
generated graph will be inner-nested in.
other_feature_name: string
Specified feature column name to compare to x.
display_visuals: bool
Boolean value to whether or not to display visualizations.
filename: string
Name to give the file.
sub_dir: string
Specify the sub directory to append to the pre-defined folder path.
save_file: bool
Boolean value to whether or not to save the file.
dataframe_snapshot: bool
Boolean value to determine whether or not generate and compare a
snapshot of the dataframe in the dataset's directory structure.
Helps ensure that data generated in that directory is correctly
associated to a dataframe.
suppress_runtime_errors: bool
If set to true; when generating any graphs will suppress any runtime
errors so the program can keep running.
display_print: bool
Determines whether or not to print function's embedded print
statements.
figsize: tuple
Size of the given plot.
order: lists of strings
Order to plot the categorical levels in, otherwise the levels
are inferred from the data objects.
cut: float
Distance, in units of bandwidth size, to extend the density
past the extreme datapoints. Set to 0 to limit the violin range
within the range of the observed data.
(i.e., to have the same effect as trim=True in ggplot.)
scale: string
{area, count, width}
The method used to scale the width of each violin. If area,
each violin will have the same area. If count, the width of the
violins will be scaled by the number of observations in that
bin. If width, each violin will have the same width.
gridsize: int
Number of points in the discrete grid used to compute the kernel density estimate.
width: float
Width of a full element when not using hue nesting, or width of
all the elements for one level of the major grouping variable.
palette: dict or string
Colors to use for the different levels of the hue variable.
Should be something that can be interpreted by color_palette(),
or a dictionary mapping hue levels to matplotlib colors.
saturation: float
Proportion of the original saturation to draw colors at. Large
patches often look better with slightly desaturated colors, but
set this to 1 if you want the plot colors to perfectly match
the input color spec.
Credit to seaborn's author:
Michael Waskom
Git username: mwaskom
Doc link: http://tinyurl.com/y3hxxzgv
Raises:
Raises error if the feature data is filled with only nulls or if
the json file's snapshot of the given dataframe doesn't match the
given dataframe.
"""
try:
# -----
check_if_feature_exists(df,
feature_name)
if other_feature_name:
check_if_feature_exists(df,
other_feature_name)
# Error check and create title/part of default file name
found_features = []
feature_title = ""
for feature in (feature_name, other_feature_name):
if feature:
if np.sum(df[feature].isnull()) == df.shape[0]:
raise UnsatisfiedRequirments("Count plot graph couldn't be generated because " +
f"there is only missing data to display in {feature}!")
found_features.append(feature)
if len(found_features) == 1:
feature_title = f"{feature}"
else:
feature_title += f" by {feature}"
if not len(found_features):
raise UnsatisfiedRequirments("Both x and y feature's are type 'None'. Please pass at least one feature.")
if np.sum(df[feature_name].isnull()) == df.shape[0]:
raise UnsatisfiedRequirments(
"Violin plot graph couldn't be generated because " +
f"there is only missing data to display in {feature_name}!")
del found_features
if display_print:
print("Generating graph violin graph on " + feature_title)
# Closes up any past graph info
plt.close('all')
# Set plot structure
fig = plt.figure(figsize=figsize)
plt.title("Violin Plot: " + feature_title)
feature_values = pd.to_numeric(df[other_feature_name],
errors='coerce').dropna()
if not len(feature_values):
raise ValueError("The y feature must contain numerical features.")
x_values = copy.deepcopy(df[feature_name].dropna())
# if feature_name in self.__df_features.bool_features():
# x_values = pd.to_numeric(x_values,
# errors='ignore')
#
# x_values = ['True' if val == 1 else 'False'
# if val == 0 else val
# for val in x_values]
# Sort list by x_values
x_values, feature_values = self.__sort_two_lists(x_values,feature_values)
warnings.filterwarnings("ignore")
sns.violinplot(x=x_values,
y=feature_values,
order=order,
cut=cut,
scale=scale,
gridsize=gridsize,
width=width,
palette=palette,
saturation=saturation)
warnings.filterwarnings("default")
# Pass a default name if needed
if not filename:
filename = f"Violin plot graph on {feature_title}."
# Create string sub directory path
if not sub_dir:
sub_dir = f"{dataset_name}/{feature_name}"
# -----
if save_file:
if self.__called_from_perform:
dataframe_snapshot = False
self.save_plot(df=df,
df_features=self.__df_features,
filename=filename,
sub_dir=sub_dir,
dataframe_snapshot=dataframe_snapshot,
suppress_runtime_errors=suppress_runtime_errors,
meta_data=not self.__called_from_perform)
if self.__notebook_mode and display_visuals:
plt.show()
except SnapshotMismatchError as e:
raise e
except Exception as e:
warnings.filterwarnings("default")
if suppress_runtime_errors:
warnings.warn(
f"Plot violin graph an error on feature '{feature_name}':\n{str(e)}",
RuntimeWarning)
else:
raise e
finally:
plt.close('all')
warnings.filterwarnings("default")
[docs] def plot_count_graph(self,
df,
feature_name,
dataset_name,
display_visuals=True,
display_print=True,
filename=None,
sub_dir=None,
save_file=True,
dataframe_snapshot=True,
suppress_runtime_errors=True,
figsize=GRAPH_DEFAULTS.FIGSIZE,
flip_axis=False,
palette="PuBu"):
"""
Display a barplot with color ranking from a feature's value counts
from the seaborn libary and save the graph in the correct directory
structure.
Args:
df: pd.Dataframe
Pandas dataframe object.
feature_name: string
Specified feature column name.
dataset_name: string
The dataset's name; this will create a sub-directory in which your
generated graph will be inner-nested in.
display_visuals: bool
Boolean value to whether or not to display visualizations.
display_print: bool
Determines whether or not to print function's embedded print
statements.
filename: string
Name to give the file.
sub_dir: string
Specify the sub directory to append to the pre-defined folder path.
save_file: bool
Boolean value to whether or not to save the file.
dataframe_snapshot: bool
Boolean value to determine whether or not generate and compare a
snapshot of the dataframe in the dataset's directory structure.
Helps ensure that data generated in that directory is correctly
associated to a dataframe.
suppress_runtime_errors: bool
If set to true; when generating any graphs will suppress any runtime
errors so the program can keep running.
display_print: bool
Determines whether or not to print function's embedded print
statements.
figsize: tuple
Size for the given plot.
flip_axis: bool
Flip the axis the ploting axis from x to y if set to 'True'.
palette: dict or string
String representation of color pallete for ranking from seaborn's pallete.
Credit to seaborn's author:
Michael Waskom
Git username: mwaskom
Link: http://tinyurl.com/y4pzrgcf
Raises:
Raises error if the feature data is filled with only nulls or if
the json file's snapshot of the given dataframe doesn't match the
given dataframe.
"""
try:
# -----
check_if_feature_exists(df,
feature_name)
# Error check
if np.sum(df[feature_name].isnull()) == df.shape[0]:
raise UnsatisfiedRequirments(
"Count plot graph couldn't be generated because " +
f"there is only missing data to display in {feature_name}!")
if display_print:
print(f"Count plot graph on {feature_name}")
# Closes up any past graph info
plt.close('all')
# Set graph info
plt.figure(figsize=figsize)
sns.set(style="whitegrid")
value_counts = df[feature_name].dropna().value_counts(sort=True)
feature_values,counts = value_counts.index, value_counts.values
del value_counts
# Find and rank values based on counts for color variation of the graph
if not palette:
palette = "PuBu"
if isinstance(palette,str):
rank_list = np.argsort(-np.array(counts)).argsort()
pal = sns.color_palette(palette, len(counts))
palette = np.array(pal[::-1])[rank_list]
plt.clf()
if feature_name in self.__df_features.bool_features():
i = 0
for val in feature_values:
try:
feature_values[i] = float(val)
except:
pass
feature_values = [bool(val) if val == 0 or val == 1 else val
for val in feature_values]
# Flip the graph for visual flare
if flip_axis:
ax = sns.barplot(x=counts,
y=feature_values,
palette=palette,
order=feature_values)
else:
ax = sns.barplot(x=feature_values,
y=counts,
palette=palette,
order=feature_values)
# Labels for numerical count of each bar
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x() + p.get_width() / 2.,
height + 3,
'{:1}'.format(height),
ha="center")
plt.title("Category Count Plot: " + feature_name)
# Pass a default name if needed
if not filename:
filename = f"Count plot graph on {feature_name}"
if isinstance(palette,np.ndarray):
filename += " with count color ranking."
# Create string sub directory path
if not sub_dir:
sub_dir = f"{dataset_name}/{feature_name}"
# -----
if save_file:
if self.__called_from_perform:
dataframe_snapshot = False
self.save_plot(df=df,
df_features=self.__df_features,
filename=filename,
sub_dir=sub_dir,
dataframe_snapshot=dataframe_snapshot,
suppress_runtime_errors=suppress_runtime_errors,
meta_data=not self.__called_from_perform)
if self.__notebook_mode and display_visuals:
plt.show()
except SnapshotMismatchError as e:
raise e
except Exception as e:
if suppress_runtime_errors:
warnings.warn(
f"Plot count graph raised an error on feature '{feature_name}':\n{str(e)}",
RuntimeWarning)
else:
raise e
finally:
plt.close('all')
[docs] def plot_pie_graph(self,
df,
feature_name,
dataset_name,
display_visuals=True,
display_print=True,
filename=None,
sub_dir=None,
save_file=True,
dataframe_snapshot=True,
suppress_runtime_errors=True,
figsize=GRAPH_DEFAULTS.FIGSIZE,
pallete=None):
"""
Display a pie graph and save the graph in the correct directory.
Args:
df:
Pandas DataFrame object.
feature_name:
Specified feature column name.
dataset_name:
The dataset's name; this will create a sub-directory in which your
generated graph will be inner-nested in.
display_visuals:
Boolean value to whether or not to display visualizations.
display_print: bool
Determines whether or not to print function's embedded print
statements.
filename:
If set to 'None' will default to a pre-defined string;
unless it is set to an actual filename.
sub_dir:
Specify the sub directory to append to the pre-defined folder path.
save_file:
Boolean value to whether or not to save the file.
dataframe_snapshot:
Boolean value to determine whether or not generate and compare a
snapshot of the dataframe in the dataset's directory structure.
Helps ensure that data generated in that directory is correctly
associated to a dataframe.
suppress_runtime_errors: bool
If set to true; when generating any graphs will suppress any runtime
errors so the program can keep running.
figsize: tuple
Size of the plot.
pallete: dict or string
Dictionary of all feature values to hex color values.
Raises:
Raises error if the feature data is filled with only nulls or if
the json file's snapshot of the given dataframe doesn't match the
given dataframe.
"""
try:
# -----
check_if_feature_exists(df,
feature_name)
if np.sum(df[feature_name].isnull()) == df.shape[0]:
raise UnsatisfiedRequirments("Pie graph couldn't be generated because " +
f"there is only missing data to display in {feature_name}!")
if display_print:
print(f"Pie graph on {feature_name}")
# Closes up any past graph info
plt.close('all')
# Find value counts
value_counts = df[feature_name].dropna().value_counts(sort=False)
feature_values = value_counts.index.tolist()
value_count_list = value_counts.values.tolist()
color_list = None
plt.figure(figsize=figsize)
# if feature_name in self.__df_features.bool_features():
#
# i = 0
# for val in feature_values:
# try:
# feature_values[i] = float(val)
# except:
# pass
# feature_values = [bool(val) if val == 0 or val == 1 else val
# for val in feature_values]
# Sort by feature_values
feature_values,value_count_list = self.__sort_two_lists(feature_values,
value_count_list)
if isinstance(pallete,dict):
color_list = []
for value in tuple(feature_values):
try:
color_list.append(pallete[value])
except KeyError:
raise KeyError(f"The given value '{value}' in feature '{feature_name}'"
+ " was not found in the passed color dict.")
# Explode the part of the pie graph that is the maximum of the graph
explode_array = [0] * len(feature_values)
explode_array[np.array(value_count_list).argmax()] = .03
# Plot pie graph
plt.pie(
tuple(value_count_list),
labels=tuple(feature_values),
shadow=False,
colors=color_list,
explode=tuple(explode_array),
startangle=90,
autopct='%1.1f%%',
)
# Set foundation graph info
plt.gcf()
plt.title("Pie Chart: " + feature_name)
plt.legend(fancybox=True,
facecolor='w')
# Set foundation
plt.axis('equal')
# Pass a default name if needed
if not filename:
filename = f"Pie graph on {feature_name}"
# Create string sub directory path
if not sub_dir:
sub_dir = f"{dataset_name}/{feature_name}"
# -----
if save_file:
if self.__called_from_perform:
dataframe_snapshot = False
self.save_plot(df=df,
df_features=self.__df_features,
filename=filename,
sub_dir=sub_dir,
dataframe_snapshot=dataframe_snapshot,
suppress_runtime_errors=suppress_runtime_errors,
meta_data=not self.__called_from_perform)
if self.__notebook_mode and display_visuals:
plt.show()
except SnapshotMismatchError as e:
raise e
except Exception as e:
if suppress_runtime_errors:
warnings.warn(
f"Pie graph raised an error on feature '{feature_name}':\n{str(e)}",
RuntimeWarning)
else:
raise e
finally:
plt.close('all')
    def plot_ridge_graph(self,
                         df,
                         feature_name,
                         dataset_name,
                         other_feature_name,
                         display_visuals=True,
                         display_print=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         palette=None):
        """
        Display a ridge plot and save the graph in the correct directory.

        One stacked KDE row per unique value of 'feature_name', with the
        density computed over 'other_feature_name' (coerced to numeric).

        Args:
            df: pd.Dataframe
                Pandas DataFrame object.

            feature_name: string
                Specified feature column name.

            dataset_name: string
                The dataset's name; this will create a sub-directory in which your
                generated graph will be inner-nested in.

            other_feature_name: string
                Feature to compare to.

            display_visuals: bool
                Boolean value to whether or not to display visualizations.

            display_print: bool
                Determines whether or not to print function's embedded print
                statements.

            filename: string
                If set to 'None' will default to a pre-defined string;
                unless it is set to an actual filename.

            sub_dir: string
                Specify the sub directory to append to the pre-defined folder path.

            save_file: bool
                Boolean value to whether or not to save the file.

            dataframe_snapshot: bool
                Boolean value to determine whether or not generate and compare a
                snapshot of the dataframe in the dataset's directory structure.
                Helps ensure that data generated in that directory is correctly
                associated to a dataframe.

            suppress_runtime_errors: bool
                If set to true; when generating any graphs will suppress any runtime
                errors so the program can keep running.

            figsize: tuple
                Tuple object to represent the plot/image's size.

            palette: dict or string
                Dictionary of all feature values to hex color values.

        Note -
            A large part of this was taken from: http://tinyurl.com/tuou2cn

        Raises:
            Raises error if the feature data is filled with only nulls or if
            the json file's snapshot of the given dataframe doesn't match the
            given dataframe.
        """
        try:
            # Both columns must exist in the dataframe.
            check_if_feature_exists(df,
                                    feature_name)
            # -----
            check_if_feature_exists(df,
                                    other_feature_name)

            # Error check on null data
            if np.sum(df[feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Ridge plot graph couldn't be generated because " +
                    f"there is only missing data to display in {feature_name}!")

            if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
                raise UnsatisfiedRequirments(
                    "Ridge plot graph couldn't be generated because " +
                    f"there is only missing data to display in {other_feature_name}!")

            if display_print:
                print(f"Ridge plot graph on {feature_name} by {other_feature_name}.")

            # Transparent facecolor so overlapping rows blend visually.
            sns.set(style="white",
                    rc={"axes.facecolor": (0, 0, 0, 0)})

            # Temporarily turn off chained assignments
            chained_assignment = pd.options.mode.chained_assignment
            pd.options.mode.chained_assignment = None

            # Work on a private two-column copy; the y-axis feature is
            # coerced to numeric (non-convertible entries become NaN).
            tmp_df = copy.deepcopy(df[[feature_name, other_feature_name]])
            tmp_df[other_feature_name] = pd.to_numeric(tmp_df[other_feature_name],
                                                       errors='coerce')

            # if feature_name in self.__df_features.bool_features():
            #
            #     tmp_df[feature_name] = pd.to_numeric(tmp_df[feature_name],
            #                                          errors='ignore')
            #     tmp_df[feature_name] = ['True' if val == 1 else 'False'
            #                             if val == 0 else val
            #                             for val in tmp_df[feature_name]]

            tmp_df.dropna(inplace=True)

            # Remove any values that only return a single value back
            # (a KDE needs more than one distinct observation per row).
            for val in tmp_df[feature_name].dropna().unique():

                feature_value_counts = tmp_df[other_feature_name][tmp_df[feature_name] == val].dropna().value_counts()

                count_length = len(feature_value_counts.values)
                if len(feature_value_counts.index.to_list()) <= 1 or count_length == 0:
                    tmp_df = tmp_df[tmp_df[feature_name] != val]
                # NOTE(review): this branch is unreachable — when
                # count_length == 1 the preceding condition already matched.
                elif count_length == 1 and feature_value_counts.values[0] == 1:
                    tmp_df = tmp_df[tmp_df[feature_name] != val]

            # -----
            # for val in tmp_df[other_feature_name].dropna().unique():
            #
            #     feature_value_counts = tmp_df[feature_name][tmp_df[other_feature_name] == val].dropna().value_counts()
            #
            #     count_length = len(feature_value_counts.values)
            #     if len(feature_value_counts.index.to_list()) <= 1 or count_length == 0:
            #         tmp_df = tmp_df[tmp_df[other_feature_name] != val]
            #     elif count_length == 1 and feature_value_counts.values[0] == 1:
            #         tmp_df = tmp_df[tmp_df[other_feature_name] != val]

            # Restore pandas' chained-assignment warning setting.
            pd.options.mode.chained_assignment = chained_assignment
            del chained_assignment

            # # Sort by dataframe's series of 'feature_name'
            # tmp_df[feature_name], tmp_df[other_feature_name] = self.__sort_two_lists(tmp_df[feature_name],
            #                                                                          tmp_df[other_feature_name])

            # Suppress any warnings that the seaborn's backend raises
            warnings.filterwarnings("ignore")
            sns.set(style="white",
                    rc={"axes.facecolor": (0, 0, 0, 0)})

            if not palette:
                palette = sns.cubehelix_palette(10, rot=-.20, light=.7)

            # Initialize the FacetGrid object: one row (and hue) per unique
            # value of 'feature_name'.
            g = sns.FacetGrid(tmp_df,
                              row=feature_name,
                              hue=feature_name,
                              aspect=15,
                              height=.4,
                              palette=palette)

            # Draw the densities in a few steps: filled KDE, white outline,
            # then a baseline at y=0.
            g.map(sns.kdeplot,
                  other_feature_name,
                  clip_on=False,
                  shade=True,
                  alpha=1,
                  lw=1.5,
                  bw=.2)
            g.map(sns.kdeplot,
                  other_feature_name,
                  clip_on=False,
                  color="w",
                  lw=2,
                  bw=.2)
            g.map(plt.axhline,
                  y=0,
                  lw=2,
                  clip_on=False)

            # Define and use a simple function to label the plot in axes coordinates
            def label(x, color, label):
                # Write each row's category name inside its own axes.
                ax = plt.gca()
                ax.text(-.1,
                        .2,
                        label,
                        fontweight="bold",
                        color=color,
                        ha="left",
                        va="center",
                        transform=ax.transAxes)

            g.map(label, other_feature_name)

            # Set the subplots to overlap
            g.fig.subplots_adjust(hspace=-.25)

            # Remove axes details that don't play well with overlap
            g.set_titles("")
            g.set(yticks=[])
            g.despine(bottom=True, left=True)
            g.fig.set_size_inches(figsize[0], figsize[1], forward=True)
            g.fig.suptitle(f'{feature_name} by {other_feature_name}')
            warnings.filterwarnings("default")

            # Pass a default name if needed
            if not filename:
                filename = f"Ridge plot graph on {feature_name} by {other_feature_name}"

            # Create string sub directory path
            if not sub_dir:
                sub_dir = f"{dataset_name}/{feature_name}"

            # -----
            if save_file:

                # 'perform_analysis' already validated the dataframe once.
                if self.__called_from_perform:
                    dataframe_snapshot = False

                # NOTE(review): unlike the sibling plot methods, the display
                # happens *before* saving and only when save_file is True —
                # confirm whether that asymmetry is intentional.
                if self.__notebook_mode and display_visuals:
                    plt.show()

                self.save_plot(df=df,
                               df_features=self.__df_features,
                               filename=filename,
                               sub_dir=sub_dir,
                               dataframe_snapshot=dataframe_snapshot,
                               suppress_runtime_errors=suppress_runtime_errors,
                               meta_data=not self.__called_from_perform)

        except SnapshotMismatchError as e:
            raise e

        except Exception as e:
            warnings.filterwarnings("default")
            if suppress_runtime_errors:
                warnings.warn(
                    f"Plot ridge graph raised an error on feature '{feature_name}':\n{str(e)}",
                    RuntimeWarning)
            else:
                raise e

        finally:
            plt.close('all')
            warnings.filterwarnings("default")
def plot_multi_bar_graph(self,
                         df,
                         feature_name,
                         dataset_name,
                         other_feature_name,
                         display_visuals=True,
                         display_print=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         colors=None,
                         stacked=False):
    """
    Display a multi bar graph of 'feature_name' grouped by
    'other_feature_name' and save the graph in the correct directory.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated graph will be inner-nested in.

        other_feature_name: string
            Feature to group/compare against.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to a pre-defined string;
            unless it is set to an actual filename.

        sub_dir: string
            Specify the sub directory to append to the pre-defined folder path.

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any runtime
            errors so the program can keep running.

        figsize: tuple
            Size of the plot.

        colors: dict or string
            Dictionary of all feature values to hex color values.

        stacked: bool
            Determines if the multi bar graph should be stacked or not.

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    try:
        # Both features must exist before any aggregation is attempted.
        check_if_feature_exists(df,
                                feature_name)
        check_if_feature_exists(df,
                                other_feature_name)

        # Error check on nan data; an all-null column can't be graphed.
        if np.sum(df[feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments(
                "Multi bar graph couldn't be generated because " +
                f"there is only missing data to display in {feature_name}!")

        if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments(
                "Multi bar graph couldn't be generated because " +
                f"there is only missing data to display in {other_feature_name}!")

        if display_print:
            print(f"Multi bar graph on {feature_name} by {other_feature_name}")

        # Closes up any past graph info
        plt.close('all')

        if not colors:
            # Best effort: map declared feature colors onto the unstacked
            # columns; silently fall back to default colors when the
            # feature has no color mapping (TypeError/KeyError).
            try:
                colors = [self.__df_features.get_feature_colors(feature_name)[val]
                          for val in list(df.groupby(
                              [other_feature_name, feature_name]).size().unstack().columns)]
            except (TypeError, KeyError):
                pass

        # Pandas handles the actual bar rendering through matplotlib.
        g = df.groupby([other_feature_name, feature_name]).size().unstack().plot(
            kind='bar',
            stacked=stacked,
            color=colors,
            figsize=figsize)
        g.legend(loc='upper center',
                 bbox_to_anchor=(1.07, 1),
                 shadow=True,
                 ncol=1)

        sns.set(style="whitegrid")

        plt.title(f"Multi bar graph on {feature_name} by {other_feature_name}")

        # Pass a default name if needed.
        # BUGFIX: the stacked/unstacked names were previously swapped.
        if not filename:
            if stacked:
                filename = f"Multi bar graph stacked on {feature_name} by {other_feature_name}"
            else:
                filename = f"Multi bar graph on {feature_name} by {other_feature_name}"

        # Create string sub directory path
        if not sub_dir:
            sub_dir = f"{dataset_name}/{feature_name}"

        # -----
        if save_file:
            # perform_analysis already validated the snapshot once.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

    except SnapshotMismatchError as e:
        # Snapshot mismatches must always surface to the caller.
        raise e

    except Exception as e:
        if suppress_runtime_errors:
            warnings.warn(
                f"Multi bar raised an error on feature '{feature_name}':\n{str(e)}",
                RuntimeWarning)
        else:
            raise e

    finally:
        plt.close('all')
def statistical_analysis_on_aggregates(self,
                                       df,
                                       target_features,
                                       dataset_name,
                                       dataframe_snapshot=True):
    """
    Aggregates the data of the target feature either by discrete values
    or by binning/labeling continuous data.

    For every target feature, each pair of its value subsets (including a
    synthetic "All" subset for the whole dataframe) is compared against
    every other column with a two-sample Kolmogorov-Smirnov test; results
    are written out as json/pickle files under '_Extras/Statistics'.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        target_features: list of string
            Specified target features.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated graph will be inner-nested in.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

    Note:
        This function has a lot going on and it's infancy so I am going to
        purposely not give it suppress_runtime_errors so people will find
        problems with it and report it to me.

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    # Validate/record the dataframe snapshot unless perform already did so.
    if not self.__called_from_perform and dataframe_snapshot:
        df_snapshot = DataFrameSnapshot()
        df_snapshot.check_create_snapshot(df,
                                          self.__df_features,
                                          directory_path=self.folder_path + f"/{dataset_name}",
                                          sub_dir=f"{dataset_name}/_Extras")

    # Nested result structure:
    # {target feature: {"a -> b": {other column: {stat method: {...}}}}}
    feature_stats_dict = dict()

    # Convert to list of the given string
    if isinstance(target_features, str):
        target_features = [target_features]

    # Iterate through all target features features
    for feature_name in target_features:

        if feature_name:
            check_if_feature_exists(df,
                                    feature_name)
        else:
            # Skip falsy entries (None / empty string) in the target list.
            continue

        # Generate bins and labels for continuous numerical data
        bins = None
        labels = None
        if feature_name in self.__df_features.continuous_numerical_features():
            bins, labels = df_auto_binning(df,
                                           self.__df_features,
                                           feature_name)

        # Labels and bins will act as feature values for aggregation
        if labels:
            feature_values = copy.deepcopy(labels)
        else:
            feature_values = list(df[feature_name].sort_values(
                ascending=True).dropna().unique())

        # Label to compare to all the data without any aggregations
        feature_values.append("All")

        # Copy feature values to remove one given value from the feature list at a time
        other_feature_values = copy.deepcopy(feature_values)

        # Create target feature dict
        feature_stats_dict[feature_name] = dict()

        # Tracks which bin index the current main feature value maps to.
        feature_val_count = -1
        feature_pvalues = dict()

        # Store all pvalues found for every feature
        for column in df.columns:
            if feature_name == column:
                continue
            feature_pvalues[column] = dict()

        # Iterate through remaining features
        for main_feature_val in feature_values:

            # Ignore "All" for the main feature val since it doesn't actually exist
            if main_feature_val == "All":
                continue

            # Don't want repeats or compare same subsets
            other_feature_values.remove(main_feature_val)

            feature_val_count += 1

            # Create series objects based on the main feature val compared to the other_feature_val
            other_feature_val_count = 0
            for other_feature_val in other_feature_values:

                feature_stats_dict[feature_name][
                    f"{main_feature_val} -> {other_feature_val}"] = dict()

                # Generate series object based on bins/discrete values
                other_feature_val_count += 1
                for iterate_feature_name in df.columns:

                    # Never compare the target feature with itself.
                    if iterate_feature_name == feature_name:
                        continue

                    # Create bool array for series object(left)
                    if labels:
                        # Binned: rows whose value falls inside the current bin.
                        bool_array = (df[feature_name] <= bins[
                            feature_val_count + 1]) & (
                                df[feature_name] > bins[
                                    feature_val_count])
                    else:
                        # Discrete: rows matching the exact feature value.
                        bool_array = df[feature_name] == main_feature_val

                    tmp_series_a = df[bool_array][
                        iterate_feature_name].dropna()

                    del bool_array

                    # Create bool array for series object(right)
                    if other_feature_val == "All":
                        # "All" selects every row of the dataframe.
                        bool_array = [True for _ in range(0, df.shape[0])]
                    else:
                        if labels:
                            bool_array = (df[feature_name] <= bins[
                                feature_val_count + other_feature_val_count + 1]) & (
                                    df[feature_name] > bins[
                                        feature_val_count + other_feature_val_count])
                        else:
                            bool_array = df[
                                feature_name] == other_feature_val

                    tmp_series_b = df[bool_array][
                        iterate_feature_name].dropna()

                    del bool_array

                    # Extract out pvalue/statistic based on series data
                    if len(tmp_series_a) == 0 or len(tmp_series_b) == 0:
                        # Sentinel strings mark comparisons that couldn't run.
                        pvalue = "NaN"
                        statistic = "NaN"
                    else:
                        ks_2samp = stats.ks_2samp(tmp_series_a,
                                                  tmp_series_b)
                        pvalue = float(ks_2samp.pvalue)
                        statistic = float(ks_2samp.statistic)

                    # Init pvalue/statistic to proper values
                    feature_stats_dict[feature_name][
                        f"{main_feature_val} -> {other_feature_val}"][
                        iterate_feature_name] = dict()

                    feature_stats_dict[feature_name][
                        f"{main_feature_val} -> {other_feature_val}"][
                        iterate_feature_name][
                        "Kolmogorov-Smirnov statistic"] = dict()

                    feature_stats_dict[feature_name][
                        f"{main_feature_val} -> {other_feature_val}"][
                        iterate_feature_name][
                        "Kolmogorov-Smirnov statistic"]["P-Value"] = pvalue

                    feature_stats_dict[feature_name][
                        f"{main_feature_val} -> {other_feature_val}"][
                        iterate_feature_name][
                        "Kolmogorov-Smirnov statistic"][
                        "Statistic"] = statistic

                    # Don't add to list
                    if pvalue == "NaN":
                        continue

                    # Init dict/list if it doesn't exist
                    if "Kolmogorov-Smirnov statistic" not in \
                            feature_pvalues[iterate_feature_name]:
                        feature_pvalues[iterate_feature_name][
                            "Kolmogorov-Smirnov statistic"] = dict()
                        feature_pvalues[iterate_feature_name][
                            "Kolmogorov-Smirnov statistic"][
                            "All pvalues"] = []

                    # Append new pvalue
                    feature_pvalues[iterate_feature_name][
                        "Kolmogorov-Smirnov statistic"][
                        "All pvalues"].append(pvalue)

        # Generate summary data of pvalues
        for column in df.columns:
            if column == feature_name or len(feature_pvalues[column].keys()) == 0:
                continue
            else:
                if column in feature_pvalues:
                    feature_pvalues[column][
                        "Kolmogorov-Smirnov statistic"][
                        "All pvalues"].sort()

                    # Only create summary if the series is at least the of 2
                    if len(feature_pvalues[column][
                               "Kolmogorov-Smirnov statistic"][
                               "All pvalues"]) >= 2:
                        feature_pvalues[column][
                            "Kolmogorov-Smirnov statistic"
                        ]["Pvalues Summary"] = descr_table(
                            pd.DataFrame({column: feature_pvalues[column][
                                "Kolmogorov-Smirnov statistic"][
                                "All pvalues"]}),
                            column).to_dict()[column]

                    # Init to an empty dict
                    else:
                        feature_pvalues[column][
                            "Kolmogorov-Smirnov statistic"][
                            "Pvalues Summary"] = {}

        feature_stats_dict[feature_name]["P-Values"] = feature_pvalues
        # End target feature loop

    # Generate directories
    create_dir_structure(self.folder_path + dataset_name,
                         "_Extras/Statistics/Accept Null Hypothesis")
    create_dir_structure(self.folder_path + dataset_name,
                         "_Extras/Statistics/Reject Null Hypothesis")

    # Create json file
    dict_to_json_file(feature_stats_dict,
                      self.folder_path + dataset_name + "/_Extras/Statistics",
                      "Statistics on target features")

    # Collect per-method summary frames (mean/std/var of pvalues) so the
    # comparisons can be ranked across features.
    stat_methods_dict = dict()
    for main_feature, relationship_dict in feature_stats_dict.items():
        for _, stats_on_features in relationship_dict.items():
            for iterate_feature_name, stats_method_dict in stats_on_features.items():
                for stats_method, stats_dict in stats_method_dict.items():
                    # Only the "P-Values" branch carries "All pvalues".
                    if "All pvalues" in stats_dict:
                        if stats_method not in stat_methods_dict:
                            stat_methods_dict[stats_method] = pd.DataFrame()

                        # Work on a copy; wrap scalars in lists so the dict
                        # can be fed to pd.DataFrame.from_dict below.
                        stats_dict = copy.deepcopy(stats_dict)
                        for k,v in stats_dict.items():
                            if v == "NaN":
                                stats_dict[k] = [np.nan]
                            else:
                                stats_dict[k] = [v]

                        if len(stats_dict['Pvalues Summary'][0]) > 0:
                            tmp_stats_df = pd.DataFrame.from_dict(stats_dict["Pvalues Summary"])[["mean","std","var"]]
                            tmp_stats_df.index = [f"{main_feature} compared to {iterate_feature_name}"]
                            # NOTE(review): DataFrame.append was removed in
                            # pandas 2.0 — this requires pandas<2; consider
                            # migrating to pd.concat.
                            stat_methods_dict[stats_method] = stat_methods_dict[stats_method].append(tmp_stats_df,
                                                                                                     ignore_index=False)

    # Rank comparisons: lowest mean pvalue (strongest evidence) first.
    for stats_method in stat_methods_dict:
        if stat_methods_dict[stats_method].shape[0]:
            stat_methods_dict[stats_method].sort_values(
                by=["mean", "std", "var"],
                ascending=True,
                inplace=True)

    pickle_object_to_file(stat_methods_dict,
                          self.folder_path + dataset_name + "/_Extras/Statistics",
                          "Stat methods of features dataframes",
                          remove_file_extension=False)

    # Generate multiple json files based on the following pvalues
    # (<= .1 thresholds accept the null hypothesis; larger ones reject it).
    for accept_null_plvalue in [.01, .05, .1, .101, .2, .3, .4, .5, .6, .7,
                                .8, .9, 1]:
        # json_dict is mutated/filtered while tmp_dict is iterated, so the
        # two deep copies must stay separate.
        json_dict = copy.deepcopy(feature_stats_dict)
        tmp_dict = copy.deepcopy(feature_stats_dict)
        for main_feature, relationship_dict in tmp_dict.items():
            for relationship_string, stats_on_features in relationship_dict.items():
                for iterate_feature_name, stats_method_dict in stats_on_features.items():
                    for stats_method, stats_dict in stats_method_dict.items():

                        # Not a relationship string; Re-access pvalue list and summary
                        if relationship_string == "P-Values":
                            if "P-Values" not in json_dict[
                                main_feature] or iterate_feature_name not in \
                                    json_dict[main_feature]["P-Values"]:
                                break

                            filter_pvalues = np.asarray(
                                json_dict[main_feature]["P-Values"][
                                    iterate_feature_name][stats_method][
                                    "All pvalues"])

                            if accept_null_plvalue <= .1:
                                filter_pvalues = filter_pvalues[
                                    filter_pvalues <= accept_null_plvalue]
                            else:
                                filter_pvalues = filter_pvalues[
                                    filter_pvalues >= accept_null_plvalue]

                            json_dict[main_feature]["P-Values"][
                                iterate_feature_name][stats_method][
                                "All pvalues"] = list(filter_pvalues)

                            # Re-summarize the filtered pvalues (needs >= 2).
                            if len(filter_pvalues) >= 2:
                                json_dict[main_feature][
                                    "P-Values"][iterate_feature_name][
                                    stats_method]["Pvalues Summary"] = \
                                    descr_table(
                                        pd.DataFrame(
                                            {iterate_feature_name: list(
                                                filter_pvalues)}),
                                        iterate_feature_name).to_dict()[
                                        iterate_feature_name]
                            else:
                                json_dict[main_feature][
                                    "P-Values"][iterate_feature_name][
                                    stats_method]["Pvalues Summary"] = {}

                            break

                        pvalue = stats_dict["P-Value"]

                        # Drop comparisons on the wrong side of the threshold.
                        if accept_null_plvalue <= .1:
                            if pvalue == "NaN" or pvalue > accept_null_plvalue:
                                del json_dict[main_feature][
                                    relationship_string][
                                    iterate_feature_name]
                        else:
                            if pvalue == "NaN" or pvalue < accept_null_plvalue:
                                del json_dict[main_feature][
                                    relationship_string][
                                    iterate_feature_name]

        # Push to accept or reject null hypothesis folder
        if accept_null_plvalue <= .1:
            dict_to_json_file(json_dict,
                              self.folder_path + dataset_name + "/_Extras/Statistics/Accept Null Hypothesis",
                              f"Accept Null Hypothesis on target features where pvalue <= {accept_null_plvalue}",
                              remove_file_extension=False)
        else:
            dict_to_json_file(json_dict,
                              self.folder_path + dataset_name + "/_Extras/Statistics/Reject Null Hypothesis",
                              f"Reject Null Hypothesis on target features where pvalue >= {accept_null_plvalue}",
                              remove_file_extension=False)
def plot_jointplot_graph(self,
                         df,
                         feature_name,
                         dataset_name,
                         other_feature_name,
                         display_visuals=True,
                         display_print=True,
                         filename=None,
                         sub_dir=None,
                         save_file=True,
                         dataframe_snapshot=True,
                         suppress_runtime_errors=True,
                         figsize=GRAPH_DEFAULTS.FIGSIZE,
                         color=None,
                         kind="scatter and kde",
                         ratio=5):
    """
    Display a jointplot of 'feature_name' against 'other_feature_name'
    and save the graph in the correct directory.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated graph will be inner-nested in.

        other_feature_name: string
            Feature to compare to.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to a pre-defined string;
            unless it is set to an actual filename.

        sub_dir: string
            Specify the sub directory to append to the pre-defined folder path.

        save_file: bool
            Boolean value to whether or not to save the file.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any runtime
            errors so the program can keep running.

        figsize: tuple
            Tuple object to represent the plot/image's size. Because jointplot
            only accepts a single value for the figure; we just pull the
            smallest of the two values.

        color: string
            Seaborn/matplotlib color/hex color for representing the graph.

        kind: string (scatter,reg,resid,kde,hex,scatter and kde)
            Kind of plot to draw.

        ratio:
            Ratio of joint axes height to marginal axes height.
            (Determines distplot like plots dimensions.)

    Credit to seaborn's author:
    Michael Waskom
    Git username: mwaskom
    Link: http://tinyurl.com/v9pxsoy

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    try:
        # Both features must exist before plotting.
        check_if_feature_exists(df,
                                feature_name)
        check_if_feature_exists(df,
                                other_feature_name)

        # Error check on null data
        if np.sum(df[feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments(
                "Jointplot plot graph couldn't be generated because " +
                f"there is only missing data to display in {feature_name}!")

        if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments(
                "Jointplot plot graph couldn't be generated because " +
                f"there is only missing data to display in {other_feature_name}!")

        if display_print:
            print(f"Generating jointplot graph on {feature_name} by {other_feature_name}")

        # Closes up any past graph info
        plt.close('all')

        # Jointplot only takes one size value; use the smaller dimension.
        if figsize[0] < figsize[1]:
            height = figsize[0]
        else:
            height = figsize[1]

        tmp_df = copy.deepcopy(df[[feature_name, other_feature_name]])
        # BUGFIX: dropna() previously discarded its result (no-op);
        # drop rows with missing values in place like the sibling methods.
        tmp_df.dropna(inplace=True)

        if not kind:
            kind = "scatter"

        # Suppress any warnings that the seaborn's backend raises
        warnings.filterwarnings("ignore")
        if kind == "scatter and kde":
            # Scatter base with kde contours layered on the joint axes.
            g = sns.jointplot(feature_name,
                              other_feature_name,
                              data=tmp_df,
                              kind="scatter",
                              color=color,
                              ratio=ratio,
                              height=height).plot_joint(sns.kdeplot, zorder=0,
                                                        n_levels=6)
        else:
            g = sns.jointplot(feature_name,
                              other_feature_name,
                              data=tmp_df,
                              kind=kind,
                              color=color,
                              ratio=ratio,
                              height=height)
        warnings.filterwarnings("default")

        # Leave room for the suptitle above the joint axes.
        plt.subplots_adjust(top=0.93)
        g.fig.suptitle("Jointplot: " + f"{feature_name} by {other_feature_name}")

        # Pass a default name if needed
        if not filename:
            filename = f"Jointplot plot graph for {feature_name} by {other_feature_name} using {kind}"

        # Create string sub directory path
        if not sub_dir:
            sub_dir = f"{dataset_name}/{feature_name}"

        # -----
        if save_file:
            # perform_analysis already validated the snapshot once.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_plot(df=df,
                           df_features=self.__df_features,
                           filename=filename,
                           sub_dir=sub_dir,
                           dataframe_snapshot=dataframe_snapshot,
                           suppress_runtime_errors=suppress_runtime_errors,
                           meta_data=not self.__called_from_perform)

        if self.__notebook_mode and display_visuals:
            plt.show()

    except SnapshotMismatchError as e:
        # Snapshot mismatches must always surface to the caller.
        raise e

    except Exception as e:
        warnings.filterwarnings("default")
        if suppress_runtime_errors:
            warnings.warn(
                f"Jointplot plot graph raised an error on feature '{feature_name}':\n{str(e)}",
                RuntimeWarning)
        else:
            raise e

    finally:
        plt.close('all')
        warnings.filterwarnings("default")
def value_counts_table(self,
                       df,
                       feature_name,
                       dataset_name,
                       display_visuals=True,
                       display_print=True,
                       filename=None,
                       sub_dir=None,
                       save_file=True,
                       dataframe_snapshot=True,
                       suppress_runtime_errors=True):
    """
    Create a value-counts table for the given feature and optionally
    display it and save it out as a png.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated table will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to a pre-defined string;
            unless it is set to an actual filename.

        sub_dir: string
            Specify the sub directory to append to the pre-defined folder path.

        save_file: bool
            Saves file if set to True; doesn't if set to False.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any runtime
            errors so the program can keep running.

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    try:
        # Fail fast when the feature isn't part of the dataframe.
        check_if_feature_exists(df,
                                feature_name)

        # A column made entirely of nulls has nothing to tabulate.
        if np.sum(df[feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments("Values count table couldn't be generated because " +
                                         f"there is only missing data to display in {feature_name}!")

        if display_print:
            print(f"Creating value counts table for feature {feature_name}.")

        # Delegate the actual tabulation to the shared pandas utility.
        counts_table = value_counts_table(df,
                                          feature_name)

        # Rich display in notebooks, plain print everywhere else.
        if display_visuals:
            if self.__notebook_mode:
                display(counts_table)
            else:
                print(counts_table)

        # Fall back to a default filename when none was supplied.
        if not filename:
            filename = f"{feature_name} Value Counts Table"

        # Fall back to the conventional sub directory layout.
        if not sub_dir:
            sub_dir = f"{dataset_name}/{feature_name}"

        if save_file:
            # Runs triggered by perform already validated the snapshot.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_table_as_plot(df=df,
                                    df_features=self.__df_features,
                                    filename=filename,
                                    sub_dir=sub_dir,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    table=counts_table,
                                    show_index=True,
                                    meta_data=not self.__called_from_perform)

    except SnapshotMismatchError as e:
        # Snapshot mismatches must always surface to the caller.
        raise e

    except Exception as e:
        if suppress_runtime_errors:
            warnings.warn(
                f"Value count table raised an error on feature '{feature_name}':\n{str(e)}",
                RuntimeWarning)
        else:
            raise e

    finally:
        plt.close('all')
def descr_table(self,
                df,
                feature_name,
                dataset_name,
                display_visuals=True,
                display_print=True,
                filename=None,
                sub_dir=None,
                save_file=True,
                dataframe_snapshot=True,
                suppress_runtime_errors=True):
    """
    Create a descriptive-statistics table (mean, std, quartiles, etc.)
    for a numerical feature and optionally display/save it as a png.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated table will be inner-nested in.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to a pre-defined string;
            unless it is set to an actual filename.

        sub_dir: string
            Specify the sub directory to append to the pre-defined folder path.

        save_file: bool
            Saves file if set to True; doesn't if set to False.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any runtime
            errors so the program can keep running.

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    try:
        # Fail fast when the feature isn't part of the dataframe.
        check_if_feature_exists(df,
                                feature_name)

        # A column made entirely of nulls has nothing to describe.
        if np.sum(df[feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments("Descr table couldn't be generated because " +
                                         f"there is only missing data to display in {feature_name}!")

        if display_print:
            print(f"Creating data description table for {feature_name}")

        # Shared pandas utility builds the actual description frame.
        description_df = descr_table(df,
                                     feature_name,
                                     to_numeric=True)

        # Rich display in notebooks, plain print everywhere else.
        if display_visuals:
            if self.__notebook_mode:
                display(description_df)
            else:
                print(description_df)

        # Fall back to a default filename when none was supplied.
        if not filename:
            filename = f"{feature_name} Description Table"

        # Fall back to the conventional sub directory layout.
        if not sub_dir:
            sub_dir = f"{dataset_name}/{feature_name}"

        if save_file:
            # Runs triggered by perform already validated the snapshot.
            if self.__called_from_perform:
                dataframe_snapshot = False

            self.save_table_as_plot(df=df,
                                    df_features=self.__df_features,
                                    filename=filename,
                                    sub_dir=sub_dir,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    table=description_df,
                                    meta_data=not self.__called_from_perform,
                                    show_index=True)

    except SnapshotMismatchError as e:
        # Snapshot mismatches must always surface to the caller.
        raise e

    except Exception as e:
        if suppress_runtime_errors:
            warnings.warn(
                f"Descr table raised an error on feature '{feature_name}':\n{str(e)}",
                RuntimeWarning)
        else:
            raise e

    finally:
        plt.close('all')
def group_by_feature_value_count_table(self,
                                       df,
                                       feature_name,
                                       dataset_name,
                                       other_feature_name,
                                       display_visuals=True,
                                       display_print=True,
                                       filename=None,
                                       sub_dir=None,
                                       save_file=True,
                                       dataframe_snapshot=True,
                                       suppress_runtime_errors=True):
    """
    Creates/Saves a table of row counts for every value pair of
    'feature_name' and 'other_feature_name', sorted by count.

    Note
        Creates a png of the table.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

        dataset_name: string
            The dataset's name; this will create a sub-directory in which your
            generated table will be inner-nested in.

        other_feature_name: string
            Feature to compare to.

        display_visuals: bool
            Boolean value to whether or not to display visualizations.

        display_print: bool
            Determines whether or not to print function's embedded print
            statements.

        filename: string
            If set to 'None' will default to a pre-defined string;
            unless it is set to an actual filename.

        sub_dir: string
            Specify the sub directory to append to the pre-defined folder path.

        save_file: bool
            Saves file if set to True; doesn't if set to False.

        dataframe_snapshot: bool
            Boolean value to determine whether or not generate and compare a
            snapshot of the dataframe in the dataset's directory structure.
            Helps ensure that data generated in that directory is correctly
            associated to a dataframe.

        suppress_runtime_errors: bool
            If set to true; when generating any graphs will suppress any runtime
            errors so the program can keep running.

    Raises:
        Raises error if the feature data is filled with only nulls or if
        the json file's snapshot of the given dataframe doesn't match the
        given dataframe.
    """
    try:
        # Both features must exist before any aggregation is attempted.
        check_if_feature_exists(df,
                                feature_name)

        # Error messages previously said "Count plot graph" (copy-paste);
        # corrected to name this table.
        if np.sum(df[feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments("Group by feature value count table couldn't be generated because " +
                                         f"there is only missing data to display in {feature_name}!")

        if np.sum(df[other_feature_name].isnull()) == df.shape[0]:
            raise UnsatisfiedRequirments("Group by feature value count table couldn't be generated because " +
                                         f"there is only missing data to display in {other_feature_name}!")

        if display_print:
            print(f"Creating group by {feature_name} and {other_feature_name} Table")

        # Count rows per (feature_name, other_feature_name) value pair.
        tmp_df = copy.deepcopy(df[[feature_name, other_feature_name]])
        tmp_df = tmp_df.groupby([feature_name, other_feature_name]).size().to_frame()
        tmp_df.columns = ["Counts"]

        if self.__notebook_mode:
            if display_visuals:
                display(tmp_df)
        else:
            if display_visuals:
                print(tmp_df)

        # Pass a default name if needed
        if not filename:
            filename = f"Group by {feature_name} and {other_feature_name} Table"

        # Create string sub directory path
        if not sub_dir:
            sub_dir = f"{dataset_name}/{feature_name}"

        # Most common pairs first.
        tmp_df.sort_values(by=["Counts"],
                           ascending=False,
                           inplace=True)

        if save_file:
            # perform_analysis already validated the snapshot once.
            if self.__called_from_perform:
                dataframe_snapshot = False

            # BUGFIX: previously passed df=tmp_df (the grouped table),
            # which broke snapshot/meta-data generation against the source
            # dataframe; pass the original df like the sibling table methods.
            self.save_table_as_plot(df=df,
                                    df_features=self.__df_features,
                                    filename=filename,
                                    sub_dir=sub_dir,
                                    dataframe_snapshot=dataframe_snapshot,
                                    suppress_runtime_errors=suppress_runtime_errors,
                                    table=tmp_df,
                                    show_index=True,
                                    meta_data=not self.__called_from_perform)

    except SnapshotMismatchError as e:
        # Snapshot mismatches must always surface to the caller.
        raise e

    except Exception as e:
        if suppress_runtime_errors:
            warnings.warn(
                f"Group by table raised an error on feature '{feature_name}' by '{other_feature_name}':\n{str(e)}",
                RuntimeWarning)
        else:
            raise e

    finally:
        plt.close('all')
def __get_feature_colors(self,
                         df,
                         feature_name):
    """
    Creates a dict object of all possible feature values with their
    associated colors.

    Note
        Any unknown feature values that aren't declared
        by df_features are given a default color from the constants
        section of the project. Goes up to 20 different colors until
        colors is init to None.

    Args:
        df: pd.Dataframe
            Pandas DataFrame object.

        feature_name: string
            Specified feature column name.

    Returns:
        Gives back a dictionary object of all possible feature values
        with their associated colors; None when no mapping is declared or
        the fallback palette is exhausted.
    """
    colors = self.__df_features.get_feature_colors(feature_name)
    feature_value_representation = self.__df_features.get_feature_value_representation()

    if colors:
        if isinstance(colors, dict):
            feature_values = df[feature_name].value_counts(
                sort=False).keys().to_list()

            decoder = self.__df_features.get_label_decoder()

            # Add color feature value for decoders values
            if feature_name in decoder.keys():
                for cat, val in decoder[feature_name].items():
                    if cat in colors.keys():
                        hex_code = colors[cat]
                        colors[decoder[feature_name][cat]] = hex_code
                    elif val in colors.keys():
                        hex_code = colors[val]
                        colors[cat] = hex_code

            # Add color feature value for different value representation
            if feature_name in feature_value_representation.keys():
                for val in feature_value_representation[
                        feature_name].keys():
                    if val in colors.keys():
                        hex_code = colors[val]
                        colors[feature_value_representation[
                            feature_name][val]] = hex_code

            # Assign default colors to any values still missing a color.
            i = 0
            for value in feature_values:
                if value not in colors.keys():
                    colors[value] = \
                        GRAPH_DEFAULTS.DEFINED_LIST_OF_RANDOM_COLORS[i]
                    i += 1
                    if i == len(
                            GRAPH_DEFAULTS.DEFINED_LIST_OF_RANDOM_COLORS):
                        # Too many unknown values to color consistently.
                        # BUGFIX: must stop iterating here — continuing with
                        # colors=None raised AttributeError on the next
                        # missing value.
                        colors = None
                        break

    return colors
def __sort_two_lists(self,
                     sort_values,
                     other_list):
    """
    Sort's two collections by the first collection passed in.

    Args:
        sort_values: collection
            Values to be sorted by.

        other_list: collection
            Values that get sorted based on 'sort_values'.

    Returns:
        Returns back those two lists sorted; a pair of empty lists when
        the input collections are empty.
    """
    # Pair as (other, sort_value) so the sort key is the second element.
    paired = sorted(list(zip(other_list, sort_values)),
                    key=lambda x: x[1])

    # BUGFIX: empty inputs previously raised IndexError on tmp[1].
    if not paired:
        return [], []

    tmp = list(zip(*paired))
    return list(tmp[1]), list(tmp[0])