Source code for transparentai.datasets.variable.correlation

import warnings
import scipy.stats as ss
import gc

import pandas as pd
import numpy as np

from transparentai import utils


def cramers_v(x, y):
    """Returns the corrected Cramer V value of two categorical variables.

    The value is derived from the chi square statistic of the contingency
    table of ``x`` and ``y`` and lies between 0 and 1.

    Code source found in this article :
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

    Parameters
    ----------
    x: array like
        first categorical variable
    y: array like
        second categorical variable

    Returns
    -------
    float:
        Cramer V value. The integer 0 is returned when the smaller of the
        corrected table dimensions, minus one, is exactly 0 (e.g. one of
        the variables takes a single value).
    """
    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape
    total = contingency.sum().sum()

    chi2_stat = ss.chi2_contingency(contingency)[0]
    phi2 = chi2_stat / total

    # Correction terms from the referenced article: phi2 is shrunk (and
    # floored at 0) and both table dimensions are adjusted by the sample size.
    phi2_corrected = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    denominator = min((cols_corrected - 1), (rows_corrected - 1))
    if denominator == 0:
        # Avoid dividing by zero when a corrected dimension collapses to 1.
        return 0

    return np.sqrt(phi2_corrected / denominator)
def merge_corr_df(df_list):
    """Merges the correlation matrices from compute_correlation() into one.

    Needs 3 dataframes, in this order: pearson_corr, cramers_v_corr and
    pbs_corr. The categorical features are read from the index of pbs_corr
    and the numerical features from its columns.

    This matrix has a default : the cramers_v_corr is scaled from 0 to 1,
    but the others are from -1 to 1. Be sure to understand this.

    Parameters
    ----------
    df_list: list
        List of correlation matrices

    Returns
    -------
    pd.DataFrame:
        Merged dataframe of correlation matrices
    """
    pearson, cramers, pbs = df_list[0], df_list[1], df_list[2]

    categorical = pbs.index.values.tolist()
    numerical = pbs.columns.values.tolist()
    all_feats = categorical + numerical

    merged = utils.init_corr_matrix(all_feats, all_feats)

    # Same-kind blocks are copied over in one assignment each.
    merged.loc[numerical, numerical] = pearson.loc[numerical, numerical]
    merged.loc[categorical, categorical] = cramers.loc[categorical,
                                                       categorical]

    # Mixed blocks: pbs only stores (categorical, numerical), so every value
    # is written to both symmetric cells of the merged matrix.
    for cat_name in categorical:
        for num_name in numerical:
            value = pbs.loc[cat_name, num_name]
            merged.loc[cat_name, num_name] = value
            merged.loc[num_name, cat_name] = value

    return merged
def compute_cramers_v_corr(df):
    """Computes Cramers V correlation for a dataframe.

    `Cramers V Wikipedia definition`_ :

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and
    denoted as φc) is a measure of association between two nominal
    variables, giving a value between 0 and +1 (inclusive). It is based on
    Pearson's chi-squared statistic and was published by Harald Cramér in
    1946.

    .. _Cramers V Wikipedia definition: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Every column of ``df`` is treated as a categorical feature and each
    unordered pair of distinct columns is evaluated once with cramers_v().

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute Cramers V correlation

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed for Cramers V coeff

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    """
    if type(df) is not pd.DataFrame:
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    columns = df.columns.values.tolist()

    # Sorting each pair before putting it in a set collapses (a, b) and
    # (b, a) into a single entry, so each pair is computed only once.
    pairs = {
        tuple(sorted([left, right]))
        for left in columns
        for right in columns
        if left != right
    }

    matrix = utils.init_corr_matrix(columns=columns, index=columns)

    for first, second in pairs:
        value = cramers_v(df[first], df[second])
        # The measure is symmetric: fill both cells.
        matrix.loc[first, second] = value
        matrix.loc[second, first] = value

    return matrix
def compute_pointbiserialr_corr(df, cat_feats=None, num_feats=None):
    """Computes Point Biserial correlation for a dataframe.

    `Point Biserial Wikipedia definition`_ :

    The point biserial correlation coefficient (rpb) is a correlation
    coefficient used when one variable (e.g. Y) is dichotomous; Y can either
    be "naturally" dichotomous, like whether a coin lands heads or tails, or
    an artificially dichotomized variable. In most situations it is not
    advisable to dichotomize variables artificially[citation needed]. When a
    new variable is artificially dichotomized the new dichotomous variable
    may be conceptualized as having an underlying continuity. If this is the
    case, a biserial correlation would be the more appropriate calculation.

    .. _Point Biserial Wikipedia definition: https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute Point Biserial correlation
    cat_feats: list or None (default None)
        Names of the categorical features (rows of the result). If None,
        num_feats must be None too and the categorical features are every
        column that is not selected as numerical.
    num_feats: list or None (default None)
        Names of the numerical features (columns of the result). If None,
        cat_feats must be None too and the numerical features are the
        columns selected by ``df.select_dtypes('number')``.

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed for Point Biserial coeff, with the
        categorical features as index and the numerical ones as columns

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    ValueError:
        cat_feats and num_feats must be set or be both None
    TypeError:
        cat_feats must be a list
    TypeError:
        num_feats must be a list
    """
    # Exact type check kept as-is so this function accepts precisely the
    # same inputs as the other functions of this module.
    if type(df) is not pd.DataFrame:
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    # Exactly one of the two lists being None is an inconsistent call.
    if (cat_feats is None) != (num_feats is None):
        raise ValueError('cat_feats and num_feats must be set or be both None')

    if cat_feats is None:
        # Both are None at this point: infer the split from the dtypes.
        num_feats = df.select_dtypes('number').columns.values.tolist()
        cat_feats = [c for c in df.columns if c not in num_feats]
    else:
        # The original built these TypeError objects without raising them,
        # so invalid inputs slipped through. Validate only user-supplied
        # values so the documented "both None" call remains valid.
        if not isinstance(cat_feats, list):
            raise TypeError('cat_feats must be a list')
        if not isinstance(num_feats, list):
            raise TypeError('num_feats must be a list')

    # NOTE(review): encode_categorical_vars is a project helper; presumably
    # it maps categorical columns to numeric codes - confirm in utils.
    data_encoded, _ = utils.encode_categorical_vars(df)

    pbs_corr = utils.init_corr_matrix(
        columns=num_feats, index=cat_feats, fill_diag=0.)

    for cat_feat in cat_feats:
        for num_feat in num_feats:
            if cat_feat == num_feat:
                continue

            # Only rows where both features are present can be correlated.
            pair = data_encoded[[cat_feat, num_feat]].dropna()
            if len(pair) == 0:
                # Nothing to correlate: the cell keeps its initial value.
                continue

            corr, _p_value = ss.pointbiserialr(pair[cat_feat], pair[num_feat])
            pbs_corr.loc[cat_feat, num_feat] = corr

    return pbs_corr
def compute_correlation(df, nrows=None, max_cat_val=100):
    r"""Computes different correlation matrices for three cases and merges them:

    - numerical to numerical (using Pearson coeff)
    - categorical to categorical (using Cramers V & Chi square)
    - numerical to categorical (discrete) (using Point Biserial)

    .. raw:: html

        <b>/!\ ==== Caution ==== /!\</b>

    This matrix has a default : the cramers_v_corr is scaled from 0 to 1,
    but the others are from -1 to 1. Be sure to understand this.

    `Pearson coeff Wikipedia definition`_ :

    In statistics, the Pearson correlation coefficient, also referred to as
    Pearson's r, the Pearson product-moment correlation coefficient (PPMCC)
    or the bivariate correlation, is a statistic that measures linear
    correlation between two variables X and Y. It has a value between +1 and
    −1, where 1 is total positive linear correlation, 0 is no linear
    correlation, and −1 is total negative linear correlation (that the value
    lies between -1 and 1 is a consequence of the Cauchy–Schwarz
    inequality). It is widely used in the sciences.

    `Cramers V Wikipedia definition`_ :

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and
    denoted as φc) is a measure of association between two nominal
    variables, giving a value between 0 and +1 (inclusive). It is based on
    Pearson's chi-squared statistic and was published by Harald Cramér in
    1946.

    `Point Biserial Wikipedia definition`_ :

    The point biserial correlation coefficient (rpb) is a correlation
    coefficient used when one variable (e.g. Y) is dichotomous; Y can either
    be "naturally" dichotomous, like whether a coin lands heads or tails, or
    an artificially dichotomized variable. In most situations it is not
    advisable to dichotomize variables artificially[citation needed]. When a
    new variable is artificially dichotomized the new dichotomous variable
    may be conceptualized as having an underlying continuity. If this is the
    case, a biserial correlation would be the more appropriate calculation.

    .. _Pearson coeff Wikipedia definition: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. _Cramers V Wikipedia definition: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    .. _Point Biserial Wikipedia definition: https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute correlation
    nrows: None or int or float (default None)
        If not None reduce the data to a sample of nrows if int
        else if float reduce to len(df) * nrows
    max_cat_val: int or None (default 100)
        Number max of unique values in a categorical feature
        if there are more distinct values than this number then
        the feature is ignored

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed with Pearson coeff for numerical
        features to numerical features, Cramers V for categorical features
        to categorical features and Point Biserial for categorical features
        to numerical features. Only the Pearson matrix is returned when no
        categorical feature is kept, only the Cramers V matrix when there is
        no numerical feature, and an empty DataFrame when no feature is
        left at all.

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    """
    if type(df) is not pd.DataFrame:
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    df = df.copy()

    if nrows is not None:
        if nrows < 1.:
            # A value below 1 is read as a fraction of the rows.
            nrows = int(len(df) * nrows)
        elif nrows > len(df):
            nrows = len(df)

        # Fixed seed so the same sample is drawn on every call.
        # NOTE: this reseeds numpy's global random state (original behavior).
        np.random.seed(42)
        df = df.sample(nrows)

    num_feats = df.select_dtypes('number').columns.values.tolist()
    cat_feats = [c for c in df.columns if c not in num_feats]

    if max_cat_val is not None:
        ignore_cat_feats = list()
        for feat in cat_feats:
            if df[feat].nunique() > max_cat_val:
                ignore_cat_feats.append(feat)
                warnings.warn('%s feature ignored because there are more than %i unique values' % (
                    feat, max_cat_val))

        cat_feats = [v for v in cat_feats if v not in ignore_cat_feats]

    has_num = len(num_feats) > 0
    has_cat = len(cat_feats) > 0

    if not has_num and not has_cat:
        # No usable feature: the original failed here on an unbound local.
        return pd.DataFrame()

    pearson_corr = cramers_v_corr = pbs_corr = None

    # Pearson's Correlation for numerical var
    if has_num:
        pearson_corr = df[num_feats].corr()
        gc.collect()

    # Cramer's V Correlation for categorical var
    if has_cat:
        cramers_v_corr = compute_cramers_v_corr(df[cat_feats])
        gc.collect()

    # Point Biserial Correlation for categorical and numerical var
    if has_num and has_cat:
        pbs_corr = compute_pointbiserialr_corr(df, cat_feats, num_feats)
        gc.collect()

    if not has_cat:
        return pearson_corr
    if not has_num:
        # The original tested cat_feats twice, so this branch was never
        # reached and categorical-only data crashed in the merge step.
        return cramers_v_corr

    return merge_corr_df([pearson_corr, cramers_v_corr, pbs_corr])