Source code for transparentai.datasets.variable.correlation

import warnings
import scipy.stats as ss
import gc

import pandas as pd
import numpy as np

from transparentai import utils


def cramers_v(x, y):
    """Returns the bias-corrected Cramer V value of two categorical
    variables using chi square. This correlation metric is between 0 and 1.

    Code source found in this article :
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

    Parameters
    ----------
    x: array like
        first categorical variable
    y: array like
        second categorical variable

    Returns
    -------
    float:
        Cramer V value. 0.0 is returned when the association cannot be
        measured: fewer than two observations, or a corrected
        dimension that collapses to zero (e.g. one variable is constant).
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()

    # The bias correction below divides by (n - 1); with zero or one
    # observation that yields NaN, and chi2_contingency rejects an empty
    # table, so bail out before computing anything.
    if n <= 1:
        return 0.0

    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n
    r, k = confusion_matrix.shape

    # Bias-corrected phi squared and table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # `<=` rather than `==` so floating point rounding cannot let a tiny
    # negative denominator reach the square root.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)


def merge_corr_df(df_list):
    """Combines the three matrices produced inside compute_correlation()
    into a single square matrix.

    Caution : the Cramers V block is scaled from 0 to 1 whereas the
    Pearson and Point Biserial blocks are scaled from -1 to 1. Keep this
    in mind when reading the merged matrix.

    Parameters
    ----------
    df_list: list
        Correlation matrices, in this order : pearson_corr,
        cramers_v_corr and pbs_corr

    Returns
    -------
    pd.DataFrame:
        Merged dataframe of correlation matrices
    """
    pearson_corr, cramers_v_corr, pbs_corr = (
        df_list[0], df_list[1], df_list[2])

    # pbs_corr is indexed by categorical features, with numerical
    # features as columns.
    categorical = pbs_corr.index.values.tolist()
    numerical = pbs_corr.columns.values.tolist()
    all_feats = categorical + numerical

    merged = utils.init_corr_matrix(all_feats, all_feats)

    merged.loc[numerical, numerical] = pearson_corr.loc[numerical, numerical]
    merged.loc[categorical, categorical] = (
        cramers_v_corr.loc[categorical, categorical])

    # Mirror every categorical/numerical value on both sides of the diagonal
    for cat_name in categorical:
        for num_name in numerical:
            value = pbs_corr.loc[cat_name, num_name]
            merged.loc[cat_name, num_name] = value
            merged.loc[num_name, cat_name] = value

    return merged


def compute_cramers_v_corr(df):
    """Computes Cramers V correlation for a dataframe.

    `Cramers V Wikipedia definition`_ :

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and denoted as φc)
    is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive).
    It is based on Pearson's chi-squared statistic and was published by Harald Cramér in 1946.

    .. _Cramers V Wikipedia definition: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute
        Cramers V correlation (every column is treated as categorical)

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed for Cramers V coeff

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    cat_feats = df.columns.values.tolist()

    cramers_v_corr = utils.init_corr_matrix(
        columns=cat_feats, index=cat_feats)

    # Walk the upper triangle only: each unordered pair is computed once
    # and mirrored. No sorting of labels is involved, so columns whose
    # names have different types (e.g. int and str) are supported.
    for position, var1 in enumerate(cat_feats):
        for var2 in cat_feats[position + 1:]:
            if var1 == var2:
                continue
            corr = cramers_v(df[var1], df[var2])
            cramers_v_corr.loc[var1, var2] = corr
            cramers_v_corr.loc[var2, var1] = corr

    return cramers_v_corr


def compute_pointbiserialr_corr(df, cat_feats=None, num_feats=None):
    """Computes Point Biserial correlation for a dataframe.

    `Point Biserial Wikipedia definition`_ :

    The point biserial correlation coefficient (rpb) is a correlation coefficient used when one variable (e.g. Y)
    is dichotomous; Y can either be "naturally" dichotomous, like whether a coin lands heads or tails,
    or an artificially dichotomized variable. In most situations it is not advisable to dichotomize variables
    artificially[citation needed]. When a new variable is artificially dichotomized the new dichotomous variable may
    be conceptualized as having an underlying continuity. If this is the case, a biserial correlation
    would be the more appropriate calculation.

    .. _Point Biserial Wikipedia definition: https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute
        Point Biserial correlation
    cat_feats: list or None (default None)
        Names of the categorical columns (rows of the result).
        If None, num_feats must be None too and both lists are
        inferred from the column dtypes.
    num_feats: list or None (default None)
        Names of the numerical columns (columns of the result).
        If None, cat_feats must be None too.

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed for Point Biserial coeff,
        indexed by cat_feats with num_feats as columns

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    ValueError:
        cat_feats and num_feats must be set or be both None
    TypeError:
        cat_feats must be a list
    TypeError:
        num_feats must be a list
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    # Exactly one of the two lists being provided is an error
    if (cat_feats is None) != (num_feats is None):
        raise ValueError('cat_feats and num_feats must be set or be both None')

    if cat_feats is None:
        # Both are None here: infer the split from the dtypes
        num_feats = df.select_dtypes('number').columns.values.tolist()
        cat_feats = [c for c in df.columns if c not in num_feats]
    else:
        # Type checks only apply to user-provided values, so that the
        # documented "both None" call keeps working.
        if not isinstance(cat_feats, list):
            raise TypeError('cat_feats must be a list')
        if not isinstance(num_feats, list):
            raise TypeError('num_feats must be a list')

    data_encoded, _ = utils.encode_categorical_vars(df)

    pbs_corr = utils.init_corr_matrix(
        columns=num_feats, index=cat_feats, fill_diag=0.)

    for cat_feat in cat_feats:
        for num_feat in num_feats:
            if cat_feat == num_feat:
                continue
            tmp_df = data_encoded[[cat_feat, num_feat]].dropna()
            # A correlation needs at least two complete observations;
            # otherwise the cell keeps its initial value.
            if len(tmp_df) < 2:
                continue
            corr, _ = ss.pointbiserialr(
                tmp_df[cat_feat], tmp_df[num_feat]
            )
            pbs_corr.loc[cat_feat, num_feat] = corr

    return pbs_corr


def compute_correlation(df, nrows=None, max_cat_val=100):
    """Computes differents correlations matrix for
    three cases and merge them:

    - numerical to numerical (using Pearson coeff)
    - categorical to categorical (using Cramers V & Chi square)
    - numerical to categorical (discrete) (using Point Biserial)

    .. raw:: html

        <b>/!\\ ==== Caution ==== /!\\</b>

    This matrix has a flaw : the cramers_v_corr is scaled from 0 to 1, but
    the others are scaled from -1 to 1. Be sure to understand this.

    `Pearson coeff Wikipedia definition`_ :

    In statistics, the Pearson correlation coefficient, also referred to as Pearson's r,
    the Pearson product-moment correlation coefficient (PPMCC) or the bivariate correlation,
    is a statistic that measures linear correlation between two variables X and Y.
    It has a value between +1 and −1, where 1 is total positive linear correlation,
    0 is no linear correlation, and −1 is total negative linear correlation
    (that the value lies between -1 and 1 is a consequence of the Cauchy–Schwarz inequality).
    It is widely used in the sciences.

    `Cramers V Wikipedia definition`_ :

    In statistics, Cramér's V (sometimes referred to as Cramér's phi and denoted as φc)
    is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive).
    It is based on Pearson's chi-squared statistic and was published by Harald Cramér in 1946.

    `Point Biserial Wikipedia definition`_ :

    The point biserial correlation coefficient (rpb) is a correlation coefficient used when one variable (e.g. Y)
    is dichotomous; Y can either be "naturally" dichotomous, like whether a coin lands heads or tails,
    or an artificially dichotomized variable. In most situations it is not advisable to dichotomize variables
    artificially[citation needed]. When a new variable is artificially dichotomized the new dichotomous variable may
    be conceptualized as having an underlying continuity. If this is the case, a biserial correlation
    would be the more appropriate calculation.

    .. _Pearson coeff Wikipedia definition: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. _Cramers V Wikipedia definition: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    .. _Point Biserial Wikipedia definition: https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient

    Parameters
    ----------
    df: pd.DataFrame
        pandas Dataframe with values to compute correlation
    nrows: None or int or float (default None)
        If not None reduce the data to a sample of nrows rows.
        A value below 1 is read as a fraction : the sample then has
        int(len(df) * nrows) rows. A value above len(df) is capped
        to len(df).
    max_cat_val: int or None (default 100)
        Number max of unique values in a categorical feature
        if there are more distinct values than this number
        then the feature is ignored (a warning is emitted)

    Returns
    -------
    pd.DataFrame:
        Correlation matrix computed with Pearson coeff for numerical features
        to numerical features, Cramers V for categorical features to categorical features and
        Point Biserial for categorical features to numerical features.
        Only the Pearson matrix is returned if there is no categorical
        feature, only the Cramers V matrix if there is no numerical
        feature, and an empty dataframe if there is no feature at all.

    Raises
    ------
    TypeError:
        Must provide a pandas DataFrame representing the data
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            "Must provide a pandas DataFrame representing the data")

    df = df.copy()

    if nrows is not None:
        if nrows < 1.:
            nrows = int(len(df) * nrows)
        else:
            # DataFrame.sample needs an integer count no larger than the frame
            nrows = min(int(nrows), len(df))
        # Fixed seed passed locally so that the sample is reproducible
        # without reseeding the global NumPy random state of the caller.
        df = df.sample(nrows, random_state=42)

    num_feats = df.select_dtypes('number').columns.values.tolist()
    cat_feats = [c for c in df.columns if c not in num_feats]

    if max_cat_val is not None:
        kept_cat_feats = []
        for feat in cat_feats:
            if df[feat].nunique() > max_cat_val:
                warnings.warn('%s feature ignored because there are more than %i unique values' % (
                    feat, max_cat_val))
            else:
                kept_cat_feats.append(feat)
        cat_feats = kept_cat_feats

    # Nothing left to correlate
    if not num_feats and not cat_feats:
        return pd.DataFrame()

    # Pearson's Correlation for numerical var
    pearson_corr = None
    if num_feats:
        pearson_corr = df[num_feats].corr()
        gc.collect()

    if not cat_feats:
        return pearson_corr

    # Cramer's V Correlation for categorical var
    cramers_v_corr = compute_cramers_v_corr(df[cat_feats])
    gc.collect()

    if not num_feats:
        return cramers_v_corr

    # Point Biserial Correlation for categorical and numerical var
    pbs_corr = compute_pointbiserialr_corr(df, cat_feats, num_feats)
    gc.collect()

    return merge_corr_df([pearson_corr, cramers_v_corr, pbs_corr])