Source code for transparentai.datasets.variable.variable

# Names exported by ``from ... import *``. ``describe_common`` is defined in
# this module but is not listed here.
__all__ = [
    'describe_number',
    'describe_datetime',
    'describe_object',
    'describe'
]

import pandas as pd
import numpy as np
from scipy import stats

from transparentai import utils


def describe_common(arr):
    """Common descriptive statistics about an array.

    Returned statistics:

    - Count of valid values
    - Count of missing values

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    dict
        Dictionary with the keys ``'valid values'`` and ``'missing values'``

    Raises
    ------
    TypeError:
        arr is not an array like
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    # isinstance (rather than an exact type match) also covers subclasses of
    # list / ndarray, which have no ``isna`` method of their own.
    if isinstance(arr, (list, np.ndarray)):
        arr = pd.Series(arr)

    missing_val = arr.isna().sum()

    return {
        'valid values': len(arr) - missing_val,
        'missing values': missing_val,
    }


def describe_number(arr):
    """Descriptive statistics about a number array.

    Returned statistics:

    - Count of valid values
    - Count of missing values
    - Mean
    - Mode
    - Min
    - Quantile 25%
    - Median
    - Quantile 75%
    - Max

    Missing values (NaN) are counted in ``'missing values'`` and ignored by
    every other statistic. When several values are tied for the mode, the
    smallest one is reported.

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    dict
        Dictionary with descriptive statistics

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a number array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'number':
        raise TypeError('arr is not a number array')

    desc = describe_common(arr)

    # pandas reductions skip NaN, so one missing value no longer turns every
    # statistic into NaN, and they do not depend on the return shape of
    # scipy.stats.mode, which differs between SciPy versions.
    values = arr if isinstance(arr, pd.Series) else pd.Series(arr)
    modes = values.mode()  # sorted ascending, NaN dropped

    desc['mean'] = np.round(values.mean(), 4)
    desc['mode'] = modes.iloc[0] if len(modes) else np.nan
    desc['min'] = values.min()
    desc['quantile 25%'] = values.quantile(0.25)
    desc['quantile 50%'] = values.median()
    desc['quantile 75%'] = values.quantile(0.75)
    desc['max'] = values.max()
    return desc
def describe_datetime(arr, format='%Y-%m-%d'):
    """Descriptive statistics about a datetime array.

    Returned statistics:

    - Count of valid values
    - Count of missing values
    - Count of unique values
    - Most common value
    - Min
    - Mean
    - Max

    Values that cannot be converted to a datetime are coerced to NaT and
    counted as missing. A statistic that has no valid value to report
    (e.g. every entry is missing) is returned as ``None``.

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from
    format: str
        ``strftime`` format used to render the returned datetime statistics
        (it is not used to parse the input)

    Returns
    -------
    dict
        Dictionary with descriptive statistics; datetime statistics are
        strings formatted with ``format``

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not a datetime array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'datetime':
        raise TypeError('arr is not a datetime array')

    if isinstance(arr, (list, np.ndarray)):
        arr = pd.Series(arr)
    # Unparseable entries become NaT instead of raising.
    arr = pd.to_datetime(arr, errors='coerce')

    def _render(timestamp):
        # NaT does not support strftime, so report None for it.
        return None if pd.isna(timestamp) else timestamp.strftime(format)

    desc = describe_common(arr)
    modes = arr.mode()  # empty when there is no valid value

    desc['unique values'] = arr.nunique()
    desc['most common'] = _render(modes.iloc[0]) if len(modes) else None
    desc['min'] = _render(arr.min())
    desc['mean'] = _render(arr.mean())
    desc['max'] = _render(arr.max())
    return desc
def describe_object(arr):
    """Descriptive statistics about an object array.

    Returned statistics:

    - Count of valid values
    - Count of missing values
    - Count of unique values
    - Most common value

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    dict
        Dictionary with descriptive statistics; ``'most common'`` is
        ``None`` when the array holds no valid value

    Raises
    ------
    TypeError:
        arr is not an array like
    TypeError:
        arr is not an object array
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')
    if utils.find_dtype(arr) != 'object':
        raise TypeError('arr is not an object array')

    # Same conversion rule as the other describers; no throwaway array is
    # built just to obtain the ndarray type.
    if isinstance(arr, (list, np.ndarray)):
        arr = pd.Series(arr)

    desc = describe_common(arr)
    modes = arr.mode()  # empty when every value is missing

    desc['unique values'] = arr.nunique()
    desc['most common'] = modes.iloc[0] if len(modes) else None
    return desc
def describe(arr):
    """Descriptive statistics about an array.

    Depending on the detected dtype (number, date, object)
    it returns specific stats.

    Common statistics for all dtype (using describe_common):

    - Count of valid values
    - Count of missing values

    Number statistics (using describe_number):

    - Mean
    - Mode
    - Min
    - Quantile 25%
    - Median
    - Quantile 75%
    - Max

    Datetime statistics (using describe_datetime):

    - Count of unique values
    - Most common value
    - Min
    - Mean
    - Max

    Object statistics (using describe_object):

    - Count of unique values
    - Most common value

    Parameters
    ----------
    arr: array like
        Array of values to get descriptive statistics from

    Returns
    -------
    dict or None
        Dictionary with descriptive statistics, or ``None`` when the
        detected dtype is none of number, object or datetime

    Raises
    ------
    TypeError:
        arr is not an array like
    """
    if not utils.is_array_like(arr):
        raise TypeError('arr is not an array like')

    # Normalise to a NumPy array before dtype detection.
    if isinstance(arr, list):
        arr = np.array(arr)
    elif isinstance(arr, (pd.Series, pd.DataFrame)):
        arr = arr.to_numpy()

    dtype = utils.find_dtype(arr)

    if dtype == 'number':
        return describe_number(arr)
    if dtype == 'object':
        return describe_object(arr)
    if dtype == 'datetime':
        return describe_datetime(arr)

    return None