Source code for transparentai.utils.utils

import numpy as np
import pandas as pd
import warnings

from sklearn.preprocessing import LabelEncoder


def find_dtype(arr, len_sample=1000):
    """Find the general dtype of an array.
    Three possible dtypes :

    - Number
    - Datetime
    - Object

    Parameters
    ----------
    arr: array-like
        Array to inspect
    len_sample: int (default 1000)
        Maximum number of items to analyse:
        if len_sample > len(arr) then use len(arr)

    Returns
    -------
    str:
        dtype string ('number', 'datetime' or 'object')

    Raises
    ------
    TypeError:
        arr is not an array like
    """
    if not is_array_like(arr):
        raise TypeError('arr is not an array like')

    if type(arr) in [list, np.ndarray]:
        arr = pd.DataFrame(arr)
    elif type(arr) == pd.Series:
        arr = arr.to_frame()

    n = len_sample if len(arr) > len_sample else len(arr)
    arr = arr.iloc[:n]

    is_number = arr.select_dtypes('number').shape[1] > 0
    is_datetime = arr.select_dtypes(['datetime', 'datetimetz']).shape[1] > 0

    if is_number:
        return 'number'
    elif is_datetime:
        return 'datetime'

    try:
        pd.to_datetime(arr)
        return 'datetime'
    except Exception:
        pass

    return 'object'
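
# Illustrative usage sketch (not part of the original module) of find_dtype,
# assuming the pandas/numpy imports above:
#
#   find_dtype([1, 2, 3])                                  # 'number'
#   find_dtype(pd.Series(pd.to_datetime(['2020-01-01'])))  # 'datetime'
#   find_dtype(['a', 'b', 'c'])                            # 'object'
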
def is_array_like(obj, n_dims=1):
    """Returns whether an object is an array like.
    Valid dtypes are list, np.ndarray, pd.Series, pd.DataFrame.

    Parameters
    ----------
    obj:
        Object to inspect
    n_dims: int (default 1)
        Number of dimensions accepted

    Returns
    -------
    bool:
        Whether the object is an array like or not
    """
    dtype = type(obj)
    valid_types = [list, np.ndarray, pd.Series, pd.DataFrame]

    if dtype not in valid_types:
        return False

    if dtype == list:
        obj = np.array(obj)
    elif dtype != np.ndarray:
        obj = obj.to_numpy()

    if len(obj.shape) <= n_dims:
        return type(obj[0]) != list
    return obj.shape[n_dims] == 1
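
# Illustrative usage sketch (not part of the original module) of is_array_like:
#
#   is_array_like([1, 2, 3])                     # True  (1-dimensional list)
#   is_array_like(pd.DataFrame({'a': [1, 2]}))   # True  (single column)
#   is_array_like([[1, 2], [3, 4]])              # False (2 columns, n_dims=1)
#   is_array_like('abc')                         # False (not a valid type)
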
def format_describe_str(desc, max_len=20):
    """Returns a formatted list for the matplotlib table cellText argument.

    Each element of the list is like this : ['key    ', 'value    ']
    The number of spaces at the end of the value depends on the max_len argument.

    Parameters
    ----------
    desc: dict
        Dictionary returned by the variable.describe function
    max_len: int (default 20)
        Maximum length for the values

    Returns
    -------
    list(list):
        Formatted list for the matplotlib table cellText argument
    """
    res = {}
    _max = max([len(str(e)) for k, e in desc.items()])
    max_len = _max if _max < max_len else max_len

    n_valid = desc['valid values']
    n_missing = desc['missing values']
    n = n_valid + n_missing

    for k, e in desc.items():
        if k == 'valid values':
            e = str(e) + ' (' + str(int(n_valid * 100 / n)) + '%)'
        elif k == 'missing values':
            e = str(e) + ' (' + str(int(n_missing * 100 / n)) + '%)'
        else:
            e = str(e)

        e = e.ljust(max_len) if len(e) <= 15 else e[:max_len]
        res[k.ljust(15).title()] = e

    return [[k, e] for k, e in res.items()]
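
# Illustrative usage sketch (not part of the original module) of
# format_describe_str; the 'desc' dict below is a hypothetical output of a
# describe function containing 'valid values' and 'missing values' counts:
#
#   desc = {'valid values': 90, 'missing values': 10, 'mean': 3.5}
#   format_describe_str(desc)
#   # [['Valid Values   ', '90 (90%)'],
#   #  ['Missing Values ', '10 (10%)'],
#   #  ['Mean           ', '3.5']]
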
def preprocess_metrics(input_metrics, metrics_dict):
    """Preprocesses the input metrics so that each one maps to the
    appropriate function in the metrics_dict dictionary.

    input_metrics can contain str or function items. If an item is a string
    then it has to be a key of the metrics_dict dictionary.

    Returns a dictionary with the metric's name as key and the metric
    function as value.

    Parameters
    ----------
    input_metrics: list
        List of metrics to compute
    metrics_dict: dict
        Dictionary to compare input_metrics with

    Returns
    -------
    dict:
        Dictionary with metric's name as key and metric function as value

    Raises
    ------
    TypeError:
        input_metrics must be a list
    ValueError:
        No valid metrics found
    """
    if type(input_metrics) != list:
        raise TypeError('input_metrics must be a list')

    fn_dict = {}
    cnt_custom = 1

    for fn in input_metrics:
        if type(fn) == str:
            if fn in metrics_dict:
                fn_dict[fn] = metrics_dict[fn]
            else:
                warnings.warn('%s function not found' % fn)
        else:
            fn_dict['custom_' + str(cnt_custom)] = fn
            cnt_custom += 1

    if len(fn_dict.keys()) == 0:
        raise ValueError('No valid metrics found')

    return fn_dict
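
# Illustrative usage sketch (not part of the original module) of
# preprocess_metrics. The metric names and functions below are hypothetical:
#
#   from sklearn.metrics import accuracy_score
#
#   def my_metric(y_true, y_pred):
#       return (y_true == y_pred).mean()
#
#   preprocess_metrics(['accuracy', my_metric],
#                      {'accuracy': accuracy_score})
#   # {'accuracy': accuracy_score, 'custom_1': my_metric}
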
def init_corr_matrix(columns, index, fill_diag=1.):
    """Returns an n by m matrix filled with 0
    (except on the diagonal if it is a square matrix).

    Recommended for correlation matrices.

    Parameters
    ----------
    columns:
        list of column names
    index:
        list of index names
    fill_diag: float (default 1.)
        if square matrix, then set the diagonal to this value

    Returns
    -------
    pd.DataFrame
        Initialized matrix
    """
    zeros = np.zeros((len(index), len(columns)), float)
    if len(columns) == len(index):
        rng = np.arange(len(zeros))
        zeros[rng, rng] = fill_diag
    return pd.DataFrame(zeros, columns=columns, index=index)
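
# Illustrative usage sketch (not part of the original module) of
# init_corr_matrix with a square 2x2 matrix:
#
#   init_corr_matrix(columns=['a', 'b'], index=['a', 'b'])
#   #      a    b
#   # a  1.0  0.0
#   # b  0.0  1.0
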
def encode_categorical_vars(df):
    """Encodes the categorical variables of a dataframe into numerical
    (discrete) values. It uses the LabelEncoder class from scikit-learn.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to update

    Returns
    -------
    pd.DataFrame:
        Encoded dataframe
    dict:
        Encoders with feature name as key and encoder as value
    """
    cat_vars = df.select_dtypes(['object', 'category']).columns
    data_encoded = df.copy()

    # 'category' columns need 'Unknown' registered as a category
    # before missing values can be filled with it
    for var in df.select_dtypes('category').columns:
        data_encoded[var] = data_encoded[var].cat.add_categories('Unknown')
    data_encoded[cat_vars] = data_encoded[cat_vars].fillna('Unknown')

    # Use LabelEncoder for categorical columns (including target column)
    encoders = {}
    for feature in cat_vars:
        le = LabelEncoder()
        le.fit(data_encoded[feature].dropna())
        data_encoded[feature] = le.transform(data_encoded[feature])
        encoders[feature] = le

    return data_encoded, encoders
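
# Illustrative usage sketch (not part of the original module) of
# encode_categorical_vars:
#
#   df = pd.DataFrame({'color': ['red', 'blue', None], 'size': [1, 2, 3]})
#   encoded, encoders = encode_categorical_vars(df)
#   # encoded['color'] contains integer codes (missing values become 'Unknown'
#   # before encoding), encoded['size'] is unchanged, and
#   # encoders['color'].inverse_transform(encoded['color']) recovers the labels.
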
def object_has_function(obj, fn):
    # True if obj has an attribute named fn with a truthy value
    # (e.g. a bound method), False otherwise
    return bool(getattr(obj, fn, None))
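
# Illustrative usage sketch (not part of the original module) of
# object_has_function:
#
#   object_has_function([], 'append')    # True
#   object_has_function([], 'predict')   # False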