import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import LabelEncoder
def find_dtype(arr, len_sample=1000):
    """Find the general dtype of an array.

    Three possible dtypes :

    - Number
    - Datetime
    - Object

    Only the first ``len_sample`` items are inspected. Values that are
    neither numeric nor already datetime-typed are tentatively parsed with
    ``pd.to_datetime``; when that succeeds and yields at least one
    non-missing timestamp, the array is reported as ``'datetime'``.

    Parameters
    ----------
    arr: array-like
        Array to inspect
    len_sample: int (default, 1000)
        Number max of items to analyse
        if len_sample > len(arr) then use len(arr)

    Returns
    -------
    str:
        dtype string ('number', 'datetime' or 'object')

    Raises
    ------
    TypeError:
        arr is not an array like
    """
    if not is_array_like(arr):
        raise TypeError('arr is not an array like')

    # Normalise every accepted input to a DataFrame so that select_dtypes
    # can be used below.
    if isinstance(arr, (list, np.ndarray)):
        frame = pd.DataFrame(arr)
    elif isinstance(arr, pd.Series):
        frame = arr.to_frame()
    else:
        frame = arr

    sample = frame.iloc[:min(len(frame), len_sample)]

    if sample.select_dtypes('number').shape[1] > 0:
        return 'number'
    if sample.select_dtypes(['datetime', 'datetimetz']).shape[1] > 0:
        return 'datetime'
    if sample.shape[1] == 0:
        # Nothing to parse (e.g. a frame built from an empty list).
        return 'object'

    # pd.to_datetime applied to a DataFrame assembles timestamps from
    # 'year'/'month'/'day' columns, so it can never succeed on a
    # single-column frame. Parse the column itself instead.
    try:
        with warnings.catch_warnings():
            # Plain text columns would otherwise emit format-inference
            # warnings for every probe.
            warnings.simplefilter('ignore')
            parsed = pd.to_datetime(sample.iloc[:, 0])
    except Exception:
        # Best-effort probe: anything unparseable is a generic object.
        return 'object'

    # An all-missing sample parses "successfully" to NaT only, which is no
    # evidence of datetime content.
    return 'datetime' if parsed.notna().any() else 'object'
def is_array_like(obj, n_dims=1):
    """Returns whether an object is an array like.

    Valid dtypes are list, np.ndarray, pd.Series, pd.DataFrame.

    An object with at most ``n_dims`` dimensions is accepted unless its
    first element is itself a list (ragged nesting). An object with more
    dimensions is accepted only when the axis at position ``n_dims`` has
    length 1. Empty inputs are accepted; 0-d arrays and ragged nested
    lists are rejected.

    Parameters
    ----------
    obj:
        Object to inspect
    n_dims: int (default 1)
        number of dimension accepted

    Returns
    -------
    bool:
        Whether the object is an array like or not
    """
    kind = type(obj)
    # Exact type match on purpose: subclasses are not treated as valid.
    if kind not in (list, np.ndarray, pd.Series, pd.DataFrame):
        return False

    if kind is list:
        try:
            values = np.array(obj)
        except ValueError:
            # Recent NumPy refuses ragged nested lists instead of building
            # an object array of lists; either way it is not array like.
            return False
    elif kind is np.ndarray:
        values = obj
    else:
        values = obj.to_numpy()

    if values.ndim > n_dims:
        return values.shape[n_dims] == 1
    if values.ndim == 0:
        # A 0-d array wraps a scalar and has no first element to inspect.
        return False
    if len(values) == 0:
        # Empty input: no element can contradict the expected shape.
        return True
    return not isinstance(values[0], list)
def preprocess_metrics(input_metrics, metrics_dict):
    """Map the requested metrics to their implementing functions.

    Each item of ``input_metrics`` is either a string or a callable.
    A string must be a key of ``metrics_dict``; unknown names are skipped
    with a warning. A callable is registered under the generated name
    ``custom_<i>`` (numbered from 1 in order of appearance). Any other
    item is skipped with a warning.

    Parameters
    ----------
    input_metrics: list
        List of metrics to compute
    metrics_dict: dict
        Dictionary to compare input_metrics with

    Returns
    -------
    dict:
        Dictionary with metric's name as key and
        metric function as value

    Raises
    ------
    TypeError:
        input_metrics must be a list
    ValueError:
        None of the requested metrics is valid
    """
    if not isinstance(input_metrics, list):
        raise TypeError('input_metrics must be a list')

    fn_dict = {}
    cnt_custom = 1
    for metric in input_metrics:
        if isinstance(metric, str):
            if metric in metrics_dict:
                fn_dict[metric] = metrics_dict[metric]
            else:
                warnings.warn('%s function not found' % metric)
        elif callable(metric):
            fn_dict['custom_' + str(cnt_custom)] = metric
            cnt_custom += 1
        else:
            # Neither a known name nor something that can be called:
            # skip it rather than registering an unusable metric.
            warnings.warn(
                '%r is neither a metric name nor a callable' % (metric,))

    if not fn_dict:
        raise ValueError('No valid metrics found')
    return fn_dict
def init_corr_matrix(columns, index, fill_diag=1.):
    """Return a len(index) by len(columns) matrix filled with 0.

    When the matrix is square, its main diagonal is set to ``fill_diag``.
    Recommended for correlation matrix.

    Parameters
    ----------
    columns:
        list of column names
    index:
        list of index names
    fill_diag: float (default 1.)
        if squared matrix, then set diagonal with this value

    Returns
    -------
    pd.DataFrame
        Initialized matrix
    """
    matrix = np.zeros((len(index), len(columns)), dtype=float)
    if len(columns) == len(index):
        np.fill_diagonal(matrix, fill_diag)
    return pd.DataFrame(matrix, columns=columns, index=index)
def encode_categorical_vars(df):
    """Encodes categorical variables from a dataframe to be numerical (discrete)

    Every ``object`` or ``category`` column is label-encoded with its own
    scikit-learn LabelEncoder. Missing values are replaced by the label
    'Unknown' before encoding. The input dataframe is not modified.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to update

    Returns
    -------
    pd.DataFrame:
        Encoded dataframe
    dict:
        Encoders with feature name on keys and
        encoder as value
    """
    cat_vars = df.select_dtypes(['object', 'category']).columns
    data_encoded = df.copy()

    # Use Label Encoder for categorical columns (including target column)
    encoders = {}
    for feature in cat_vars:
        column = data_encoded[feature]
        if (isinstance(column.dtype, pd.CategoricalDtype)
                and 'Unknown' not in column.cat.categories):
            # fillna on a categorical needs the fill value to be a declared
            # category; add_categories raises if it is already present.
            column = column.cat.add_categories('Unknown')
        column = column.fillna('Unknown')

        le = LabelEncoder()
        data_encoded[feature] = le.fit_transform(column)
        encoders[feature] = le
    return data_encoded, encoders
def object_has_function(obj, fn):
    """Returns whether an object exposes a callable attribute.

    Parameters
    ----------
    obj:
        Object to inspect
    fn: str
        Name of the attribute to look up

    Returns
    -------
    bool:
        True if ``obj`` has an attribute named ``fn`` and it is callable
    """
    # callable() rather than truthiness: a truthy data attribute is not a
    # function, and a callable may legitimately evaluate as falsy.
    return callable(getattr(obj, fn, None))