"""
Source code for pyllars.ml_utils

This module contains utilities for common machine learning tasks.

In particular, this module focuses on tasks "surrounding" machine learning,
such as cross-fold splitting, performance evaluation, etc. It does not include
helpers for use directly in :py:class:`sklearn.pipeline.Pipeline`.
"""
import logging
logger = logging.getLogger(__name__)

import collections
import itertools
import joblib
import json
import networkx as nx
import numpy as np
import pandas as pd
import sklearn
import tqdm
import warnings

import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing

from copy import deepcopy

import pyllars.collection_utils as collection_utils
import pyllars.utils as utils
import pyllars.validation_utils as validation_utils

from typing import Any, Callable, Container, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple

# Data structures

class fold_data(NamedTuple):
    """ Container for the train, validation, and test portions of a dataset

    The arrays are stored in the form used by `sklearn`. Since each split is
    held as its own array, this class is most convenient for modest-sized
    datasets; :class:`pyllars.ml_utils.split_masks` is the mask-based
    alternative.

    Attributes
    ----------
    X_{train,test,validation} : numpy.ndarray
        The `X` data (features) for the respective dataset splits

    y_{train,test,validation} : numpy.ndarray
        The `y` data (target) for the respective dataset splits

    {train,test,validation}_indices : numpy.ndarray
        The row indices from the original dataset of the respective dataset
        splits
    """
    # features and targets, in (train, test, validation) order
    X_train: np.ndarray
    y_train: np.ndarray
    X_test: np.ndarray
    y_test: np.ndarray
    X_validation: np.ndarray
    y_validation: np.ndarray

    # positions of the rows of each split in the original dataset
    train_indices: np.ndarray
    test_indices: np.ndarray
    validation_indices: np.ndarray
class split_masks(NamedTuple):
    """ Boolean masks selecting the train, validation, and test rows

    Each mask can be used to index a :py:class:`numpy.ndarray` or
    :py:class:`pandas.DataFrame` holding the complete dataset in order to
    extract the relevant split for `sklearn`. For large objects, this can be
    more appropriate than :class:`pyllars.ml_utils.fold_data` because the
    data itself is not copied.

    Attributes
    ----------
    training,validation,test : numpy.ndarray
        Boolean masks for the respective dataset splits
    """
    training: np.ndarray
    validation: np.ndarray
    test: np.ndarray
class estimators_predictions_metrics(NamedTuple):
    """ Container for fit estimators, their predictions, and their metrics

    Attributes
    ----------
    estimator_{val,test} : sklearn.base.BaseEstimator
        Estimators fit on the respective datasets.

    predictions_{val,test} : numpy.ndarray
        Predictions of the respective models.

    true_{val,test} : numpy.ndarray
        The target values stored alongside the respective predictions.

    metrics_{val,test} : typing.Dict
        Metrics for the respective datasets.

    fold_{train,val,test} : typing.Any
        The identifiers of the respective folds.

    hyperparameters{_str} : typing.Optional[typing.Dict]
        The hyperparameters (in a string format) for training the models.
    """
    # fit models
    estimator_val: sklearn.base.BaseEstimator
    estimator_test: sklearn.base.BaseEstimator

    # predicted and stored target values
    predictions_val: np.ndarray
    predictions_test: np.ndarray
    true_val: np.ndarray
    true_test: np.ndarray

    # evaluation results
    metrics_val: Dict
    metrics_test: Dict

    # fold identifiers
    fold_train: Any
    fold_val: Any
    fold_test: Any

    # model configuration
    hyperparameters: Optional[Dict]
    hyperparameters_str: str
### # Cross-validation helpers ###
def get_cv_folds(
        y:np.ndarray,
        num_splits:int=10,
        use_stratified:bool=True,
        shuffle:bool=True,
        random_state:int=8675309) -> np.ndarray:
    """ Assign a split to each row based on the values of `y`

    Parameters
    ----------
    y : numpy.ndarray
        The target variable for each row in a data frame. This is used to
        determine the stratification.

    num_splits : int
        The number of stratified splits to use

    use_stratified : bool
        Whether to use stratified cross-validation. For example, this may be
        set to False if choosing folds for regression.

    shuffle : bool
        Whether to shuffle during the split

    random_state : int
        The state for the random number generator. It is only forwarded to
        `sklearn` when `shuffle` is `True`; otherwise it is ignored.

    Returns
    -------
    splits : numpy.ndarray
        The split of each row
    """
    cv_kwargs = {
        "n_splits": num_splits,
        "shuffle": shuffle
    }

    # recent versions of sklearn raise a ValueError if a random state is
    # given while shuffle is False, so only pass it when it has an effect
    if shuffle:
        cv_kwargs["random_state"] = random_state

    if use_stratified:
        cv = sklearn.model_selection.StratifiedKFold(**cv_kwargs)
    else:
        cv = sklearn.model_selection.KFold(**cv_kwargs)

    # each row appears in the test portion of exactly one fold; that fold
    # is the split assigned to the row
    splits = np.zeros(len(y), dtype=int)
    for fold, (_, test) in enumerate(cv.split(y, y)):
        splits[test] = fold

    return splits
def get_train_val_test_splits(
        df:pd.DataFrame,
        training_splits:Optional[Set]=None,
        validation_splits:Optional[Set]=None,
        test_splits:Optional[Set]=None,
        split_field:str='split') -> split_masks:
    """ Build the training, validation, and testing masks for `df`

    Each row is assigned to a split by its value in the `split_field` column.
    Rows whose split is listed in `validation_splits` or `test_splits` go to
    the respective set. Unless `training_splits` is given, every other split
    found in the column is used for training, so the three masks together
    cover the entire dataset.

    This function does not check whether the different splits overlap. So
    care should be taken, especially if specifying the training splits
    explicitly.

    The `split_field` values do not need to be numeric. They must be
    compatible with `isin`, however.

    Parameters
    ----------
    df : pandas.DataFrame
        A data frame. It must contain a column named `split_field`, but it
        is not otherwise validated.

    training_splits : typing.Optional[typing.Set]
        The splits to use for the training set. By default, anything not in
        the `validation_splits` or `test_splits` will be placed in the
        training set.

        If given, this container must be compatible with `isin`. Otherwise,
        it will be wrapped in a set.

    {validation,test}_splits : typing.Optional[typing.Set]
        The splits to use for the validation and test sets, respectively.

        If given, this container must be compatible with `isin`. Otherwise,
        it will be wrapped in a set.

    split_field : str
        The name of the column indicating the split for each row.

    Returns
    -------
    split_masks : pyllars.ml_utils.split_masks
        Masks for the respective sets. `True` positions indicate the rows
        which belong to the respective sets. All three masks are always
        returned, but a mask may be always `False` if the given split does
        not contain any rows.
    """
    validation_ids = collection_utils.wrap_in_set(validation_splits)
    test_ids = collection_utils.wrap_in_set(test_splits)

    if training_splits is not None:
        training_ids = collection_utils.wrap_in_set(training_splits)
    else:
        # everything which is not held out is used for training
        observed_ids = set(df[split_field].unique())
        training_ids = observed_ids - validation_ids - test_ids

    row_splits = df[split_field]

    return split_masks(
        row_splits.isin(training_ids),
        row_splits.isin(validation_ids),
        row_splits.isin(test_ids)
    )
def get_fold_data(
        df:pd.DataFrame,
        target_field:str,
        m_train:np.ndarray,
        m_test:np.ndarray,
        m_validation:Optional[np.ndarray]=None,
        attribute_fields:Optional[Iterable[str]]=None,
        fields_to_ignore:Optional[Iterable[str]]=None,
        attributes_are_np_arrays:bool=False) -> fold_data:
    """ Extract `sklearn`-ready arrays from `df` for the given splits

    **N.B.** This function creates copies of the data, so it is not
    appropriate for very large datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        A data frame

    target_field : str
        The name of the column containing the target variable

    m_{train,test,validation} : np.ndarray
        Boolean masks indicating the training, testing, and validation set
        rows. If `m_validation` is `None` (default) or does not select any
        rows, then the validation members of the result are `None`.

    attribute_fields : typing.Optional[typing.Iterable[str]]
        The names of the columns to use for attributes (that is, `X`). If
        `None` (default), then all columns except the `target_field` will be
        used as attributes.

    fields_to_ignore : typing.Optional[typing.Container[str]]
        The names of the columns to ignore.

    attributes_are_np_arrays : bool
        Whether to stack the values from the individual rows. This should be
        set to `True` when some of the columns in `attribute_fields` contain
        numpy arrays.

    Returns
    -------
    fold_data : pyllars.ml_utils.fold_data
        A named tuple with the given splits
    """
    caller = 'ml_utils.get_fold_data' # for validation

    # default to every column except the target
    if attribute_fields is None:
        attribute_fields = df.columns.values.tolist()
        attribute_fields.remove(target_field)

    # work on a copy so the argument is left untouched
    attribute_fields = deepcopy(attribute_fields)

    if fields_to_ignore is not None:
        # make sure to wrap strings, etc., so they behave as expected
        fields_to_ignore = collection_utils.wrap_in_set(fields_to_ignore)
        attribute_fields = collection_utils.list_remove_list(
            attribute_fields, fields_to_ignore)

    validation_utils.validate_is_sequence(
        attribute_fields,
        name='attribute_fields',
        caller=caller
    )

    if len(attribute_fields) == 1:
        msg = ("[{}]: found a single attribute field. Treating as a "
            "single column".format(caller))
        logger.warning(msg)
        attribute_fields = attribute_fields[0]

    # training and testing sets
    X_train = df.loc[m_train, attribute_fields].values
    X_test = df.loc[m_test, attribute_fields].values

    if attributes_are_np_arrays:
        X_train = np.stack(X_train)
        X_test = np.stack(X_test)

    y_train = df.loc[m_train, target_field].values
    y_test = df.loc[m_test, target_field].values

    train_indices = np.where(m_train)[0]
    test_indices = np.where(m_test)[0]

    # the validation set is optional; a mask which selects no rows is
    # treated the same as no mask at all
    X_val, y_val, val_indices = None, None, None
    has_validation = (m_validation is not None) and (np.sum(m_validation) > 0)

    if has_validation:
        X_val = df.loc[m_validation, attribute_fields].values
        y_val = df.loc[m_validation, target_field].values
        val_indices = np.where(m_validation)[0]

        if attributes_are_np_arrays:
            X_val = np.stack(X_val)

    return fold_data(
        X_train, y_train,
        X_test, y_test,
        X_val, y_val,
        train_indices, test_indices, val_indices
    )
### # Training helpers ###
[docs]def _train_and_evaluate( estimator, X_train, y_train, X_test, y_test, target_transform, target_inverse_transform, collect_metrics, collect_metrics_kwargs, use_predict_proba): """ Train and evaluate `estimator` on the given datasets This function is a helper for `evaluate_hyperparameters`. It is not intended for external use. """ # transform the target, if necessary if target_transform is not None: y_train = target_transform(y_train) # train the estimator estimator_fit =, y_train) # make predictions if use_predict_proba: y_pred = estimator_fit.predict_proba(X_test) else: y_pred = estimator_fit.predict(X_test) # transform back, if needed if target_inverse_transform is not None: y_pred = target_inverse_transform(y_pred) # evaluate metrics = collect_metrics( y_test, y_pred, **collect_metrics_kwargs ) return estimator_fit, y_pred, metrics
def evaluate_hyperparameters(
        estimator_template:sklearn.base.BaseEstimator,
        hyperparameters:Dict,
        validation_folds:Any,
        test_folds:Any,
        data:pd.DataFrame,
        collect_metrics:Callable,
        use_predict_proba:bool=False,
        train_folds:Optional[Any]=None,
        split_field:str='fold',
        target_field:str='target',
        target_transform:Optional[Callable]=None,
        target_inverse_transform:Optional[Callable]=None,
        collect_metrics_kwargs:Optional[Dict]=None,
        attribute_fields:Optional[Iterable[str]]=None,
        fields_to_ignore:Optional[Container[str]]=None,
        attributes_are_np_arrays:bool=False) -> estimators_predictions_metrics:
    """ Evaluate `hyperparameters` on the given validation and test folds

    **N.B.** This function is not particularly efficient with creating
    copies of data.

    This function performs the following steps:

    0. Create `estimator_val` and `estimator_test` based on
       `estimator_template` and `hyperparameters`
    1. Split `data` into `train`, `val`, `test` based on `validation_folds`
       and `test_folds`
    2. Transform `target_field` using the `target_transform` function
    3. Train `estimator_val` using `train`
    4. Evaluate the trained `estimator_val` on `val` using `collect_metrics`
    5. Train `estimator_test` using both `train` and `val`
    6. Evaluate the trained `estimator_test` on `test` using
       `collect_metrics`

    Parameters
    ----------
    estimator_template : sklearn.base.BaseEstimator
        The template for creating the `estimator`.

    hyperparameters : typing.Dict
        The hyperparameters for the model. These should be compatible with
        `estimator_template.set_params`. They are also serialized with
        :py:func:`json.dumps`.

    validation_folds : typing.Any
        The fold(s) to use for validation. The validation fold will be
        selected based on `isin`. If `validation_folds` is not a container,
        it will be cast as one.

    test_folds : typing.Any
        The fold(s) to use for testing. The test fold will be selected based
        on `isin`. If `test_folds` is not a container, it will be cast as
        one.

    data : pandas.DataFrame
        The data.

    collect_metrics : typing.Callable
        The function for evaluating the model performance. It should have at
        least two arguments, `y_true` and `y_pred`, in that order. This
        function will eventually return whatever this function returns.

    use_predict_proba : bool
        Whether to use `predict` (when `False`, the default) or
        `predict_proba` on the trained model.

    train_folds : typing.Optional[typing.Any]
        The fold(s) to use for training. If not given, the training fold
        will be taken as all rows in `data` which are not part of the
        validation or testing set.

    split_field : str
        The name of the column with the fold identifiers

    target_field : str
        The name of the column with the target value

    target_transform : typing.Optional[typing.Callable]
        A function for transforming the target before training models.
        Example: :py:func:`numpy.log1p`

    target_inverse_transform : typing.Optional[typing.Callable]
        A function for transforming model predictions back to the original
        domain. This should be a mathematical inverse of `target_transform`.
        Example: :py:func:`numpy.expm1` is the inverse of
        :py:func:`numpy.log1p`.

    collect_metrics_kwargs : typing.Optional[typing.Dict]
        Additional keyword arguments for `collect_metrics`.

    attribute_fields : typing.Optional[typing.Iterable[str]]
        The names of the columns to use for attributes (that is, `X`). If
        `None` (default), then all columns except the `target_field` will be
        used as attributes.

    fields_to_ignore : typing.Optional[typing.Container[str]]
        The names of the columns to ignore. `split_field` is always ignored
        in addition to these.

    attributes_are_np_arrays : bool
        Whether to stack the values from the individual rows. This should be
        set to `True` when some of the columns in `attribute_fields` contain
        numpy arrays.

    Returns
    -------
    estimators_predictions_metrics : typing.NamedTuple
        The fit estimators, predictions on the respective datasets, and
        results from `collect_metrics`.
    """
    ###
    # Based on the template of our estimator pipeline template
    # and hyperparameters, create a concrete estimator with the
    # specified hyperparameters.
    ###
    estimator_val = sklearn.clone(estimator_template)
    estimator_val = estimator_val.set_params(**hyperparameters)

    estimator_test = sklearn.clone(estimator_template)
    estimator_test = estimator_test.set_params(**hyperparameters)

    ###
    # Split `data` into `train`, `val`, `test` based on
    # `validation_folds` and `test_folds`.
    ###
    masks = get_train_val_test_splits(
        df=data,
        training_splits=train_folds,
        validation_splits=validation_folds,
        test_splits=test_folds,
        split_field=split_field
    )

    ###
    # Create the data matrices necessary for the various
    # sklearn operations we will perform later.
    ###

    # build a fresh list so the caller's container is never modified and so
    # sets, tuples, or a single column name are all accepted
    if fields_to_ignore is None:
        ignored_fields = []
    elif isinstance(fields_to_ignore, str):
        ignored_fields = [fields_to_ignore]
    else:
        ignored_fields = list(fields_to_ignore)

    # the fold identifier is never a feature
    ignored_fields.append(split_field)

    val_fold_data = get_fold_data(
        df=data,
        target_field=target_field,
        m_train=masks.training,
        m_test=masks.test,
        m_validation=masks.validation,
        attribute_fields=attribute_fields,
        fields_to_ignore=ignored_fields,
        attributes_are_np_arrays=attributes_are_np_arrays
    )

    if collect_metrics_kwargs is None:
        collect_metrics_kwargs = dict()

    # get the validation performance
    estimator_val_fit, y_val, metrics_val = _train_and_evaluate(
        estimator_val,
        X_train=val_fold_data.X_train,
        y_train=val_fold_data.y_train,
        X_test=val_fold_data.X_validation,
        y_test=val_fold_data.y_validation,
        target_transform=target_transform,
        target_inverse_transform=target_inverse_transform,
        collect_metrics=collect_metrics,
        collect_metrics_kwargs=collect_metrics_kwargs,
        use_predict_proba=use_predict_proba
    )

    # for predictions on the test set, we will train on
    # both the training and validation sets
    X_train = np.concatenate(
        [val_fold_data.X_train, val_fold_data.X_validation]
    )
    y_train = np.concatenate(
        [val_fold_data.y_train, val_fold_data.y_validation]
    )

    # get the testing performance
    estimator_test_fit, y_test, metrics_test = _train_and_evaluate(
        estimator_test,
        X_train=X_train,
        y_train=y_train,
        X_test=val_fold_data.X_test,
        y_test=val_fold_data.y_test,
        target_transform=target_transform,
        target_inverse_transform=target_inverse_transform,
        collect_metrics=collect_metrics,
        collect_metrics_kwargs=collect_metrics_kwargs,
        use_predict_proba=use_predict_proba
    )

    hyperparameters_str = json.dumps(hyperparameters)

    # `true_val` must line up with `predictions_val`, which are made for
    # the validation rows (not the training rows)
    ret = estimators_predictions_metrics(
        estimator_val=estimator_val_fit,
        estimator_test=estimator_test_fit,
        predictions_val=y_val,
        predictions_test=y_test,
        true_val=val_fold_data.y_validation,
        true_test=val_fold_data.y_test,
        metrics_val=metrics_val,
        metrics_test=metrics_test,
        fold_train=train_folds,
        fold_val=validation_folds,
        fold_test=test_folds,
        hyperparameters=hyperparameters,
        hyperparameters_str=hyperparameters_str,
    )

    return ret
### # Evaluation helpers ###
def collect_regression_metrics(
        y_true : np.ndarray,
        y_pred : np.ndarray,
        prefix:str = "") -> Dict:
    """ Compute a standard set of regression metrics for the predictions

    Parameters
    ----------
    y_true : numpy.ndarray
        The true value of each instance

    y_pred : numpy.ndarray
        The prediction for each instance

    prefix : str
        An optional prefix for the keys in the `metrics` dictionary

    Returns
    -------
    metrics: typing.Dict
        A mapping from the metric name to the respective value. Currently,
        the following metrics are included:

        * :py:func:`sklearn.metrics.explained_variance_score`
        * :py:func:`sklearn.metrics.mean_absolute_error`
        * :py:func:`sklearn.metrics.mean_squared_error`
        * :py:func:`sklearn.metrics.median_absolute_error`
        * :py:func:`sklearn.metrics.r2_score`
    """
    validation_utils.validate_equal_shape(y_true, y_pred)

    # (key suffix, metric function) in the order they are reported
    metric_functions = (
        ("explained_variance", sklearn.metrics.explained_variance_score),
        ("mean_absolute_error", sklearn.metrics.mean_absolute_error),
        ("mean_squared_error", sklearn.metrics.mean_squared_error),
        ("median_absolute_error", sklearn.metrics.median_absolute_error),
        ("r2", sklearn.metrics.r2_score),
    )

    return {
        "{}{}".format(prefix, name): metric(y_true, y_pred)
        for name, metric in metric_functions
    }
def collect_multiclass_classification_metrics(
        y_true : np.ndarray,
        y_score : np.ndarray,
        prefix:str = "") -> Dict:
    """ Compute a standard set of multi-class classification metrics

    Parameters
    ----------
    y_true : numpy.ndarray
        The true label of each instance. The labels are assumed to be
        encoded with integers [0, 1, ... n_classes-1]. The respective columns
        in `y_score` should give the scores of the matching label.

        This should have shape (n_samples,).

    y_score : numpy.ndarray
        The score predictions for each class, e.g., from `pred_proba`,
        though they are not required to be probabilities.

        This should have shape (n_samples, n_classes).

    prefix : str
        An optional prefix for the keys in the `metrics` dictionary

    Returns
    -------
    metrics : typing.Dict
        A mapping from the metric name to the respective value. Currently,
        the following metrics are included:

        * :py:func:`sklearn.metrics.cohen_kappa_score`
        * :py:func:`sklearn.metrics.accuracy_score`
        * :py:func:`sklearn.metrics.f1_score` (micro)
        * :py:func:`sklearn.metrics.f1_score` (macro)
        * :py:func:`sklearn.metrics.hamming_loss`
        * :py:func:`sklearn.metrics.precision_score` (micro)
        * :py:func:`sklearn.metrics.precision_score` (macro)
        * :py:func:`sklearn.metrics.recall_score` (micro)
        * :py:func:`sklearn.metrics.recall_score` (macro)
        * :py:func:`pyllars.ml_utils.calc_hand_and_till_m_score`
        * :py:func:`pyllars.ml_utils.calc_provost_and_domingos_auc`
    """
    # the hard prediction is the class with the highest score
    y_pred = np.argmax(y_score, axis=1)

    # collect everything under unprefixed names first
    unprefixed = {
        "cohen_kappa":
            sklearn.metrics.cohen_kappa_score(y_true, y_pred),
        "accuracy":
            sklearn.metrics.accuracy_score(y_true, y_pred),
        "micro_f1_score":
            sklearn.metrics.f1_score(y_true, y_pred, average='micro'),
        "macro_f1_score":
            sklearn.metrics.f1_score(y_true, y_pred, average='macro'),
        "hamming_loss":
            sklearn.metrics.hamming_loss(y_true, y_pred),
        "micro_precision":
            sklearn.metrics.precision_score(y_true, y_pred, average='micro'),
        "macro_precision":
            sklearn.metrics.precision_score(y_true, y_pred, average='macro'),
        "micro_recall":
            sklearn.metrics.recall_score(y_true, y_pred, average='micro'),
        "macro_recall":
            sklearn.metrics.recall_score(y_true, y_pred, average='macro'),
        "hand_and_till_m_score":
            calc_hand_and_till_m_score(y_true, y_score),
        "provost_and_domingos_auc":
            calc_provost_and_domingos_auc(y_true, y_score),
    }

    # and then attach the prefix to every key
    return {
        "{}{}".format(prefix, name): value
        for name, value in unprefixed.items()
    }
def collect_binary_classification_metrics(
        y_true:np.ndarray,
        y_probas_pred:np.ndarray,
        threshold:float=0.5,
        pos_label=1,
        k:int=10,
        include_roc_curve:bool=True,
        include_pr_curve:bool=True,
        prefix:str = "") -> Dict:
    """ Collect various binary classification performance metrics for the
    predictions

    Parameters
    ----------
    y_true: numpy.ndarray
        The true class of each instance.

        This should have shape (n_samples,).

    y_probas_pred: numpy.ndarray
        The score of each prediction for each instance.

        This should have shape (n_samples, 2).

    threshold: float
        The score threshold to choose "positive" predictions

    pos_label: str or int
        The "positive" class for some metrics. **N.B.** This value is also
        used as the column index into `y_probas_pred` to select the scores
        of the positive class.

    k : int
        The value of `k` to use for `precision_at_k`

    include_roc_curve : bool
        Whether to include the fpr and tpr points necessary to draw a roc
        curve

    include_pr_curve : bool
        Whether to include details on the precision-recall curve

    prefix : str
        An optional prefix for the keys in the `metrics` dictionary

    Returns
    -------
    metrics: dict
        A mapping from the metric name to the respective value. Currently,
        the following metrics are included:

        * :py:func:`sklearn.metrics.cohen_kappa_score`
        * :py:func:`sklearn.metrics.hinge_loss`
        * :py:func:`sklearn.metrics.matthews_corrcoef`
        * :py:func:`sklearn.metrics.accuracy_score`
        * :py:func:`sklearn.metrics.f1_score` (binary)
        * :py:func:`sklearn.metrics.f1_score` (macro)
        * :py:func:`sklearn.metrics.f1_score` (micro)
        * :py:func:`sklearn.metrics.hamming_loss`
        * :py:func:`sklearn.metrics.jaccard_score`
        * :py:func:`sklearn.metrics.log_loss`
        * :py:func:`sklearn.metrics.precision_score` (binary)
        * :py:func:`sklearn.metrics.precision_score` (macro)
        * :py:func:`sklearn.metrics.precision_score` (micro)
        * :py:func:`sklearn.metrics.recall_score` (binary)
        * :py:func:`sklearn.metrics.recall_score` (macro)
        * :py:func:`sklearn.metrics.recall_score` (micro)
        * :py:func:`sklearn.metrics.zero_one_loss`
        * :py:func:`sklearn.metrics.average_precision_score` (macro)
        * :py:func:`sklearn.metrics.average_precision_score` (micro)
        * :py:func:`sklearn.metrics.roc_auc_score` (macro)
        * :py:func:`sklearn.metrics.roc_auc_score` (micro)
        * :py:func:`pyllars.ml_utils.precision_at_k`
        * `auprc`: area under the PR curve
        * `minpse`: See [Harutyunyan et al., 2019] for details
        * `roc_` {`fpr`, `tpr`, `thresholds`}:
          :py:func:`sklearn.metrics.roc_curve`
        * `pr_` {`precisions`, `recalls`, `thresholds`}:
          :py:func:`sklearn.metrics.precision_recall_curve`

    Raises
    ------
    ValueError
        If `y_true` and `y_probas_pred` do not have the same number of rows,
        or if `y_probas_pred` does not have exactly two columns.
    """
    # first, validate the input
    if y_true.shape[0] != y_probas_pred.shape[0]:
        msg = ("[ml_utils.collect_binary_classification_metrics]: y_true "
            "and y_probas_pred do not have matching shapes. y_true: {}, "
            "y_probas_pred: {}".format(y_true.shape, y_probas_pred.shape))
        raise ValueError(msg)

    # also catch 1-dimensional scores here rather than with an IndexError
    if (y_probas_pred.ndim != 2) or (y_probas_pred.shape[1] != 2):
        msg = ("[ml_utils.collect_binary_classification_metrics]: "
            "y_probas_pred does not have scores for exactly two classes: "
            "y_probas_pred.shape: {}".format(y_probas_pred.shape))
        raise ValueError(msg)

    # first, pull out the probability of positive classes
    y_score = y_probas_pred[:,pos_label]

    # and then make a hard prediction
    y_pred = (y_score >= threshold)

    # keep the PR-curve thresholds under their own name so they cannot be
    # confused with (or overwritten by) the ROC-curve thresholds below
    precisions, recalls, pr_thresholds = (
        sklearn.metrics.precision_recall_curve(y_true, y_score)
    )
    auprc = sklearn.metrics.auc(recalls, precisions)
    minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)])

    # now collect all statistics
    ret = {
        "cohen_kappa": sklearn.metrics.cohen_kappa_score(y_true, y_pred),
        "hinge_loss": sklearn.metrics.hinge_loss(y_true, y_score),
        "matthews_corrcoef": sklearn.metrics.matthews_corrcoef(y_true, y_pred),
        "accuracy": sklearn.metrics.accuracy_score(y_true, y_pred),
        "binary_f1_score": sklearn.metrics.f1_score(y_true, y_pred,
            average='binary', pos_label=pos_label),
        "micro_f1_score": sklearn.metrics.f1_score(y_true, y_pred,
            average='micro', pos_label=pos_label),
        "macro_f1_score": sklearn.metrics.f1_score(y_true, y_pred,
            average='macro', pos_label=pos_label),
        "hamming_loss": sklearn.metrics.hamming_loss(y_true, y_pred),
        "jaccard_score": sklearn.metrics.jaccard_score(
            y_true, y_pred),
        "log_loss": sklearn.metrics.log_loss(y_true, y_probas_pred),
        "micro_precision": sklearn.metrics.precision_score(y_true, y_pred,
            average='micro', pos_label=pos_label),
        "binary_precision": sklearn.metrics.precision_score(y_true, y_pred,
            average='binary', pos_label=pos_label),
        "macro_precision": sklearn.metrics.precision_score(y_true, y_pred,
            average='macro', pos_label=pos_label),
        "micro_recall": sklearn.metrics.recall_score(y_true, y_pred,
            average='micro', pos_label=pos_label),
        "macro_recall": sklearn.metrics.recall_score(y_true, y_pred,
            average='macro', pos_label=pos_label),
        "binary_recall": sklearn.metrics.recall_score(y_true, y_pred,
            average='binary', pos_label=pos_label),
        "zero_one_loss": sklearn.metrics.zero_one_loss(y_true, y_pred),
        "micro_average_precision": sklearn.metrics.average_precision_score(
            y_true, y_score, average='micro'),
        "macro_average_precision": sklearn.metrics.average_precision_score(
            y_true, y_score, average='macro'),
        "micro_roc_auc_score": sklearn.metrics.roc_auc_score(y_true, y_score,
            average='micro'),
        "macro_roc_auc_score": sklearn.metrics.roc_auc_score(y_true, y_score,
            average='macro'),
        "precision_at_k": precision_at_k(y_true, y_score, k, pos_label),
        "auprc": auprc,
        "minpse": minpse
    }

    if include_roc_curve:
        fpr, tpr, roc_thresholds = sklearn.metrics.roc_curve(y_true, y_score)
        ret['roc_fpr'] = fpr
        ret['roc_tpr'] = tpr
        ret['roc_thresholds'] = roc_thresholds

    if include_pr_curve:
        ret['pr_precisions'] = precisions
        ret['pr_recalls'] = recalls
        ret['pr_thresholds'] = pr_thresholds

    # add the prefix, if given
    if len(prefix) > 0:
        ret = {
            "{}{}".format(prefix, key): value
                for key, value in ret.items()
        }

    return ret
def precision_at_k(y_true, y_score, k=10, pos_label=1):
    """ Precision at rank k

    The `k` highest-scoring instances are retrieved, and the number of
    positives among them is divided by ``min(n_pos, k)``, where `n_pos` is
    the total number of positives in `y_true`. Thus, the best achievable
    score is always 1.0.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).

    y_score : array-like, shape = [n_samples]
        Predicted scores.

    k : int
        Rank.

    pos_label : int
        The label for "positive" instances

    Returns
    -------
    precision @k : float
        The precision at rank `k`. This is 0.0 when `y_true` contains no
        positive instances or when `k` is not positive, since nothing
        relevant can be retrieved in either case.
    """
    # plain lists do not compare element-wise, so make sure we have arrays
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # how many positives in total?
    n_pos = int(np.sum(y_true == pos_label))

    # avoid dividing by zero (or by a negative rank) below
    if (n_pos == 0) or (k <= 0):
        return 0.0

    # pull out the top-k according to score
    order = np.argsort(y_score)[::-1]
    top_k = np.take(y_true, order[:k])

    # how many positives were retrieved?
    n_relevant = int(np.sum(top_k == pos_label))

    # Divide by min(n_pos, k) such that the best achievable score is always
    # 1.0.
    return float(n_relevant) / min(n_pos, k)
[docs]def _calc_hand_and_till_a_value(y_true:np.ndarray, y_score:np.ndarray, i:int, j:int) -> float: """ Calculate the :math:`\hat{A}` value in Equation (3) of [1]_. Specifically; .. math:: \\hat{A}(i|j) = \\frac{ S_i - n_i*(n_i + 1)/2 }{n_i * n_j}, where :math:`n_i`, :math:`n_j` are the count of instances of the respective classes and :math:`S_i` is the (base-1) sum of the ranks of class :math:`i`. Parameters ---------- y_true : numpy.ndarray The true label of each instance. The labels are assumed to be encoded with integers [0, 1, ... n_classes-1]. The respective columns in `y_score` should give the probabilities of the matching label. This should have shape (n_samples,). y_score : numpy.ndarray The score predictions for each class, e.g., from `pred_proba`, though they are not required to be probabilities. This should have shape (n_samples, n_classes). {i,j} : int The class indices Returns ------- a_hat : float The :math:`\hat{A}` value from Equation (3) referenced above. Specifically, this is the probability that a randomly drawn member of class :math:`j` will have a lower estimated score for belonging to class :math:`i` than a randomly drawn member of class :math:`i`. References ---------- .. [1] Hand, D. & Till, R. A Simple Generalisation of the Area Under the ROC Curve for Multiple Class Classification Problems. Machine Learning, 2001, 45, 171-186. `Springer link <>`_. 
""" # so first pull out all elements of class i or j m_j = (y_true == j) m_i = (y_true == i) m_ij = (m_i | m_j) y_true_ij = y_true[m_ij] y_score_ij = y_score[m_ij] # count them n_i = np.sum(m_i) n_j = np.sum(m_j) # likelihood of class i y_score_i_ij = zip(y_true_ij, y_score_ij[:,i]) # rank the instances sorted_c_pi = np.array(sorted(y_score_i_ij, key=lambda a: a[1])) # sum the ranks for class i # first, find where the class_i's are m_ci = sorted_c_pi[:,0] == i # ranks are base-1, so add 1 ci_ranks = np.where(m_ci)[0] + 1 s_i = np.sum(ci_ranks) a_i_given_j = s_i - n_i * (n_i + 1)/2 a_i_given_j /= (n_i * n_j) return a_i_given_j
def calc_hand_and_till_m_score(y_true:np.ndarray, y_score:np.ndarray) -> float:
    """ Calculate the (multi-class AUC) :math:`M` score from Equation (7) of
    Hand and Till (2001).

    This is typically taken as a good multi-class extension of the AUC
    score. Please see [2]_ for more details about this score in particular
    and [3]_ for multi-class AUC in general.

    **N.B.** In case y_score contains any `np.nan` values, those rows will
    be removed before calculating the :math:`M` score. Classes which no
    longer have any instances after this are treated like unobserved labels.

    **N.B.** This function *can* handle unobserved labels, except for the
    label with the highest index. In particular,
    ``y_score.shape[1] != np.max(np.unique(y_true)) + 1`` causes an error.
    Pairs which involve an unobserved label do not contribute to the sum,
    but the normalization still uses the full number of classes.

    Parameters
    ----------
    y_true: numpy.ndarray
        The true label of each instance. The labels are assumed to be
        encoded with integers [0, 1, ... n_classes-1]. The respective
        columns in y_score should give the scores of the matching label.

        This should have shape (n_samples,).

    y_score: numpy.ndarray
        The score predictions for each class, e.g., from `pred_proba`,
        though they are not required to be probabilities.

        This should have shape (n_samples, n_classes).

    Returns
    -------
    m : float
        The "multi-class AUC" score referenced above. This is `nan` when
        fewer than two classes are expected, since there are no class pairs
        to compare.

    Raises
    ------
    ValueError
        If `y_true` and `y_score` do not have the same number of rows, or if
        the number of columns of `y_score` does not match the largest label
        in `y_true`.

    See Also
    --------
    _calc_hand_and_till_a_value : for calculating the :math:`\\hat{A}` value

    References
    ----------
    .. [2] Hand, D. & Till, R. A Simple Generalisation of the Area Under the
        ROC Curve for Multiple Class Classification Problems. Machine
        Learning, 2001, 45, 171-186.

    .. [3] Fawcett, T. An introduction to ROC analysis. Pattern Recognition
        Letters, 2006, 27, 861 - 874.
    """
    classes = np.unique(y_true)

    # make sure the classes are integers, or we will have problems indexing
    classes = np.array(classes, dtype=int)

    num_classes = np.max(classes)+1

    # first, validate our input
    if y_true.shape[0] != y_score.shape[0]:
        msg = ("[ml_utils.m_score]: y_true and y_score do not have matching "
            "shapes. y_true: {}, y_score: {}".format(y_true.shape,
            y_score.shape))
        raise ValueError(msg)

    if y_score.shape[1] != (num_classes):
        msg = ("[ml_utils.m_score]: y_score does not have the expected "
            "number of columns based on the maximum observed class in y_true. "
            "y_score.shape: {}. expected number of columns: {}".format(
            y_score.shape, num_classes))
        raise ValueError(msg)

    # the normalization below divides by c*(c-1)
    if num_classes < 2:
        return float("nan")

    # clear out the np.nan's
    m_nan = np.any(np.isnan(y_score), axis=1)
    y_score = y_score[~m_nan]
    y_true = y_true[~m_nan]

    # removing rows may have removed every instance of some class, so only
    # pair up the classes which are still present
    observed_classes = np.array(np.unique(y_true), dtype=int)

    # the specific equation is:
    #
    # M = \frac{2}{c*(c-1)}*\sum_{i<j} {\hat{A}(i,j)},
    #
    # where \hat{A}(i,j) is \frac{A(i|j) + A(j|i)}{2}
    ij_pairs = itertools.combinations(observed_classes, 2)

    m = 0
    for i, j in ij_pairs:
        a_ij = _calc_hand_and_till_a_value(y_true, y_score, i, j)
        a_ji = _calc_hand_and_till_a_value(y_true, y_score, j, i)

        m += (a_ij + a_ji) / 2

    m_1 = num_classes * (num_classes - 1)
    m_1 = 2 / m_1
    m = m_1 * m

    return m
def calc_provost_and_domingos_auc(y_true:np.ndarray, y_score:np.ndarray) -> float:
    """ Calculate the prevalence-weighted multi-class AUC of Provost and
    Domingos (2000).

    For each observed class, the one-vs-rest AUC of its score column is
    weighted by the fraction of instances in `y_true` with that label, and
    the weighted values are summed. Please see [4]_ for more details about
    this score in particular and [5]_ for multi-class AUC in general.

    N.B. This function *can* handle unobserved labels, except for the label
    with the highest index. In particular,
    ``y_score.shape[1] != np.max(np.unique(y_true)) + 1`` causes an error.

    Parameters
    ----------
    y_true : numpy.ndarray
        The true label of each instance. The labels are assumed to be
        encoded with integers [0, 1, ... n_classes-1]. The respective
        columns in y_score should give the scores of the matching label.

        This should have shape (n_samples,).

    y_score : numpy.ndarray
        The score predictions for each class, e.g., from `pred_proba`,
        though they are not required to be probabilities.

        This should have shape (n_samples, n_classes).

    Returns
    -------
    m : float
        The "multi-class AUC" score referenced above

    References
    ----------
    .. [4] Provost, F. & Domingos, P. Well-Trained PETs: Improving
        Probability Estimation Trees. Stern School of Business, NYU, 2000.

    .. [5] Fawcett, T. An introduction to ROC analysis. Pattern Recognition
        Letters, 2006, 27, 861 - 874.
    """
    # integer labels are needed to index the columns of y_score
    classes = np.array(np.unique(y_true), dtype=int)
    num_classes = np.max(classes)+1

    # first, validate our input
    num_labels, num_score_rows = y_true.shape[0], y_score.shape[0]
    if num_labels != num_score_rows:
        msg = ("[ml_utils.m_score]: y_true and y_score do not have matching "
            "shapes. y_true: {}, y_score: {}".format(y_true.shape,
            y_score.shape))
        raise ValueError(msg)

    if y_score.shape[1] != (num_classes):
        msg = ("[ml_utils.m_score]: y_score does not have the expected "
            "number of columns based on the maximum observed class in y_true. "
            "y_score.shape: {}. expected number of columns: {}".format(
            y_score.shape, num_classes))
        raise ValueError(msg)

    m = 0

    for c in classes:
        # one-vs-rest labels for this class, and its prevalence
        is_class_c = (y_true == c)
        prevalence = np.sum(is_class_c) / len(y_true)

        # drop the instances without a score for this class
        class_scores = y_score[:,c]
        has_score = ~np.isnan(class_scores)

        class_auc = sklearn.metrics.roc_auc_score(
            is_class_c[has_score],
            class_scores[has_score]
        )

        m += class_auc * prevalence

    return m