# Source code for pmdarima.model_selection._validation

# -*- coding: utf-8 -*-
"""
Cross-validation for ARIMA and pipeline estimators.
See: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_validation.py  # noqa: E501
"""

import numpy as np
import numbers
import warnings
import time
from traceback import format_exception_only

from sklearn import base
from sklearn.metrics import regression as reg
from sklearn.utils import indexable, safe_indexing

from ._split import check_cv
from .. import metrics
from ..utils import check_endog
from ..arima.warnings import ModelFitWarning

# Names exported by ``from ... import *``; the underscore-prefixed helpers
# defined below are intentionally left out.
__all__ = [
    'cross_validate',
    'cross_val_score'
]


# Registry of the string names accepted for ``scoring``, mapped to the metric
# callable each one resolves to. ``_check_scoring`` looks names up here and
# lists these keys in its error message when a name is unknown.
_valid_scoring = {
    'mean_absolute_error': reg.mean_absolute_error,
    'mean_squared_error': reg.mean_squared_error,
    'smape': metrics.smape,
}


def _check_scoring(metric):
    """Resolve ``metric`` into a scoring callable.

    Parameters
    ----------
    metric : callable or str
        A callable is returned untouched. A string is looked up in the
        module-level ``_valid_scoring`` registry.

    Returns
    -------
    scorer : callable
        The callable to use for scoring.

    Raises
    ------
    ValueError
        If ``metric`` is a string that is not a key of ``_valid_scoring``.

    TypeError
        If ``metric`` is neither callable nor a string (this includes
        ``None``).
    """
    if callable(metric):
        # user-supplied scorer: hand it back as-is
        return metric

    if not isinstance(metric, str):
        raise TypeError('expected a callable or a string, but got %r (type=%s)'
                        % (metric, type(metric)))

    try:
        scorer = _valid_scoring[metric]
    except KeyError:
        known = list(_valid_scoring.keys())
        raise ValueError('metric can be a callable or a string in %s'
                         % str(known))
    return scorer


def _safe_split(y, exog, train, test):
    """Slice ``y`` (and ``exog``, when given) into train and test partitions.

    Parameters
    ----------
    y : object exposing ``take``
        The endogenous series; indexed with ``y.take(indices)``.
        (Presumably a numpy array by the time it gets here.)

    exog : array-like or None
        Optional exogenous data, indexed via ``safe_indexing``. When None,
        both exogenous outputs are None.

    train, test : indices
        The positions belonging to the train and test folds respectively.

    Returns
    -------
    y_train, y_test, exog_train, exog_test
    """
    y_train = y.take(train)
    y_test = y.take(test)

    if exog is not None:
        return (y_train, y_test,
                safe_indexing(exog, train), safe_indexing(exog, test))

    # no exogenous data to carry along
    return y_train, y_test, None, None


def _fit_and_score(fold, estimator, y, exog, scorer, train, test, verbose,
                   error_score):
    """Fit ``estimator`` on one CV fold and score its forecast of the test set.

    Parameters
    ----------
    fold : int
        Index of the fold; only used in the progress output.

    estimator : estimator
        Fit in place with ``fit(y_train, exogenous=...)`` and then asked to
        ``predict(n_periods=len(test), exogenous=...)``.

    y : array-like
        The full endogenous series, split by ``_safe_split``.

    exog : array-like or None
        The full exogenous data, split by ``_safe_split``.

    scorer : callable
        Invoked as ``scorer(y_test, preds)``.

    train, test : indices
        Positions of the train and test observations for this fold.

    verbose : int
        Above 1, a header line is printed before fitting. Above 2, the
        score and the elapsed time are printed as well.

    error_score : 'raise' or numeric
        What to do when ``fit`` raises: re-raise for ``'raise'``, otherwise
        use this value as the fold's score and emit a ``ModelFitWarning``.
        Only failures in ``fit`` are handled this way; errors raised while
        predicting or scoring propagate.

    Returns
    -------
    test_scores, fit_time, score_time
        The fold's score and the seconds spent fitting and scoring
        (``score_time`` is 0.0 when the fit failed).
    """
    tag = 'fold=%i' % fold
    if verbose > 1:
        print("[CV] %s %s" % (tag, '.' * (64 - len(tag))))

    began = time.time()
    y_train, y_test, exog_train, exog_test = _safe_split(y, exog, train, test)

    try:
        estimator.fit(y_train, exogenous=exog_train)

    except Exception as exc:
        if error_score == 'raise':
            raise

        # swallow the failure: record the fallback score and warn about it
        fit_time = time.time() - began
        score_time = 0.0
        test_scores = error_score
        detail = format_exception_only(type(exc), exc)[0]
        warnings.warn("Estimator fit failed. The score on this train-test "
                      "partition will be set to %f. Details: \n%s"
                      % (error_score, detail),
                      ModelFitWarning)

    else:
        fit_time = time.time() - began

        # forecast as many periods as the test fold holds, then score them
        preds = estimator.predict(n_periods=len(test), exogenous=exog_test)
        test_scores = scorer(y_test, preds)
        score_time = time.time() - began - fit_time

    if verbose > 2:
        elapsed = score_time + fit_time
        print(tag + ", score=%.3f [time=%.3f sec]" % (test_scores, elapsed))

    # TODO: if we ever want train scores, we'll need to change this signature
    return test_scores, fit_time, score_time


def cross_validate(estimator, y, exogenous=None, scoring=None, cv=None,
                   verbose=0, error_score=np.nan):
    """Evaluate a metric by cross-validation and also record fit/score times.

    Parameters
    ----------
    estimator : estimator
        An estimator object that implements the ``fit`` method. It is cloned
        for every fold, so the instance passed in is never fit.

    y : array-like or iterable, shape=(n_samples,)
        The time-series array.

    exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
        An optional 2-d array of exogenous variables.

    scoring : str or callable, optional (default=None)
        The scoring metric to use. If a callable, must adhere to the signature
        ``metric(true, predicted)``. Valid string scoring metrics include:

        - 'smape'
        - 'mean_absolute_error'
        - 'mean_squared_error'

        Note that ``None`` is not mapped to a default metric: it is neither
        a callable nor a string and is rejected with a ``TypeError``, so a
        metric has to be passed explicitly.

    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of cross-validation. If None, will use a RollingForecastCV

    verbose : integer, optional
        The verbosity level.

    error_score : 'raise' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, a ModelFitWarning is issued and that
        value is recorded as the score of the failing fold.

    Returns
    -------
    scores : dict of numpy arrays
        One entry per fold in each array, under the keys:

        - ``test_score``: the score on each test fold
        - ``fit_time``: the seconds spent fitting on each train fold
        - ``score_time``: the seconds spent forecasting and scoring each
          test fold

    Raises
    ------
    ValueError
        If ``error_score`` is neither the string "raise" nor a number, if
        ``scoring`` is an unknown string, or if ``cv`` yields no splits.

    TypeError
        If ``scoring`` is neither a callable nor a string.
    """
    y, exog = indexable(y, exogenous)
    y = check_endog(y, copy=False)

    cv = check_cv(cv)
    scoring = _check_scoring(scoring)

    # validate the error score
    if not (error_score == "raise" or isinstance(error_score, numbers.Number)):
        raise ValueError('error_score should be the string "raise" or a '
                         'numeric value')

    # TODO: in the future we might consider joblib for parallelizing, but it
    #   . could cause cross threads in parallelism..

    # Each fold gets its own clone so no fitted state leaks between folds.
    results = [
        _fit_and_score(fold, base.clone(estimator), y, exog,
                       scorer=scoring,
                       train=train,
                       test=test,
                       verbose=verbose,
                       error_score=error_score)
        for fold, (train, test) in enumerate(cv.split(y, exog))]

    # Without this guard an empty split would surface as an opaque
    # "not enough values to unpack" error from the zip below.
    if not results:
        raise ValueError('cv did not produce any train/test splits for the '
                         'given data')

    scores, fit_times, score_times = zip(*results)

    return {
        'test_score': np.array(scores),
        'fit_time': np.array(fit_times),
        'score_time': np.array(score_times),
    }


def cross_val_score(estimator, y, exogenous=None, scoring=None, cv=None,
                    verbose=0, error_score=np.nan):
    """Evaluate a score by cross-validation

    Thin convenience wrapper: every argument is forwarded unchanged to
    ``cross_validate`` and only the per-fold test scores are returned.

    Parameters
    ----------
    estimator : estimator
        An estimator object that implements the ``fit`` method

    y : array-like or iterable, shape=(n_samples,)
        The time-series array.

    exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
        An optional 2-d array of exogenous variables.

    scoring : str or callable, optional (default=None)
        The scoring metric to use. If a callable, must adhere to the signature
        ``metric(true, predicted)``. Valid string scoring metrics include:

        - 'smape'
        - 'mean_absolute_error'
        - 'mean_squared_error'

    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of cross-validation. If None, will use a RollingForecastCV

    verbose : integer, optional
        The verbosity level.

    error_score : 'raise' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, a ModelFitWarning is issued instead.

    Returns
    -------
    scores : the ``'test_score'`` entry of the ``cross_validate`` result
    """
    return cross_validate(
        estimator=estimator,
        y=y,
        exogenous=exogenous,
        scoring=scoring,
        cv=cv,
        verbose=verbose,
        error_score=error_score,
    )['test_score']