Source code for pmdarima.model_selection._split

# -*- coding: utf-8 -*-

import abc
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.utils.validation import indexable

__all__ = [
    'check_cv',
    'RollingForecastCV',
    'SlidingWindowForecastCV'
]


class BaseTSCrossValidator(BaseEstimator, metaclass=abc.ABCMeta):
    """Base class for time series cross validators

    Based on the scikit-learn base cross-validator with alterations to fit the
    time series interface. Subclasses implement
    :meth:`_iter_train_test_indices`; this class turns whatever they yield
    into the integer index arrays produced by :meth:`split`.

    Parameters
    ----------
    h : int
        The forecasting horizon. Must be at least 1.

    step : int
        The step size used by the subclass when moving from one fold to the
        next. Must be at least 1.

    Raises
    ------
    ValueError
        If ``h`` or ``step`` is less than 1.
    """
    def __init__(self, h, step):
        if h < 1:
            raise ValueError("h must be a positive value")
        if step < 1:
            raise ValueError("step must be a positive value")
        self.h = h
        self.step = step

    def split(self, y, exogenous=None):
        """Generate indices to split data into training and test sets

        Parameters
        ----------
        y : array-like, shape=(n_samples,)
            The time-series array. The number of samples is read from
            ``y.shape[0]``.

        exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
            An optional 2-d array of exogenous variables.

        Yields
        ------
        train : np.ndarray
            The training set indices for the split

        test : np.ndarray
            The test set indices for the split
        """
        y, exog = indexable(y, exogenous)
        indices = np.arange(y.shape[0])
        # Boolean masks are converted back into integer positions here.
        for train_mask, test_mask in self._iter_train_test_masks(y, exog):
            yield indices[train_mask], indices[test_mask]

    def _iter_train_test_masks(self, y, exog):
        """Generate boolean masks corresponding to the train and test sets"""
        n_samples = y.shape[0]
        for train_index, test_index in self._iter_train_test_indices(y, exog):
            # Use the builtin ``bool``: the ``np.bool`` alias was deprecated
            # in NumPy 1.20 and removed in 1.24.
            train_mask = np.zeros(n_samples, dtype=bool)
            test_mask = np.zeros(n_samples, dtype=bool)

            train_mask[train_index] = True
            test_mask[test_index] = True
            yield train_mask, test_mask

    @abc.abstractmethod
    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices"""


class RollingForecastCV(BaseTSCrossValidator):
    """Use a rolling forecast to perform cross validation

    Sometimes called "evaluation on a rolling forecasting origin" [1], this
    approach to CV incrementally grows the training size while using the
    next ``h`` future samples as the test set, e.g.:

    With h == 1::

        array([15136., 16733., 20016., 17708., 18019., 19227., 22893., 23739.])
        1st: ~~~~ tr ~~~~ tr ~~~~ te
        2nd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te
        3rd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te

    With h == 2::

        array([15136., 16733., 20016., 17708., 18019., 19227., 22893., 23739.])
        1st: ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te
        2nd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te
        3rd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te

    Parameters
    ----------
    h : int, optional (default=1)
        The forecasting horizon, or the number of steps into the future after
        the last training sample for the test set.

    step : int, optional (default=1)
        The size of step taken to increase the training sample size.

    initial : int, optional (default=None)
        The initial training size. If None, will use one third of the length
        of the time series (``n_samples // 3``, but never less than 1).

    Examples
    --------
    With a step size of one and a forecasting horizon of one, the training size
    will grow by 1 for each step, and the test index will be 1 + the last
    training index:

    >>> import pmdarima as pm
    >>> from pmdarima.model_selection import RollingForecastCV
    >>> wineind = pm.datasets.load_wineind()
    >>> cv = RollingForecastCV()
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57]), array([58]))
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57, 58]), array([59]))

    With a step size of 2 and a forecasting horizon of 4, the training size
    will grow by 2 for each step, and the test set will be the 4 samples that
    follow the last index in the training fold:

    >>> cv = RollingForecastCV(step=2, h=4)
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57]), array([58, 59, 60, 61]))
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57, 58, 59]), array([60, 61, 62, 63]))

    See Also
    --------
    SlidingWindowForecastCV

    References
    ----------
    .. [1] https://robjhyndman.com/hyndsight/tscv/
    """
    def __init__(self, h=1, step=1, initial=None):
        super().__init__(h, step)
        self.initial = initial

    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices

        Each fold trains on the first ``train_size`` samples and tests on the
        ``h`` samples that immediately follow; ``train_size`` starts at the
        initial training size and grows by ``step`` per fold.

        Raises
        ------
        ValueError
            If ``initial`` was given and is less than 1, or if
            ``initial + h`` exceeds the number of samples.
        """
        n_samples = y.shape[0]
        initial = self.initial
        step = self.step
        h = self.h

        if initial is not None:
            if initial < 1:
                raise ValueError("Initial training size must be a positive "
                                 "integer")
            elif initial + h > n_samples:
                raise ValueError("The initial training size + forecasting "
                                 "horizon would exceed the length of the "
                                 "given timeseries!")
        else:
            # if it's 1, we have another problem..
            initial = max(1, n_samples // 3)

        # Determine the number of iterations that will take place. Must
        # guarantee that the forecasting horizon will not over-index the
        # series. The "+ 1" keeps the last valid fold, i.e. the one whose
        # test window ends exactly on the final sample
        # (train_size + h == n_samples); without it that fold is dropped and
        # ``initial + h == n_samples`` yields nothing at all.
        all_indices = np.arange(n_samples)
        for train_step_size in range(0, n_samples - h - initial + 1, step):
            train_size = initial + train_step_size
            train_indices = all_indices[:train_size]
            test_indices = all_indices[train_size: train_size + h]

            yield train_indices, test_indices


class SlidingWindowForecastCV(BaseTSCrossValidator):
    """Use a sliding window to perform cross validation

    This approach to CV slides a window over the training samples while using
    several future samples as a test set. While similar to the
    :class:`RollingForecastCV`, it differs in that the train set does not grow,
    but rather shifts.

    Parameters
    ----------
    h : int, optional (default=1)
        The forecasting horizon, or the number of steps into the future after
        the last training sample for the test set.

    step : int, optional (default=1)
        The number of samples by which the start of the window advances from
        one fold to the next.

    window_size : int or None, optional (default=None)
        The size of the rolling window to use. If None, a rolling window of
        size ``max(3, n_samples // 5)`` will be used.

    Examples
    --------
    With a step size of one and a forecasting horizon of one, the window
    shifts forward by 1 for each fold (the training size stays fixed), and the
    test index will be 1 + the last training index. Notice the sliding window
    also adjusts where the training sample begins for each fold:

    >>> import pmdarima as pm
    >>> from pmdarima.model_selection import SlidingWindowForecastCV
    >>> wineind = pm.datasets.load_wineind()
    >>> cv = SlidingWindowForecastCV()
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34]), array([35]))
    >>> next(cv_generator)
    (array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
           18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
           35]), array([36]))

    With a step size of 4, a forecasting horizon of 6, and a window size of 12,
    the window will shift by 4 for each fold, and the test set will be the 6
    samples that follow the last index in the training fold:

    >>> cv = SlidingWindowForecastCV(step=4, h=6, window_size=12)
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
     array([12, 13, 14, 15, 16, 17]))
    >>> next(cv_generator)
    (array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
     array([16, 17, 18, 19, 20, 21]))

    See Also
    --------
    RollingForecastCV

    References
    ----------
    .. [1] https://robjhyndman.com/hyndsight/tscv/
    """
    def __init__(self, h=1, step=1, window_size=None):
        super().__init__(h, step)
        self.window_size = window_size

    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices

        Each fold trains on ``window_size`` consecutive samples and tests on
        the ``h`` samples that immediately follow; the window start advances
        by ``step`` per fold.

        Raises
        ------
        ValueError
            If ``window_size`` was given and is less than 1, or if
            ``window_size + h`` exceeds the number of samples.
        """
        n_samples = y.shape[0]
        window_size = self.window_size
        step = self.step
        h = self.h

        if window_size is not None:
            # A non-positive window would produce empty (0) or nonsensical
            # (negative slice bounds) training folds, so reject it up front.
            if window_size < 1:
                raise ValueError("window_size must be a positive integer")
            elif window_size + h > n_samples:
                raise ValueError("The window_size + forecasting "
                                 "horizon would exceed the length of the "
                                 "given timeseries!")
        else:
            # TODO: what's a good sane default for this?
            window_size = max(3, n_samples // 5)

        indices = np.arange(n_samples)
        # The "+ 1" keeps the last valid fold, i.e. the one whose test window
        # ends exactly on the final sample (window_end + h == n_samples);
        # without it that fold is dropped and ``window_size + h == n_samples``
        # yields nothing at all.
        for window_start in range(0, n_samples - h - window_size + 1, step):
            window_end = window_start + window_size
            train_indices = indices[window_start: window_end]
            test_indices = indices[window_end: window_end + h]
            yield train_indices, test_indices


def check_cv(cv=None):
    """Input checker utility for building a cross-validator

    Parameters
    ----------
    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of CV or None. Possible inputs:

        - None, to use a default RollingForecastCV
        - A BaseTSCrossValidator as a passthrough

    Returns
    -------
    cv : BaseTSCrossValidator
        A new default-constructed ``RollingForecastCV`` when ``cv`` is None,
        otherwise the object that was passed in.

    Raises
    ------
    TypeError
        If ``cv`` is neither None nor a ``BaseTSCrossValidator`` instance.
    """
    if cv is None:
        return RollingForecastCV()
    if not isinstance(cv, BaseTSCrossValidator):
        raise TypeError("cv should be an instance of BaseTSCrossValidator or "
                        "None, but got %r (type=%s)" % (cv, type(cv)))
    return cv