# -*- coding: utf-8 -*-
import abc
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import indexable
__all__ = [
'check_cv',
'RollingForecastCV',
'SlidingWindowForecastCV'
]
class BaseTSCrossValidator(BaseEstimator, metaclass=abc.ABCMeta):
    """Base class for time series cross validators

    Based on the scikit-learn base cross-validator with alterations to fit
    the time series interface.

    Parameters
    ----------
    h : int
        The forecasting horizon (number of test samples per fold). Must be
        a positive integer.

    step : int
        The step size by which each successive fold advances. Must be a
        positive integer.

    Raises
    ------
    ValueError
        If ``h`` or ``step`` is less than 1.
    """
    def __init__(self, h, step):
        if h < 1:
            raise ValueError("h must be a positive value")
        if step < 1:
            raise ValueError("step must be a positive value")
        self.h = h
        self.step = step

    def split(self, y, exogenous=None):
        """Generate indices to split data into training and test sets

        Parameters
        ----------
        y : array-like or iterable, shape=(n_samples,)
            The time-series array.

        exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
            An optional 2-d array of exogenous variables.

        Yields
        ------
        train : np.ndarray
            The training set indices for the split

        test : np.ndarray
            The test set indices for the split
        """
        y, exog = indexable(y, exogenous)
        # len() rather than .shape[0] so plain lists (which `indexable`
        # passes through unchanged) are supported as well as ndarrays.
        indices = np.arange(len(y))
        for train_index, test_index in self._iter_train_test_masks(y, exog):
            # Convert the boolean masks into positional index arrays
            train_index = indices[train_index]
            test_index = indices[test_index]
            yield train_index, test_index

    def _iter_train_test_masks(self, y, exog):
        """Generate boolean masks corresponding to train/test sets"""
        n_samples = len(y)
        for train_index, test_index in self._iter_train_test_indices(y, exog):
            # NOTE: np.bool was removed in NumPy 1.24; the builtin `bool`
            # is the supported spelling of the boolean dtype.
            train_mask = np.zeros(n_samples, dtype=bool)
            test_mask = np.zeros(n_samples, dtype=bool)
            train_mask[train_index] = True
            test_mask[test_index] = True
            yield train_mask, test_mask

    @abc.abstractmethod
    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices"""
class RollingForecastCV(BaseTSCrossValidator):
    """Use a rolling forecast to perform cross validation

    Sometimes called "evaluation on a rolling forecasting origin" [1], this
    approach to CV incrementally grows the training size while using a single
    future sample as a test sample, e.g.:

    With h == 1::

        array([15136., 16733., 20016., 17708., 18019., 19227., 22893., 23739.])
        1st: ~~~~ tr ~~~~ tr ~~~~ te
        2nd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te
        3rd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te

    With h == 2::

        array([15136., 16733., 20016., 17708., 18019., 19227., 22893., 23739.])
        1st: ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te
        2nd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te
        3rd: ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ tr ~~~~ te ~~~~ te

    Parameters
    ----------
    h : int, optional (default=1)
        The forecasting horizon, or the number of steps into the future after
        the last training sample for the test set.

    step : int, optional (default=1)
        The size of step taken to increase the training sample size.

    initial : int, optional (default=None)
        The initial training size. If None, will use 1 // 3 the length of the
        time series.

    Examples
    --------
    With a step size of one and a forecasting horizon of one, the training
    size will grow by 1 for each step, and the test index will be 1 + the
    last training index:

    >>> import pmdarima as pm
    >>> from pmdarima.model_selection import RollingForecastCV
    >>> wineind = pm.datasets.load_wineind()
    >>> cv = RollingForecastCV()
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57]), array([58]))
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57, 58]), array([59]))

    With a step size of 2 and a forecasting horizon of 4, the training size
    will grow by 2 for each step, and the test index will 4 + the last index
    in the training fold:

    >>> cv = RollingForecastCV(step=2, h=4)
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57]), array([58, 59, 60, 61]))
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
           51, 52, 53, 54, 55, 56, 57, 58, 59]), array([60, 61, 62, 63]))

    See Also
    --------
    SlidingWindowForecastCV

    References
    ----------
    .. [1] https://robjhyndman.com/hyndsight/tscv/
    """
    def __init__(self, h=1, step=1, initial=None):
        super().__init__(h, step)
        self.initial = initial

    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices

        Raises
        ------
        ValueError
            If ``initial`` is non-positive, or ``initial + h`` exceeds the
            number of samples in ``y``.
        """
        n_samples = len(y)
        initial = self.initial
        step = self.step
        h = self.h

        if initial is not None:
            if initial < 1:
                raise ValueError("Initial training size must be a positive "
                                 "integer")
            elif initial + h > n_samples:
                raise ValueError("The initial training size + forecasting "
                                 "horizon would exceed the length of the "
                                 "given timeseries!")
        else:
            # if it's 1, we have another problem..
            initial = max(1, n_samples // 3)

        all_indices = np.arange(n_samples)
        # The largest legal growth is n_samples - h - initial, where the test
        # window ends exactly at the last sample; the upper bound is +1 so
        # that final fold is not skipped (range() excludes its stop value).
        for train_step_size in range(0, n_samples - h - initial + 1, step):
            train_size = initial + train_step_size
            train_indices = all_indices[:train_size]
            test_indices = all_indices[train_size: train_size + h]
            yield train_indices, test_indices
class SlidingWindowForecastCV(BaseTSCrossValidator):
    """Use a sliding window to perform cross validation

    This approach to CV slides a window over the training samples while using
    several future samples as a test set. While similar to the
    :class:`RollingForecastCV`, it differs in that the train set does not
    grow, but rather shifts.

    Parameters
    ----------
    h : int, optional (default=1)
        The forecasting horizon, or the number of steps into the future after
        the last training sample for the test set.

    step : int, optional (default=1)
        The size of step taken to increase the training sample size when not
        performing using a window.

    window_size : int or None, optional (default=None)
        The size of the rolling window to use. If None, a rolling window of
        size n_samples // 5 will be used.

    Examples
    --------
    With a step size of one and a forecasting horizon of one, the training
    size will grow by 1 for each step, and the test index will be 1 + the
    last training index. Notice the sliding window also adjusts where the
    training sample begins for each fold:

    >>> import pmdarima as pm
    >>> from pmdarima.model_selection import SlidingWindowForecastCV
    >>> wineind = pm.datasets.load_wineind()
    >>> cv = SlidingWindowForecastCV()
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
           34]), array([35]))
    >>> next(cv_generator)
    (array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
           18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
           35]), array([36]))

    With a step size of 4, a forecasting horizon of 6, and a window size of
    12, the training size will grow by 4 for each step, and the test index
    will 6 + the last index in the training fold:

    >>> cv = SlidingWindowForecastCV(step=4, h=6, window_size=12)
    >>> cv_generator = cv.split(wineind)
    >>> next(cv_generator)
    (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
     array([12, 13, 14, 15, 16, 17]))
    >>> next(cv_generator)
    (array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
     array([16, 17, 18, 19, 20, 21]))

    See Also
    --------
    RollingForecastCV

    References
    ----------
    .. [1] https://robjhyndman.com/hyndsight/tscv/
    """
    def __init__(self, h=1, step=1, window_size=None):
        super().__init__(h, step)
        self.window_size = window_size

    def _iter_train_test_indices(self, y, exog):
        """Yields the train/test indices

        Raises
        ------
        ValueError
            If ``window_size + h`` exceeds the number of samples in ``y``.
        """
        n_samples = len(y)
        window_size = self.window_size
        step = self.step
        h = self.h

        if window_size is not None:
            if window_size + h > n_samples:
                raise ValueError("The window_size + forecasting "
                                 "horizon would exceed the length of the "
                                 "given timeseries!")
        else:
            # TODO: what's a good sane default for this?
            window_size = max(3, n_samples // 5)

        indices = np.arange(n_samples)
        # The last legal window start is n_samples - h - window_size, where
        # the test window ends exactly at the last sample; the upper bound
        # is +1 so that final fold is not skipped (range() excludes its
        # stop value).
        for window_start in range(0, n_samples - h - window_size + 1, step):
            window_end = window_start + window_size
            train_indices = indices[window_start: window_end]
            test_indices = indices[window_end: window_end + h]
            yield train_indices, test_indices
def check_cv(cv=None):
    """Input checker utility for building a cross-validator

    Parameters
    ----------
    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of CV or None. Possible inputs:

        - None, to use a default RollingForecastCV
        - A BaseTSCrossValidator as a passthrough

    Returns
    -------
    cv : BaseTSCrossValidator
        The validated cross-validator: either the one that was passed in
        or a ``RollingForecastCV`` with default arguments.

    Raises
    ------
    TypeError
        If ``cv`` is neither None nor a ``BaseTSCrossValidator`` instance.
    """
    cv = RollingForecastCV() if cv is None else cv
    if not isinstance(cv, BaseTSCrossValidator):
        raise TypeError("cv should be an instance of BaseTSCrossValidator or "
                        "None, but got %r (type=%s)" % (cv, type(cv)))
    return cv