Source code for pmdarima.arima.auto

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Automatically find optimal parameters for an ARIMA

from __future__ import absolute_import

from sklearn.utils.validation import check_array, column_or_1d, check_is_fitted
from sklearn.utils import check_random_state
from sklearn.externals.joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

import numpy as np

import warnings
import time

from ..base import BaseARIMA
from . import _doc
from .utils import ndiffs, is_constant, nsdiffs
from ..utils import diff, is_iterable
from ..utils.metaestimators import if_has_delegate
from ._auto_solvers import _fit_arima, _StepwiseFitWrapper

# for python 3 compat
from ..compat.python import xrange
from ..compat.numpy import DTYPE

__all__ = [
    'auto_arima',
    'AutoARIMA'
]

# The valid information criteria
VALID_CRITERIA = {'aic', 'aicc', 'bic', 'hqic', 'oob'}


class AutoARIMA(BaseARIMA):
    # Don't add the y, exog, etc. here since they are used in 'fit'
    __doc__ = _doc._AUTO_ARIMA_DOCSTR.format(
        y="",
        exogenous="",
        fit_args="",
        return_valid_fits="")

    # todo: someday store defaults somewhere else for single source of truth
    def __init__(self, start_p=2, d=None, start_q=2, max_p=5,
                 max_d=2, max_q=5, start_P=1, D=None, start_Q=1, max_P=2,
                 max_D=1, max_Q=2, max_order=10, m=1, seasonal=True,
                 stationary=False, information_criterion='aic', alpha=0.05,
                 test='kpss', seasonal_test='ocsb', stepwise=True, n_jobs=1,
                 start_params=None, trend=None, method=None, transparams=True,
                 solver='lbfgs', maxiter=None, disp=0, callback=None,
                 offset_test_args=None, seasonal_test_args=None,
                 suppress_warnings=False, error_action='warn', trace=False,
                 random=False, random_state=None, n_fits=10,
                 out_of_sample_size=0, scoring='mse', scoring_args=None,
                 with_intercept=True):

        self.start_p = start_p
        self.d = d
        self.start_q = start_q
        self.max_p = max_p
        self.max_d = max_d
        self.max_q = max_q
        self.start_P = start_P
        self.D = D
        self.start_Q = start_Q
        self.max_P = max_P
        self.max_D = max_D
        self.max_Q = max_Q
        self.max_order = max_order
        self.m = m
        self.seasonal = seasonal
        self.stationary = stationary
        self.information_criterion = information_criterion
        self.alpha = alpha
        self.test = test
        self.seasonal_test = seasonal_test
        self.stepwise = stepwise
        self.n_jobs = n_jobs
        self.start_params = start_params
        self.trend = trend
        self.method = method
        self.transparams = transparams
        self.solver = solver
        self.maxiter = maxiter
        self.disp = disp
        self.callback = callback
        self.offset_test_args = offset_test_args
        self.seasonal_test_args = seasonal_test_args
        self.suppress_warnings = suppress_warnings
        self.error_action = error_action
        self.trace = trace
        self.random = random
        self.random_state = random_state
        self.n_fits = n_fits
        self.out_of_sample_size = out_of_sample_size
        self.scoring = scoring
        self.scoring_args = scoring_args
        self.with_intercept = with_intercept
    def fit(self, y, exogenous=None, **fit_args):
        """Fit the auto-arima estimator

        Fit an AutoARIMA to a vector, ``y``, of observations with an
        optional matrix of ``exogenous`` variables.

        Parameters
        ----------
        y : array-like or iterable, shape=(n_samples,)
            The time-series to which to fit the ``ARIMA`` estimator. This may
            either be a Pandas ``Series`` object (statsmodels can internally
            use the dates in the index), or a numpy array. This should be a
            one-dimensional array of floats, and should not contain any
            ``np.nan`` or ``np.inf`` values.

        exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
            An optional 2-d array of exogenous variables. If provided, these
            variables are used as additional features in the regression
            operation. This should not include a constant or trend. Note that
            if an ``ARIMA`` is fit on exogenous features, it must be provided
            exogenous features for making predictions.

        **fit_args : dict or kwargs
            Any keyword arguments to pass to the auto-arima function.
        """
        self.model_ = auto_arima(
            y, exogenous=exogenous, start_p=self.start_p, d=self.d,
            start_q=self.start_q, max_p=self.max_p, max_d=self.max_d,
            max_q=self.max_q, start_P=self.start_P, D=self.D,
            start_Q=self.start_Q, max_P=self.max_P, max_D=self.max_D,
            max_Q=self.max_Q, max_order=self.max_order, m=self.m,
            seasonal=self.seasonal, stationary=self.stationary,
            information_criterion=self.information_criterion,
            alpha=self.alpha, test=self.test,
            seasonal_test=self.seasonal_test, stepwise=self.stepwise,
            n_jobs=self.n_jobs, start_params=self.start_params,
            trend=self.trend, method=self.method,
            transparams=self.transparams, solver=self.solver,
            maxiter=self.maxiter, disp=self.disp, callback=self.callback,
            offset_test_args=self.offset_test_args,
            seasonal_test_args=self.seasonal_test_args,
            suppress_warnings=self.suppress_warnings,
            error_action=self.error_action, trace=self.trace,
            random=self.random, random_state=self.random_state,
            n_fits=self.n_fits,
            return_valid_fits=False,  # only return ONE
            out_of_sample_size=self.out_of_sample_size,
            scoring=self.scoring, scoring_args=self.scoring_args,
            with_intercept=self.with_intercept, **fit_args)

        return self
    @if_has_delegate("model_")
    def predict_in_sample(self, exogenous=None, start=None,
                          end=None, dynamic=False):
        return self.model_.predict_in_sample(
            exogenous=exogenous, start=start, end=end, dynamic=dynamic)

    @if_has_delegate("model_")
    def predict(self, n_periods=10, exogenous=None,
                return_conf_int=False, alpha=0.05):
        return self.model_.predict(
            n_periods=n_periods, exogenous=exogenous,
            return_conf_int=return_conf_int, alpha=alpha)

    @if_has_delegate("model_")
    def update(self, y, exogenous=None, maxiter=None, **kwargs):
        return self.model_.update(
            y, exogenous=exogenous, maxiter=maxiter, **kwargs)
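# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library source). A minimal,
# hedged example of how the AutoARIMA estimator above might be used; the
# ``load_wineind`` dataset and the hyperparameter choices are assumptions
# made for demonstration only:
#
#     from pmdarima.arima import AutoARIMA
#     from pmdarima.datasets import load_wineind
#
#     y = load_wineind()  # monthly univariate series (m=12), no NaN/inf
#     model = AutoARIMA(seasonal=True, m=12, suppress_warnings=True,
#                       error_action='ignore', trace=True)
#     model.fit(y)  # runs auto_arima under the hood and stores ``model_``
#
#     # forecast the next 12 periods with a confidence interval
#     preds, conf_int = model.predict(n_periods=12, return_conf_int=True)
# ---------------------------------------------------------------------------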
# TODO: decorator to automate all this composition + AIC, etc.
def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5,
               max_d=2, max_q=5, start_P=1, D=None, start_Q=1, max_P=2,
               max_D=1, max_Q=2, max_order=10, m=1, seasonal=True,
               stationary=False, information_criterion='aic', alpha=0.05,
               test='kpss', seasonal_test='ocsb', stepwise=True, n_jobs=1,
               start_params=None, trend=None, method=None, transparams=True,
               solver='lbfgs', maxiter=None, disp=0, callback=None,
               offset_test_args=None, seasonal_test_args=None,
               suppress_warnings=False, error_action='warn', trace=False,
               random=False, random_state=None, n_fits=10,
               return_valid_fits=False, out_of_sample_size=0, scoring='mse',
               scoring_args=None, with_intercept=True, **fit_args):
    # NOTE: Doc is assigned BELOW this function

    start = time.time()

    # validate start/max points
    if any(_ < 0 for _ in (max_p, max_q, max_P, max_Q, start_p, start_q,
                           start_P, start_Q)):
        raise ValueError('starting and max p, q, P & Q values must '
                         'be positive integers (>= 0)')

    if max_p < start_p or max_q < start_q \
            or max_P < start_P or max_Q < start_Q:
        raise ValueError('max p, q, P & Q must be >= '
                         'their starting values')

    # validate max_order
    if max_order is None:
        max_order = np.inf
    elif max_order < 0:
        raise ValueError('max_order must be None or a positive '
                         'integer (>= 0)')

    # alternatively, if the start_p and start_q supersede
    # the max order, that's a failable offense...
    if (not seasonal and start_p + start_q > max_order) or \
            (seasonal and start_P + start_p + start_Q + start_q > max_order):
        raise ValueError('If max_order is prescribed, it must '
                         'exceed sum of starting orders')

    # validate d & D
    for _d, _max_d in ((d, max_d), (D, max_D)):
        if _max_d < 0:
            raise ValueError('max_d & max_D must be positive integers (>= 0)')
        if _d is not None:
            if _d < 0:
                raise ValueError('d & D must be None or a positive '
                                 'integer (>= 0)')
            # v0.9.0+ - ignore this if it's explicitly set...
            # if _d > _max_d:
            #     raise ValueError('if explicitly defined, d & D must be <= '
            #                      'max_d & <= max_D, respectively')

    # is stepwise AND parallel enabled?
    if stepwise and n_jobs != 1:
        warnings.warn('stepwise model cannot be fit in parallel (n_jobs=%i). '
                      'Falling back to stepwise parameter search.' % n_jobs)
        n_jobs = 1

    # check on m
    if m < 1:
        raise ValueError('m must be a positive integer (> 0)')

    # check on n_iter
    if random and n_fits < 0:
        raise ValueError('n_iter must be a positive integer '
                         'for a random search')

    # validate error action
    actions = {'warn', 'raise', 'ignore', None}
    if error_action not in actions:
        raise ValueError('error_action must be one of %r, but got %r'
                         % (actions, error_action))

    # copy array
    y = column_or_1d(check_array(y, ensure_2d=False, dtype=DTYPE, copy=True,
                                 force_all_finite=True))  # type: np.ndarray
    n_samples = y.shape[0]

    # check for constant data
    if is_constant(y):
        warnings.warn('Input time-series is completely constant; '
                      'returning a (0, 0, 0) ARMA.')
        return _return_wrapper(_post_ppc_arima(
            _fit_arima(y, xreg=exogenous, order=(0, 0, 0),
                       seasonal_order=None,
                       start_params=start_params, trend=trend,
                       method=method, transparams=transparams,
                       solver=solver, maxiter=maxiter,
                       disp=disp, callback=callback,
                       fit_params=fit_args,
                       suppress_warnings=suppress_warnings,
                       trace=trace,
                       error_action=error_action,
                       scoring=scoring,
                       out_of_sample_size=out_of_sample_size,
                       scoring_args=scoring_args,
                       with_intercept=with_intercept)),
            return_valid_fits, start, trace)

    # test ic, and use AIC if n <= 3
    if information_criterion not in VALID_CRITERIA:
        raise ValueError('auto_arima not defined for '
                         'information_criteria=%s. '
                         'Valid information criteria include: %r'
                         % (information_criterion, VALID_CRITERIA))

    # the R code handles this, but I don't think statsmodels
    # will even fit a model this small...
    # if n_samples <= 3:
    #     if information_criterion != 'aic':
    #         warnings.warn('n_samples (%i) <= 3 '
    #                       'necessitates using AIC' % n_samples)
    #     information_criterion = 'aic'

    # adjust max p, q -- R code:
    # max.p <- min(max.p, floor(serieslength/3))
    # max.q <- min(max.q, floor(serieslength/3))
    max_p = int(min(max_p, np.floor(n_samples / 3)))
    max_q = int(min(max_q, np.floor(n_samples / 3)))

    # this is not in the R code and poses a risk that R did not consider...
    # if max_p|q has now dropped below start_p|q, correct it.
    start_p = min(start_p, max_p)
    start_q = min(start_q, max_q)

    # if it's not seasonal, we can avoid multiple 'if not is None' comparisons
    # later by just using this shortcut (hack):
    if not seasonal:
        D = m = -1

    # choose the order of differencing
    xx = y.copy()
    if exogenous is not None:
        lm = LinearRegression().fit(exogenous, y)
        xx = y - lm.predict(exogenous)

    # is the TS stationary?
    if stationary:
        d = D = 0

    # now for seasonality
    if m == 1:
        D = max_P = max_Q = 0
    # m must be > 1 for nsdiffs
    elif D is None:  # we don't have a D yet and we need one (seasonal)
        seasonal_test_args = seasonal_test_args \
            if seasonal_test_args is not None else dict()
        D = nsdiffs(xx, m=m, test=seasonal_test, max_D=max_D,
                    **seasonal_test_args)

        if D > 0 and exogenous is not None:
            diffxreg = diff(exogenous, differences=D, lag=m)
            # check for constancy on any column
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                D -= 1

    # D might still be None if not seasonal. Py 3 will throw an error for that
    # unless we explicitly check for ``seasonal``
    if D > 0:
        dx = diff(xx, differences=D, lag=m)
    else:
        dx = xx

    # If D was too big, we might have gotten rid of x altogether!
    if dx.shape[0] == 0:
        raise ValueError("The seasonal differencing order, D=%i, was too "
                         "large for your time series, and after differencing, "
                         "there are no samples remaining in your data. "
                         "Try a smaller value for D, or if you didn't set D "
                         "to begin with, try setting it explicitly. This can "
                         "also occur in seasonal settings when m is too large."
                         % D)

    # difference the exogenous matrix
    if exogenous is not None:
        if D > 0:
            diffxreg = diff(exogenous, differences=D, lag=m)
        else:
            diffxreg = exogenous
    else:
        # here's the thing... we're only going to use diffxreg if exogenous
        # was not None in the first place. However, PyCharm doesn't know that
        # and it thinks we might use it before assigning it. Therefore, assign
        # it to None as a default value and it won't raise the warning anymore.
        diffxreg = None

    # determine/set the order of differencing by estimating the number of
    # orders it would take in order to make the TS stationary.
    if d is None:
        offset_test_args = offset_test_args \
            if offset_test_args is not None else dict()
        d = ndiffs(dx, test=test, alpha=alpha, max_d=max_d, **offset_test_args)

        if d > 0 and exogenous is not None:
            diffxreg = diff(diffxreg, differences=d, lag=1)

            # if any columns are constant, subtract one order of differencing
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                d -= 1

    # check differences (do we want to warn?...)
    if error_action == 'warn' and not suppress_warnings:
        if D >= 2:
            warnings.warn("Having more than one seasonal difference is "
                          "not recommended. Please consider using only one "
                          "seasonal difference.")
        # if D is -1, this will be off, so we include the OR
        elif D + d > 2 or d > 2:
            warnings.warn("Having 3 or more differencing operations is not "
                          "recommended. Please consider reducing the total "
                          "number of differences.")

    if d > 0:
        dx = diff(dx, differences=d, lag=1)

    # check for constancy
    if is_constant(dx):
        if exogenous is None and not (D > 0 or d < 2):
            raise ValueError('data follow a simple polynomial and are not '
                             'suitable for ARIMA modeling')

        # perfect regression
        ssn = None if not seasonal else (0, D, 0, m)
        return _return_wrapper(
            _post_ppc_arima(_fit_arima(y, xreg=exogenous, order=(0, d, 0),
                                       seasonal_order=ssn,
                                       start_params=start_params, trend=trend,
                                       method=method, transparams=transparams,
                                       solver=solver, maxiter=maxiter,
                                       disp=disp, callback=callback,
                                       fit_params=fit_args,
                                       suppress_warnings=suppress_warnings,
                                       trace=trace,
                                       error_action=error_action,
                                       scoring=scoring,
                                       out_of_sample_size=out_of_sample_size,
                                       scoring_args=scoring_args,
                                       with_intercept=with_intercept)),
            return_valid_fits, start, trace)

    # seasonality issues
    if m > 1:
        if max_P > 0:
            max_p = min(max_p, m - 1)
        if max_Q > 0:
            max_q = min(max_q, m - 1)

    # generate the set of (p, q, P, Q) FIRST, since it is contingent
    # on whether or not the user is interested in a seasonal ARIMA result.
    # This will reduce the search space for non-seasonal ARIMA models.
    # loop p, q. Make sure to loop at +1 interval,
    # since max_{p|q} is inclusive.
    if seasonal:
        gen = (
            ((p, d, q), (P, D, Q, m))
            for p in xrange(start_p, max_p + 1)
            for q in xrange(start_q, max_q + 1)
            for P in xrange(start_P, max_P + 1)
            for Q in xrange(start_Q, max_Q + 1)
            if p + q + P + Q <= max_order
        )
    else:
        # otherwise it's not seasonal, and we don't need the seasonal pieces
        gen = (
            ((p, d, q), None)
            for p in xrange(start_p, max_p + 1)
            for q in xrange(start_q, max_q + 1)
            if p + q <= max_order
        )

    if not stepwise:
        # if we are fitting a random search rather than an exhaustive one, we
        # will scramble up the generator (as a list) and only fit n_iter ARIMAs
        if random:
            random_state = check_random_state(random_state)

            # make a list to scramble...
            gen = random_state.permutation(list(gen))[:n_fits]

        # get results in parallel
        all_res = Parallel(n_jobs=n_jobs)(
            delayed(_fit_arima)(y, xreg=exogenous, order=order,
                                seasonal_order=seasonal_order,
                                start_params=start_params, trend=trend,
                                method=method, transparams=transparams,
                                solver=solver, maxiter=maxiter,
                                disp=disp, callback=callback,
                                fit_params=fit_args,
                                suppress_warnings=suppress_warnings,
                                trace=trace,
                                error_action=error_action,
                                out_of_sample_size=out_of_sample_size,
                                scoring=scoring,
                                scoring_args=scoring_args,
                                with_intercept=with_intercept)
            for order, seasonal_order in gen)

    # otherwise, we're fitting the stepwise algorithm...
    else:
        if n_samples < 10:
            start_p = min(start_p, 1)
            start_q = min(start_q, 1)
            start_P = start_Q = 0

        # adjust to p, q, P, Q vals
        p = start_p = min(start_p, max_p)
        q = start_q = min(start_q, max_q)
        P = start_P = min(start_P, max_P)
        Q = start_Q = min(start_Q, max_Q)

        # init the stepwise model wrapper
        stepwise_wrapper = _StepwiseFitWrapper(
            y, xreg=exogenous, start_params=start_params, trend=trend,
            method=method, transparams=transparams, solver=solver,
            maxiter=maxiter, disp=disp, callback=callback,
            fit_params=fit_args, suppress_warnings=suppress_warnings,
            trace=trace, error_action=error_action,
            out_of_sample_size=out_of_sample_size, scoring=scoring,
            scoring_args=scoring_args, p=p, d=d, q=q, P=P, D=D, Q=Q, m=m,
            start_p=start_p, start_q=start_q, start_P=start_P,
            start_Q=start_Q, max_p=max_p, max_q=max_q, max_P=max_P,
            max_Q=max_Q, seasonal=seasonal,
            information_criterion=information_criterion,
            max_order=max_order, with_intercept=with_intercept)

        # fit a baseline p, d, q model and then a null model
        stepwise_wrapper.fit_increment_k_cache_set(True)  # p, d, q, P, D, Q
        stepwise_wrapper.fit_increment_k_cache_set(
            True, p=0, q=0, P=0, Q=0)  # null model

        # fit a basic AR model
        stepwise_wrapper.fit_increment_k_cache_set(
            max_p > 0 or max_P > 0, p=int(max_p > 0),
            q=0, P=int(m > 1 and max_P > 0), Q=0)

        # fit a basic MA model now
        stepwise_wrapper.fit_increment_k_cache_set(
            max_q > 0 or max_Q > 0, p=0, q=int(max_q > 0),
            P=0, Q=int(m > 1 and max_Q > 0))

        # might not be 4 if p, q etc. are already 0
        # assert stepwise_wrapper.k == 4  # sanity check...

        # do the step-through...
        all_res = stepwise_wrapper.step_through()

    # filter the non-successful ones
    filtered = _post_ppc_arima(all_res)

    # sort by the criteria - lower is better for both AIC and BIC
    # (https://stats.stackexchange.com/questions/81427/aic-guidelines-in-model-selection)
    sorted_res = sorted(filtered,
                        key=(lambda mod:
                             getattr(mod, information_criterion)()))

    # remove all the cached .pmdpkl files... someday write this as an exit
    # hook in case of a KeyboardInterrupt or anything
    for model in sorted_res:
        model._clear_cached_state()

    return _return_wrapper(sorted_res, return_valid_fits, start, trace)
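# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library source). A minimal,
# hedged example of calling ``auto_arima`` directly; the dataset and the
# search settings below are assumptions chosen only to exercise the stepwise
# and random-search code paths defined above:
#
#     from pmdarima.arima import auto_arima
#     from pmdarima.datasets import load_wineind
#
#     y = load_wineind()
#
#     # stepwise search (the default), based on Hyndman & Khandakar (2008)
#     fit = auto_arima(y, seasonal=True, m=12, trace=True,
#                      error_action='ignore', suppress_warnings=True)
#
#     # random search over the (p, q, P, Q) grid, parallelized via joblib
#     fits = auto_arima(y, seasonal=True, m=12, stepwise=False,
#                       random=True, random_state=42, n_fits=10,
#                       n_jobs=2, return_valid_fits=True)
# ---------------------------------------------------------------------------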
# Assign the doc to the auto_arima func
auto_arima.__doc__ = _doc._AUTO_ARIMA_DOCSTR.format(
    y=_doc._Y_DOCSTR,
    exogenous=_doc._EXOG_DOCSTR,
    fit_args=_doc._FIT_ARGS_DOCSTR,
    return_valid_fits=_doc._VALID_FITS_DOCSTR
)


def _post_ppc_arima(a):
    """If there are no suitable models, raise a ValueError.
    Otherwise, return ``a``. In the case that ``a`` is an iterable
    (i.e., it made it to the end of the function), this method will
    filter out the None values and assess whether the list is empty.

    Parameters
    ----------
    a : ARIMA or iterable
        The list of ARIMAs, or an ARIMA
    """
    # if it's a result of making it to the end, it will
    # be a list of ARIMA models. Filter out the Nones
    # (the failed models)...
    if is_iterable(a):
        a = [m for m in a if m is not None]

    # if the list is empty, or if it was an ARIMA and it's None
    if not a:  # check for truthiness rather than None explicitly
        raise ValueError('Could not successfully fit ARIMA to input data. '
                         'It is likely your data is non-stationary. Please '
                         'induce stationarity or try a different '
                         'range of model order params. If your data is '
                         'seasonal, check the period (m) of the data.')

    # good to return
    return a


def _return_wrapper(fits, return_all, start, trace):
    """If the user wants to get all of the models back, this will
    return a list of the ARIMA models, otherwise it will just return
    the model. If this is called from the end of the function, ``fits``
    will already be a list.

    We *know* that if a function call makes it here, ``fits`` is NOT None
    or it would have thrown an exception in :func:`_post_ppc_arima`.

    Parameters
    ----------
    fits : iterable or ARIMA
        The ARIMA(s)

    return_all : bool
        Whether to return all.
    """
    # make sure it's an iterable
    if not is_iterable(fits):
        fits = [fits]

    # whether to print the final runtime
    if trace:
        print('Total fit time: %.3f seconds' % (time.time() - start))

    # which to return? if not all, then first index (assume sorted)
    if not return_all:
        return fits[0]
    return fits