# Source code for pmdarima.arima.seasonality

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Tests for seasonal differencing terms

from __future__ import absolute_import, division

from sklearn.utils.validation import column_or_1d, check_array
from sklearn.linear_model import LinearRegression
from sklearn.externals import six

from scipy.linalg import svd
from abc import ABCMeta, abstractmethod

from numpy.linalg import solve
import numpy as np

from ..utils.array import c
from .stationarity import _BaseStationarityTest
from ..compat.numpy import DTYPE
from ._arima import C_canova_hansen_sd_test

__all__ = [
    'CHTest'
]


class _SeasonalStationarityTest(six.with_metaclass(ABCMeta,
                                                   _BaseStationarityTest)):
    """Provides the base class for seasonal differencing tests such as the
    Canova-Hansen test and the Osborn-Chui-Smith-Birchenhall tests. These tests
    are used to determine the seasonal differencing term for a time-series.

    Parameters
    ----------
    m : int
        The seasonal periodicity of the series, e.g., 12 for monthly data.
        Must exceed 1.

    Raises
    ------
    ValueError
        If ``m`` is less than 2.
    """
    def __init__(self, m):
        # BUGFIX: validate ``m`` *before* assigning it, so an invalid value
        # is never stored on a partially-constructed instance.
        if m < 2:
            raise ValueError('m must be > 1')
        self.m = m

    @abstractmethod
    def estimate_seasonal_differencing_term(self, x):
        """Estimate the seasonal differencing term.

        Parameters
        ----------
        x : array-like, shape=(n_samples,)
            The time series vector.
        """


class CHTest(_SeasonalStationarityTest):
    """Conduct a CH test for seasonality.

    The Canova-Hansen test for seasonal differences. Canova and Hansen
    (1995) proposed a test statistic for the null hypothesis that the seasonal
    pattern is stable. The test statistic can be formulated in terms of
    seasonal dummies or seasonal cycles. The former allows us to identify
    seasons (e.g. months or quarters) that are not stable, while the latter
    tests the stability of seasonal cycles (e.g. cycles of period 2 and 4
    quarters in quarterly data). [1]

    Parameters
    ----------
    m : int
        The seasonal differencing term. For monthly data, e.g., this would be
        12. For quarterly, 4, etc. For the Canova-Hansen test to work,
        ``m`` must exceed 1.

    Notes
    -----
    This test is generally not used directly, but in conjunction with
    :func:`pmdarima.arima.nsdiffs`, which directly estimates the number
    of seasonal differences.

    References
    ----------
    .. [1] Testing for seasonal stability using the Canova
           and Hansen test statisic: http://bit.ly/2wKkrZo
    """
    def __init__(self, m):
        super(CHTest, self).__init__(m=m)

    @staticmethod
    def _sd_test(wts, s):
        """Compute the Canova-Hansen test statistic for series ``wts``
        with seasonal periodicity ``s``.

        Internal method: assumes no NaN values and ``s > 1``.
        """
        n = wts.shape[0]

        # no use checking, because this is an internal method
        # if n <= s: raise ValueError('too few samples (%i<=%i)' % (n, s))

        # BUGFIX: ``np.int`` was removed in numpy >= 1.24. It was simply an
        # alias for the builtin ``int``, so ``dtype=int`` is byte-identical
        # in behavior to the original ``dtype=np.int``.
        frec = np.ones(int((s + 1) / 2), dtype=int)
        ltrunc = int(np.round(s * ((n / 100.0) ** 0.25)))
        R1 = CHTest._seas_dummy(wts, s)

        # Fit an OLS model and take the residuals.
        # BUGFIX: the ``normalize`` kwarg was removed from LinearRegression
        # in scikit-learn 1.2. OLS *predictions* are invariant to feature
        # normalization (it only rescaled X and de-scaled the coefficients),
        # so dropping it yields the same residuals as the original
        # ``LinearRegression(normalize=True)``.
        lmch = LinearRegression().fit(R1, wts)
        # lmch = sm.OLS(wts, R1).fit(method='qr')
        residuals = wts - lmch.predict(R1)

        # translated R code:
        # multiply the residuals by the column vectors
        # Fhataux = Fhat.copy()
        # for i in range(Fhat.shape[1]):  # for (i in 1:(s-1))
        #     Fhataux[:, i] = R1[:, i] * residuals
        # more efficient numpy:
        Fhataux = (R1.T * residuals).T

        # translated R code
        # matrix row cumsums
        # Fhat = np.ones((n, s - 1)) * np.nan
        # for i in range(n):
        #     for n in range(Fhataux.shape[1]):
        #         Fhat[i, n] = Fhataux[:i, n].sum()
        # more efficient numpy:
        Fhat = Fhataux.cumsum(axis=0)
        Ne = Fhataux.shape[0]

        # As of v0.9.1, use the C_canova_hansen_sd_test function to compute
        # Omnw, Omfhat, A, frecob. This avoids the overhead of multiple calls
        # to C functions
        A, AtOmfhatA = C_canova_hansen_sd_test(ltrunc, Ne, Fhataux, frec, s)

        # UPDATE 01/04/2018 - we can get away without computing u, v
        # (this is also MUCH MUCH faster!!!)
        sv = svd(AtOmfhatA, compute_uv=False)  # type: np.ndarray

        # From R:
        # double.eps: the smallest positive floating-point number 'x' such
        # that '1 + x != 1'. Normally '2.220446e-16'.
        # Numpy's float64 has an eps of 2.2204460492503131e-16
        # If the matrix is effectively singular, the solve below would be
        # unstable; mirror the R behavior and return 0.
        if sv.min() < np.finfo(sv.dtype).eps:  # machine min eps
            return 0

        # solve against the identity matrix, then produce
        # a nasty mess of dot products... this is the (horrendous) R code:
        # (1/N^2) * sum(diag(solve(tmp) %*% t(A) %*% t(Fhat) %*% Fhat %*% A))
        # https://github.com/robjhyndman/forecast/blob/master/R/arima.R#L321
        solved = solve(AtOmfhatA, np.identity(AtOmfhatA.shape[0]))
        return (1.0 / n ** 2) * solved.dot(A.T).dot(
            Fhat.T).dot(Fhat).dot(A).diagonal().sum()

    @staticmethod
    def _seas_dummy(x, m):
        """Set up the seasonal dummy matrix using a Fourier series.

        Returns an ``(n, m - 1)`` matrix of sin/cos terms. Here is the
        R code it translates:
        (https://github.com/robjhyndman/forecast/blob/master/R/arima.R#L132)

        # SeasDummy <- function(x) {
        #   n <- length(x)
        #   m <- frequency(x)
        #   if (m == 1) {
        #     stop("Non-seasonal data")
        #   }
        #   tt <- 1:n
        #   fmat <- matrix(NA, nrow = n, ncol = 2 * m)
        #   for (i in 1:m) {
        #     fmat[, 2 * i] <- sin(2 * pi * i * tt / m)
        #     fmat[, 2 * (i - 1) + 1] <- cos(2 * pi * i * tt / m)
        #   }
        #   return(fmat[, 1:(m - 1)])
        # }
        """
        n = x.shape[0]

        # assume m > 1 since this function called internally...
        assert m > 1, 'This function is called internally and ' \
                      'should not encounter this issue'

        tt = np.arange(n) + 1
        fmat = np.ones((n, 2 * m)) * np.nan
        pi = np.pi

        for i in range(1, m + 1):  # for(i in 1:m)
            # subtract one, unlike the R code. in the R code, this sets
            # columns 2, 4, 6, etc... here it sets 1, 3, 5
            # fmat[,2*i] <- sin(2*pi*i*tt/m)
            fmat[:, (2 * i) - 1] = np.sin(2 * pi * i * tt / m)

            # in the R code, this sets columns 1, 3, 5, etc. here it
            # sets 0, 2, 4, etc.
            # fmat[,2*(i-1)+1] <- cos(2*pi*i*tt/m)
            fmat[:, 2 * (i - 1)] = np.cos(2 * pi * i * tt / m)

        return fmat[:, :m - 1]

    def estimate_seasonal_differencing_term(self, x):
        """Estimate the seasonal differencing term.

        Parameters
        ----------
        x : array-like, shape=(n_samples,)
            The time series vector.

        Returns
        -------
        D : int
            The seasonal differencing term. The CH test defines a set of
            critical values::

                (0.4617146, 0.7479655, 1.0007818,
                 1.2375350, 1.4625240, 1.6920200,
                 1.9043096, 2.1169602, 2.3268562,
                 2.5406922, 2.7391007)

            For different values of ``m``, the CH statistic is compared
            to a different critical value, and returns 1 if the computed
            statistic is greater than the critical value, or 0 if not.
        """
        if not self._base_case(x):
            return 0

        # ensure vector
        x = column_or_1d(check_array(
            x, ensure_2d=False, dtype=DTYPE,
            force_all_finite=True))  # type: np.ndarray

        n = x.shape[0]
        m = int(self.m)

        # too few observations for the test to be meaningful
        if n < 2 * m + 5:
            return 0

        chstat = self._sd_test(x, m)
        crit_vals = c(0.4617146, 0.7479655, 1.0007818, 1.2375350,
                      1.4625240, 1.6920200, 1.9043096, 2.1169602,
                      2.3268562, 2.5406922, 2.7391007)

        if m <= 12:
            return int(chstat > crit_vals[m - 2])  # R does m - 1...
        if m == 24:
            return int(chstat > 5.098624)
        if m == 52:
            return int(chstat > 10.341416)
        if m == 365:
            return int(chstat > 65.44445)

        # fall back to an approximation for other periodicities
        return int(chstat > 0.269 * (m ** 0.928))