Source code for pmdarima.utils.array

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Array utilities

from sklearn.utils import validation as skval

import numpy as np
import pandas as pd

from ..compat import DTYPE
from ._array import C_intgrt_vec

__all__ = [
    'as_series',
    'c',
    'check_endog',
    'check_exog',
    'diff',
    'diff_inv',
    'is_iterable'
]


def as_series(x, **kwargs):
    """Cast as pandas Series.

    Cast an iterable to a Pandas Series object. Note that the index will
    simply be a positional ``arange`` and cannot be set in this function.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The 1d array to cast as a pandas Series.

    Examples
    --------
    >>> as_series([1, 2, 3])
    0    1
    1    2
    2    3
    dtype: int64

    >>> as_series(as_series((1, 2, 3)))
    0    1
    1    2
    2    3
    dtype: int64

    >>> import pandas as pd
    >>> as_series(pd.Series([4, 5, 6], index=['a', 'b', 'c']))
    a    4
    b    5
    c    6
    dtype: int64

    Returns
    -------
    s : pd.Series
        A pandas Series object.
    """
    if isinstance(x, pd.Series):
        return x
    return pd.Series(skval.column_or_1d(x), **kwargs)

def c(*args):
    r"""Imitates the ``c`` function from R.

    Since this whole library is aimed at re-creating in Python what R has
    already done so well, the ``c`` function was created to wrap
    ``numpy.concatenate`` and mimic the R functionality. Similar to R, this
    works with scalars, iterables, and any mix thereof.

    Note that using the ``c`` function on multi-nested lists or iterables
    will fail!

    Examples
    --------
    Using ``c`` with varargs will yield a single array:

    >>> c(1, 2, 3, 4)
    array([1, 2, 3, 4])

    Using ``c`` with nested lists and scalars will also yield a single array:

    >>> c([1, 2], 4, c(5, 4))
    array([1, 2, 4, 5, 4])

    However, using ``c`` with multi-level lists will fail!

    >>> c([1, 2, 3], [[1, 2]])  # doctest: +SKIP
    ValueError: all the input arrays must have same number of dimensions

    References
    ----------
    .. [1] https://stat.ethz.ch/R-manual/R-devel/library/base/html/c.html
    """
    # R returns NULL for this
    if not args:
        return None

    # just an array of len 1
    if len(args) == 1:
        element = args[0]

        # if it's iterable, make it an array
        if is_iterable(element):
            return np.asarray(element)

        # otherwise it's not iterable, put it in an array
        return np.asarray([element])

    # np.concat all. This can be slow, as noted by numerous threads on
    # numpy concat efficiency, however an alternative using recursive
    # yields was tested and performed far worse:
    #
    # >>> def timeit(func, ntimes, *args):
    # ...     times = []
    # ...     for i in range(ntimes):
    # ...         start = time.time()
    # ...         func(*args)
    # ...         times.append(time.time() - start)
    # ...     arr = np.asarray(times)
    # ...     print("%s (%i times) - Mean: %.5f sec, "
    # ...           "Min: %.5f sec, Max: %.5f" % (func.__name__, ntimes,
    # ...                                         arr.mean(), arr.min(),
    # ...                                         arr.max()))
    # >>> y = [np.arange(10000), range(500), (1000,), 100, np.arange(50000)]
    # >>> timeit(c1, 100, *y)
    # c1 (100 times) - Mean: 0.00009 sec, Min: 0.00006 sec, Max: 0.00065
    # >>> timeit(c2, 100, *y)
    # c2 (100 times) - Mean: 0.08708 sec, Min: 0.08273 sec, Max: 0.10115
    #
    # So we stick with c1, which is this variant.
    return np.concatenate([a if is_iterable(a) else [a] for a in args])

def check_endog(
    y,
    dtype=DTYPE,
    copy=True,
    force_all_finite=False,
    preserve_series=True,
):
    """Wrapper for ``check_array`` and ``column_or_1d`` from sklearn

    Parameters
    ----------
    y : array-like, shape=(n_samples,)
        The 1d endogenous array.

    dtype : string, type or None (default=np.float64)
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    copy : bool, optional (default=True)
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : bool, optional (default=False)
        Whether to raise an error on np.inf and np.nan in an array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.

    preserve_series : bool, optional (default=True)
        Whether to preserve a ``pd.Series`` object. Will also attempt to
        squeeze a dataframe into a ``pd.Series``.

    Returns
    -------
    y : np.ndarray or pd.Series, shape=(n_samples,)
        A 1d numpy ndarray, or a pd.Series if ``preserve_series`` is True and
        the input carried an index.
    """
    endog = skval.check_array(
        y,
        ensure_2d=False,
        force_all_finite=force_all_finite,
        copy=copy,
        dtype=dtype,
    )
    endog = skval.column_or_1d(endog)

    if not preserve_series:
        return endog

    # possibly restore index information, if it was present, assigning
    # checked/casted values back into the series
    if isinstance(y, pd.DataFrame):
        y = y.squeeze()  # dtype: pd.Series
    if isinstance(y, pd.Series):
        endog = pd.Series(endog, index=y.index)

    return endog

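# Illustrative usage sketch (editorial note, not part of the original
# pmdarima source): ``check_endog`` validates a 1d endogenous array and, with
# ``preserve_series=True``, hands back a ``pd.Series`` carrying the original
# index. The values and index labels below are made up for the example, and
# the exact output dtype depends on the library's ``DTYPE`` constant.
#
#   >>> import pandas as pd
#   >>> y = pd.Series([1.0, 2.0, 3.0], index=['a', 'b', 'c'])
#   >>> out = check_endog(y, preserve_series=True)
#   >>> out.index.tolist()
#   ['a', 'b', 'c']
#   >>> check_endog([1, 2, 3], preserve_series=False).shape
#   (3,)
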
def check_exog(X, dtype=DTYPE, copy=True, force_all_finite=True):
    """A wrapper for ``check_array`` for 2D arrays

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The exogenous array. If a Pandas frame, a Pandas frame will be
        returned as well. Otherwise, a numpy array will be returned.

    dtype : string, type or None (default=np.float64)
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    copy : bool, optional (default=True)
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : bool, optional (default=True)
        Whether to raise an error on np.inf and np.nan in an array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.

    Returns
    -------
    X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
        Either a 2-d numpy array or pd.DataFrame
    """
    if hasattr(X, 'ndim') and X.ndim != 2:
        raise ValueError("Must be a 2-d array or dataframe")

    if isinstance(X, pd.DataFrame):
        # if not copy, go straight to asserting finite
        if copy and dtype is not None:
            X = X.astype(dtype)  # tantamount to copy

        if force_all_finite and (~X.apply(np.isfinite)).any().any():
            raise ValueError("Found non-finite values in dataframe")
        return X

    # otherwise just a pass-through to the scikit-learn method
    return skval.check_array(
        X,
        ensure_2d=True,
        dtype=DTYPE,
        copy=copy,
        force_all_finite=force_all_finite,
    )


def _diff_vector(x, lag):
    # compute the lag for a vector (not a matrix)
    n = x.shape[0]
    lag = min(n, lag)  # if lag > n, then we just want an empty array back
    return x[lag: n] - x[: n-lag]  # noqa: E226


def _diff_matrix(x, lag):
    # compute the lag for a matrix (not a vector)
    m, _ = x.shape
    lag = min(m, lag)  # if lag > m, then we just want an empty array back
    return x[lag: m, :] - x[: m-lag, :]  # noqa: E226

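# Illustrative usage sketch (editorial note, not part of the original
# pmdarima source) for ``check_exog`` above: the container type is preserved,
# so a DataFrame comes back as a DataFrame and anything else as an ndarray,
# while non-2-dimensional input is rejected. The column names and values here
# are hypothetical.
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> X_df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
#   >>> isinstance(check_exog(X_df), pd.DataFrame)
#   True
#   >>> check_exog(np.array([[1.0, 2.0], [3.0, 4.0]])).shape
#   (2, 2)
#   >>> check_exog(np.arange(3.0))  # doctest: +SKIP
#   ValueError: Must be a 2-d array or dataframe
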
def diff(x, lag=1, differences=1):
    """Difference an array.

    A python implementation of the R ``diff`` function [1]. This computes lag
    differences from an array given a ``lag`` and ``differencing`` term.

    If ``x`` is a vector of length :math:`n`, ``lag=1`` and ``differences=1``,
    then the computed result is equal to the successive differences
    ``x[lag:n] - x[:n-lag]``.

    Examples
    --------
    Where ``lag=1`` and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff(x, 1, 1)
    array([ -6.,  -2.,   7.,  25.], dtype=float32)

    Where ``lag=1`` and ``differences=2``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff(x, 1, 2)
    array([  4.,   9.,  18.], dtype=float32)

    Where ``lag=3`` and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff(x, 3, 1)
    array([ -1.,  30.], dtype=float32)

    Where ``lag=6`` (larger than the array is) and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff(x, 6, 1)
    array([], dtype=float32)

    For a 2d array with ``lag=1`` and ``differences=1``:

    >>> import numpy as np
    >>>
    >>> x = np.arange(1, 10).reshape((3, 3)).T
    >>> diff(x, 1, 1)
    array([[ 1.,  1.,  1.],
           [ 1.,  1.,  1.]], dtype=float32)

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The array to difference.

    lag : int, optional (default=1)
        An integer > 0 indicating which lag to use.

    differences : int, optional (default=1)
        An integer > 0 indicating the order of the difference.

    Returns
    -------
    res : np.ndarray, shape=(n_samples, [n_features])
        The result of the differenced arrays.

    References
    ----------
    .. [1] https://stat.ethz.ch/R-manual/R-devel/library/base/html/diff.html
    """
    if any(v < 1 for v in (lag, differences)):
        raise ValueError('lag and differences must be positive (> 0) integers')

    x = skval.check_array(x, ensure_2d=False, dtype=DTYPE, copy=False)
    fun = _diff_vector if x.ndim == 1 else _diff_matrix
    res = x

    # "recurse" over range of differences
    for i in range(differences):
        res = fun(res, lag)
        # if it ever comes back empty, just return it as is
        if not res.shape[0]:
            return res

    return res

def _diff_inv_vector(x, lag, differences, xi):
    # R code: if (missing(xi)) xi <- rep(0., lag * differences)
    # R code: if (length(xi) != lag * differences)
    # R code:     stop("'xi' does not have the right length")
    if xi is None:
        xi = np.zeros(lag * differences, dtype=DTYPE)
    else:
        xi = check_endog(
            xi,
            dtype=DTYPE,
            copy=False,
            force_all_finite=False,
            preserve_series=False,
        )
        if xi.shape[0] != lag * differences:
            raise IndexError('"xi" does not have the right length')

    if differences == 1:
        return np.asarray(C_intgrt_vec(x=x, xi=xi, lag=lag))
    else:
        # R code: diffinv.vector(diffinv.vector(x, lag, differences - 1L,
        # R code:                               diff(xi, lag=lag,
        # R code:                                    differences=1L)),
        # R code:                lag, 1L, xi[1L:lag])
        return diff_inv(
            x=diff_inv(x=x, lag=lag, differences=differences - 1,
                       xi=diff(x=xi, lag=lag, differences=1)),
            lag=lag,
            differences=1,
            xi=xi[:lag]  # R: xi[1L:lag]
        )


def _diff_inv_matrix(x, lag, differences, xi):
    n, m = x.shape
    y = np.zeros((n + lag * differences, m), dtype=DTYPE)

    if m >= 1:  # todo: R checks this. do we need to?
        # R: if(missing(xi)) xi <- matrix(0.0, lag*differences, m)
        if xi is None:
            xi = np.zeros((lag * differences, m), dtype=DTYPE)
        else:
            xi = skval.check_array(
                xi,
                dtype=DTYPE,
                copy=False,
                force_all_finite=False,
                ensure_2d=True,
            )
            if xi.shape != (lag * differences, m):
                raise IndexError('"xi" does not have the right shape')

        # TODO: can we vectorize?
        for i in range(m):
            y[:, i] = _diff_inv_vector(x[:, i], lag, differences, xi[:, i])

    return y

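# Illustrative sketch (editorial note, not part of the original pmdarima
# source): the ``xi`` argument threaded through the helpers above must contain
# exactly ``lag * differences`` starting values (one such block per column for
# a matrix), otherwise an ``IndexError`` is raised. Via the public
# ``diff_inv`` defined below:
#
#   >>> diff_inv(c(1., 2., 3.), lag=1, differences=2, xi=c(0.))
#   ... # doctest: +SKIP
#   IndexError: "xi" does not have the right length
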
def diff_inv(x, lag=1, differences=1, xi=None):
    """Inverse the difference of an array.

    A python implementation of the R ``diffinv`` function [1]. This computes
    the inverse of lag differences from an array given a ``lag`` and
    ``differencing`` term.

    If ``x`` is a vector of length :math:`n`, ``lag=1`` and ``differences=1``,
    then the computed result is equal to the cumulative sum plus left-padding
    of zeros equal to ``lag * differences``.

    Examples
    --------
    Where ``lag=1`` and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff_inv(x, 1, 1)
    array([ 0., 10., 14., 16., 25., 59.])

    Where ``lag=1`` and ``differences=2``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff_inv(x, 1, 2)
    array([  0.,   0.,  10.,  24.,  40.,  65., 124.])

    Where ``lag=3`` and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff_inv(x, 3, 1)
    array([ 0.,  0.,  0., 10.,  4.,  2., 19., 38.])

    Where ``lag=6`` (larger than the array is) and ``differences=1``:

    >>> x = c(10, 4, 2, 9, 34)
    >>> diff_inv(x, 6, 1)
    array([ 0.,  0.,  0.,  0.,  0.,  0., 10.,  4.,  2.,  9., 34.])

    For a 2d array with ``lag=1`` and ``differences=1``:

    >>> import numpy as np
    >>>
    >>> x = np.arange(1, 10).reshape((3, 3)).T
    >>> diff_inv(x, 1, 1)
    array([[ 0.,  0.,  0.],
           [ 1.,  4.,  7.],
           [ 3.,  9., 15.],
           [ 6., 15., 24.]])

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The differenced array to invert.

    lag : int, optional (default=1)
        An integer > 0 indicating which lag to use.

    differences : int, optional (default=1)
        An integer > 0 indicating the order of the difference.

    xi : array-like, optional (default=None)
        The initial values ("integration constants") used to seed the inverse
        difference. Must contain ``lag * differences`` values (per column, for
        a matrix). If None, zeros are used.

    Returns
    -------
    res : np.ndarray, shape=(n_samples, [n_features])
        The result of the inverse of the difference arrays.

    References
    ----------
    .. [1] https://stat.ethz.ch/R-manual/R-devel/library/stats/html/diffinv.html
    """  # noqa: E501
    x = skval.check_array(
        x,
        dtype=DTYPE,
        copy=False,
        force_all_finite=False,
        ensure_2d=False,
    )

    # R code: if (lag < 1L || differences < 1L)
    # R code:     stop("bad value for 'lag' or 'differences'")
    if any(v < 1 for v in (lag, differences)):
        raise ValueError('lag and differences must be positive (> 0) integers')

    if x.ndim == 1:
        return _diff_inv_vector(x, lag, differences, xi)
    elif x.ndim == 2:
        return _diff_inv_matrix(x, lag, differences, xi)
    raise ValueError("only vector and matrix inverse differencing "
                     "are supported")

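# Illustrative usage sketch (editorial note, not part of the original
# pmdarima source): ``diff_inv`` undoes ``diff`` up to the values supplied in
# ``xi``, which seed the reconstruction in place of the default zeros. With
# ``lag=1`` and ``differences=1`` the result is ``xi`` followed by
# ``xi + cumsum(x)``, and differencing it recovers ``x``:
#
#   >>> import numpy as np
#   >>> x = c(10, 4, 2, 9, 34)
#   >>> y = diff_inv(x, lag=1, differences=1, xi=c(5.))
#   >>> np.allclose(y, [5., 15., 19., 21., 30., 64.])
#   True
#   >>> np.allclose(diff(y, 1, 1), x)
#   True
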
def is_iterable(x):
    """Test a variable for iterability.

    Determine whether an object ``x`` is iterable. In Python 2, this was as
    simple as checking for the ``__iter__`` attribute. However, in Python 3,
    strings became iterable. Therefore, this function checks for the
    ``__iter__`` attribute, returning True if present (except for strings,
    for which it will return False).

    Parameters
    ----------
    x : str, iterable or object
        The object in question.

    Examples
    --------
    Strings and other objects are not iterable:

    >>> x = "not me"
    >>> y = 123
    >>> any(is_iterable(v) for v in (x, y))
    False

    Tuples, lists and other structures with ``__iter__`` are:

    >>> x = ('a', 'tuple')
    >>> y = ['a', 'list']
    >>> all(is_iterable(v) for v in (x, y))
    True

    This even applies to numpy arrays:

    >>> import numpy as np
    >>> is_iterable(np.arange(10))
    True

    Returns
    -------
    isiter : bool
        True if iterable, else False.
    """
    if isinstance(x, str):
        return False
    return hasattr(x, '__iter__')