# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Array utilities
from sklearn.utils import validation as skval
import numpy as np
import pandas as pd
from ..compat import DTYPE
from ._array import C_intgrt_vec
__all__ = [
'as_series',
'c',
'check_endog',
'check_exog',
'diff',
'diff_inv',
'is_iterable'
]
def as_series(x, **kwargs):
"""Cast as pandas Series.
Cast an iterable to a Pandas Series object. If ``x`` is already a
``pd.Series`` it is returned unchanged. Otherwise the values are validated
with ``column_or_1d``, the index defaults to a positional ``arange``, and
any keyword arguments are forwarded to the ``pd.Series`` constructor.
Parameters
----------
x : array-like, shape=(n_samples,)
    The 1d array to cast to a pandas Series.
**kwargs : keyword args, optional
    Keyword arguments forwarded to the ``pd.Series`` constructor when
    ``x`` is not already a Series.
Examples
--------
>>> as_series([1, 2, 3])
0    1
1    2
2    3
dtype: int64
>>> as_series(as_series((1, 2, 3)))
0    1
1    2
2    3
dtype: int64
>>> import pandas as pd
>>> as_series(pd.Series([4, 5, 6], index=['a', 'b', 'c']))
a    4
b    5
c    6
dtype: int64
Returns
-------
s : pd.Series
A pandas Series object.
"""
if isinstance(x, pd.Series):
return x
return pd.Series(skval.column_or_1d(x), **kwargs)
def c(*args):
r"""Imitates the ``c`` function from R.
Since this whole library is aimed at re-creating in
Python what R has already done so well, the ``c`` function was created to
wrap ``numpy.concatenate`` and mimic the R functionality. Similar to R,
this works with scalars, iterables, and any mix therein.
Note that using the ``c`` function on multi-nested lists or iterables
will fail!
Examples
--------
Using ``c`` with varargs will yield a single array:
>>> c(1, 2, 3, 4)
array([1, 2, 3, 4])
Using ``c`` with nested lists and scalars will also yield a single array:
>>> c([1, 2], 4, c(5, 4))
array([1, 2, 4, 5, 4])
However, using ``c`` with multi-level lists will fail!
>>> c([1, 2, 3], [[1, 2]]) # doctest: +SKIP
ValueError: all the input arrays must have same number of dimensions
References
----------
.. [1] https://stat.ethz.ch/R-manual/R-devel/library/base/html/c.html
"""
# R returns NULL for this
if not args:
return None
# just an array of len 1
if len(args) == 1:
element = args[0]
# if it's iterable, make it an array
if is_iterable(element):
return np.asarray(element)
# otherwise it's not iterable, put it in an array
return np.asarray([element])
# np.concat all. This can be slow, as noted by numerous threads on
# numpy concat efficiency, however an alternative using recursive
# yields was tested and performed far worse:
#
# >>> def timeit(func, ntimes, *args):
# ... times = []
# ... for i in range(ntimes):
# ... start = time.time()
# ... func(*args)
# ... times.append(time.time() - start)
# ... arr = np.asarray(times)
# ... print("%s (%i times) - Mean: %.5f sec, "
# ... "Min: %.5f sec, Max: %.5f" % (func.__name__, ntimes,
# ... arr.mean(), arr.min(),
# ... arr.max()))
# >>> y = [np.arange(10000), range(500), (1000,), 100, np.arange(50000)]
# >>> timeit(c1, 100, *y)
# c1 (100 times) - Mean: 0.00009 sec, Min: 0.00006 sec, Max: 0.00065
# >>> timeit(c2, 100, *y)
# c2 (100 times) - Mean: 0.08708 sec, Min: 0.08273 sec, Max: 0.10115
#
# So we stick with c1, which is this variant.
return np.concatenate([a if is_iterable(a) else [a] for a in args])
def check_endog(
y,
dtype=DTYPE,
copy=True,
force_all_finite=False,
preserve_series=True,
):
"""Wrapper for ``check_array`` and ``column_or_1d`` from sklearn
Parameters
----------
y : array-like, shape=(n_samples,)
The 1d endogenous array.
dtype : string, type or None (default=np.float64)
Data type of result. If None, the dtype of the input is preserved.
If "numeric", dtype is preserved unless array.dtype is object.
copy : bool, optional (default=True)
Whether a forced copy will be triggered. If copy=False, a copy might
still be triggered by a conversion.
force_all_finite : bool, optional (default=False)
Whether to raise an error on np.inf and np.nan in an array. The
possibilities are:
- True: Force all values of array to be finite.
- False: accept both np.inf and np.nan in array.
preserve_series : bool, optional (default=True)
Whether to preserve a ``pd.Series`` object. Will also attempt to
squeeze a dataframe into a ``pd.Series``.
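Examples
--------
A minimal sketch of expected behavior; ``dtype`` is pinned explicitly
here rather than relying on the module-level ``DTYPE`` default:
>>> import numpy as np
>>> check_endog([1, 2, 3], dtype=np.float64, preserve_series=False)
array([1., 2., 3.])
>>> import pandas as pd
>>> check_endog(pd.Series([1, 2, 3]), dtype=np.float64)
0    1.0
1    2.0
2    3.0
dtype: float64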
Returns
-------
y : np.ndarray or pd.Series, shape=(n_samples,)
    A 1d numpy ndarray, or a pandas Series if ``preserve_series`` is True
    and the input carried an index.
"""
endog = skval.check_array(
y,
ensure_2d=False,
force_all_finite=force_all_finite,
copy=copy,
dtype=dtype,
)
endog = skval.column_or_1d(endog)
if not preserve_series:
return endog
# possibly restore index information, if it was present, assigning
# checked/casted values back into the series
if isinstance(y, pd.DataFrame):
y = y.squeeze() # dtype: pd.Series
if isinstance(y, pd.Series):
endog = pd.Series(endog, index=y.index)
return endog
def check_exog(X, dtype=DTYPE, copy=True, force_all_finite=True):
"""A wrapper for ``check_array`` for 2D arrays
Parameters
----------
X : array-like, shape=(n_samples, n_features)
The exogenous array. If a Pandas frame, a Pandas frame will be returned
as well. Otherwise, a numpy array will be returned.
dtype : string, type or None (default=np.float64)
Data type of result. If None, the dtype of the input is preserved.
If "numeric", dtype is preserved unless array.dtype is object.
copy : bool, optional (default=True)
Whether a forced copy will be triggered. If copy=False, a copy might
still be triggered by a conversion.
force_all_finite : bool, optional (default=True)
Whether to raise an error on np.inf and np.nan in an array. The
possibilities are:
- True: Force all values of array to be finite.
- False: accept both np.inf and np.nan in array.
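Examples
--------
A minimal sketch of expected behavior; ``dtype`` is pinned explicitly
here rather than relying on the module-level ``DTYPE`` default:
>>> import numpy as np
>>> check_exog(np.array([[1, 2], [3, 4]]), dtype=np.float64)
array([[1., 2.],
       [3., 4.]])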
Returns
-------
X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
Either a 2-d numpy array or pd.DataFrame
"""
if hasattr(X, 'ndim') and X.ndim != 2:
raise ValueError("Must be a 2-d array or dataframe")
if isinstance(X, pd.DataFrame):
# if not copy, go straight to asserting finite
if copy and dtype is not None:
X = X.astype(dtype) # tantamount to copy
if force_all_finite and (~X.apply(np.isfinite)).any().any():
raise ValueError("Found non-finite values in dataframe")
return X
# otherwise just a pass-through to the scikit-learn method
return skval.check_array(
X,
ensure_2d=True,
dtype=dtype,
copy=copy,
force_all_finite=force_all_finite,
)
def _diff_vector(x, lag):
# compute the lag for a vector (not a matrix)
n = x.shape[0]
lag = min(n, lag) # if lag > n, then we just want an empty array back
return x[lag: n] - x[: n-lag] # noqa: E226
def _diff_matrix(x, lag):
# compute the lag for a matrix (not a vector)
m, _ = x.shape
lag = min(m, lag)  # if lag > m, then we just want an empty array back
return x[lag: m, :] - x[: m-lag, :] # noqa: E226
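# A quick sanity check (a sketch, not used by the functions above): for
# lag=1 these helpers agree with ``np.diff`` along the first axis.
#
# >>> import numpy as np
# >>> x = np.array([10., 4., 2., 9., 34.])
# >>> np.allclose(_diff_vector(x, 1), np.diff(x))
# True
# >>> m = np.arange(9.).reshape(3, 3)
# >>> np.allclose(_diff_matrix(m, 1), np.diff(m, axis=0))
# True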
def diff(x, lag=1, differences=1):
"""Difference an array.
A Python implementation of the R ``diff`` function [1]. This computes lag
differences from an array given a ``lag`` and ``differences`` term.
If ``x`` is a vector of length :math:`n`, ``lag=1`` and ``differences=1``,
then the computed result is equal to the successive differences
``x[lag:n] - x[:n-lag]``.
Examples
--------
Where ``lag=1`` and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff(x, 1, 1)
array([ -6., -2., 7., 25.], dtype=float32)
Where ``lag=1`` and ``differences=2``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff(x, 1, 2)
array([ 4., 9., 18.], dtype=float32)
Where ``lag=3`` and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff(x, 3, 1)
array([ -1., 30.], dtype=float32)
Where ``lag=6`` (larger than the array is) and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff(x, 6, 1)
array([], dtype=float32)
For a 2d array with ``lag=1`` and ``differences=1``:
>>> import numpy as np
>>>
>>> x = np.arange(1, 10).reshape((3, 3)).T
>>> diff(x, 1, 1)
array([[ 1., 1., 1.],
[ 1., 1., 1.]], dtype=float32)
Parameters
----------
x : array-like, shape=(n_samples, [n_features])
The array to difference.
lag : int, optional (default=1)
An integer > 0 indicating which lag to use.
differences : int, optional (default=1)
An integer > 0 indicating the order of the difference.
Returns
-------
res : np.ndarray, shape=(n_samples, [n_features])
The result of the differenced arrays.
References
----------
.. [1] https://stat.ethz.ch/R-manual/R-devel/library/base/html/diff.html
"""
if any(v < 1 for v in (lag, differences)):
raise ValueError('lag and differences must be positive (> 0) integers')
x = skval.check_array(x, ensure_2d=False, dtype=DTYPE, copy=False)
fun = _diff_vector if x.ndim == 1 else _diff_matrix
res = x
# "recurse" over range of differences
for i in range(differences):
res = fun(res, lag)
# if it ever comes back empty, just return it as is
if not res.shape[0]:
return res
return res
def _diff_inv_vector(x, lag, differences, xi):
# R code: if (missing(xi)) xi <- rep(0., lag * differences)
# R code: if (length(xi) != lag * differences)
# R code: stop("'xi' does not have the right length")
if xi is None:
xi = np.zeros(lag * differences, dtype=DTYPE)
else:
xi = check_endog(
xi,
dtype=DTYPE,
copy=False,
force_all_finite=False,
preserve_series=False,
)
if xi.shape[0] != lag * differences:
raise IndexError('"xi" does not have the right length')
if differences == 1:
return np.asarray(C_intgrt_vec(x=x, xi=xi, lag=lag))
else:
# R code: diffinv.vector(diffinv.vector(x, lag, differences - 1L,
# R code: diff(xi, lag=lag, differences=1L)),
# R code: lag, 1L, xi[1L:lag])
return diff_inv(
x=diff_inv(x=x, lag=lag, differences=differences - 1,
xi=diff(x=xi, lag=lag, differences=1)),
lag=lag,
differences=1,
xi=xi[:lag] # R: xi[1L:lag]
)
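# Intuition check (a sketch; the function above delegates to the Cython
# routine ``C_intgrt_vec`` rather than doing this): for lag=1 and
# differences=1 with the default zero ``xi``, the inverse difference is
# just a cumulative sum left-padded with a zero.
#
# >>> import numpy as np
# >>> x = np.array([10., 4., 2., 9., 34.])
# >>> np.allclose(_diff_inv_vector(x, 1, 1, None),
# ...             np.concatenate([[0.], np.cumsum(x)]))
# True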
def _diff_inv_matrix(x, lag, differences, xi):
n, m = x.shape
y = np.zeros((n + lag * differences, m), dtype=DTYPE)
if m >= 1: # todo: R checks this. do we need to?
# R: if(missing(xi)) xi <- matrix(0.0, lag*differences, m)
if xi is None:
xi = np.zeros((lag * differences, m), dtype=DTYPE)
else:
xi = skval.check_array(
xi,
dtype=DTYPE,
copy=False,
force_all_finite=False,
ensure_2d=True,
)
if xi.shape != (lag * differences, m):
raise IndexError('"xi" does not have the right shape')
# TODO: can we vectorize?
for i in range(m):
y[:, i] = _diff_inv_vector(x[:, i], lag, differences, xi[:, i])
return y
def diff_inv(x, lag=1, differences=1, xi=None):
"""
Inverse the difference of an array.
A Python implementation of the R ``diffinv`` function [1]. This computes
the inverse of lag differences from an array given a ``lag``
and ``differences`` term.
If ``x`` is a vector of length :math:`n`, ``lag=1`` and ``differences=1``,
then the computed result is equal to the cumulative sum plus left-padding
of zeros equal to ``lag * differences``.
Examples
--------
Where ``lag=1`` and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff_inv(x, 1, 1)
array([ 0., 10., 14., 16., 25., 59.])
Where ``lag=1`` and ``differences=2``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff_inv(x, 1, 2)
array([ 0., 0., 10., 24., 40., 65., 124.])
Where ``lag=3`` and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff_inv(x, 3, 1)
array([ 0., 0., 0., 10., 4., 2., 19., 38.])
Where ``lag=6`` (larger than the array is) and ``differences=1``:
>>> x = c(10, 4, 2, 9, 34)
>>> diff_inv(x, 6, 1)
array([ 0., 0., 0., 0., 0., 0., 10., 4., 2., 9., 34.])
For a 2d array with ``lag=1`` and ``differences=1``:
>>> import numpy as np
>>>
>>> x = np.arange(1, 10).reshape((3, 3)).T
>>> diff_inv(x, 1, 1)
array([[ 0., 0., 0.],
[ 1., 4., 7.],
[ 3., 9., 15.],
[ 6., 15., 24.]])
Parameters
----------
x : array-like, shape=(n_samples, [n_features])
The array to difference.
lag : int, optional (default=1)
An integer > 0 indicating which lag to use.
differences : int, optional (default=1)
An integer > 0 indicating the order of the difference.
Returns
-------
res : np.ndarray, shape=(n_samples, [n_features])
The result of the inverse of the difference arrays.
References
----------
.. [1] https://stat.ethz.ch/R-manual/R-devel/library/stats/html/diffinv.html
""" # noqa: E501
x = skval.check_array(
x,
dtype=DTYPE,
copy=False,
force_all_finite=False,
ensure_2d=False,
)
# R code: if (lag < 1L || differences < 1L)
# R code: stop ("bad value for 'lag' or 'differences'")
if any(v < 1 for v in (lag, differences)):
raise ValueError('lag and differences must be positive (> 0) integers')
if x.ndim == 1:
return _diff_inv_vector(x, lag, differences, xi)
elif x.ndim == 2:
return _diff_inv_matrix(x, lag, differences, xi)
raise ValueError("only vector and matrix inverse differencing "
"are supported")
def is_iterable(x):
"""Test a variable for iterability.
Determine whether an object ``x`` is iterable. In Python 2, this
was as simple as checking for the ``__iter__`` attribute. However, in
Python 3, strings also define ``__iter__``. Therefore, this function
checks for the ``__iter__`` attribute, returning True if present (except
for strings, for which it returns False).
Parameters
----------
x : str, iterable or object
The object in question.
Examples
--------
Strings and scalars are not considered iterable by this function:
>>> x = "not me"
>>> y = 123
>>> any(is_iterable(v) for v in (x, y))
False
Tuples, lists and other structures with ``__iter__`` are:
>>> x = ('a', 'tuple')
>>> y = ['a', 'list']
>>> all(is_iterable(v) for v in (x, y))
True
This even applies to numpy arrays:
>>> import numpy as np
>>> is_iterable(np.arange(10))
True
Returns
-------
isiter : bool
True if iterable, else False.
"""
if isinstance(x, str):
return False
return hasattr(x, '__iter__')