# Source code for pmdarima.datasets.sunspots

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# This is the sunspots dataset found in R.

import numpy as np
import pandas as pd

from os.path import join
import calendar

from ..compat import DTYPE
from . import _base as base

# Public API of this module: only the dataset loader is exported.
__all__ = [
    'load_sunspots'
]


def load_sunspots(as_series=False, dtype=DTYPE):
    """Monthly Sunspot Numbers, 1749 - 1983

    Monthly mean relative sunspot numbers from 1749 to 1983. Collected at Swiss
    Federal Observatory, Zurich until 1960, then Tokyo Astronomical
    Observatory.

    Parameters
    ----------
    as_series : bool, optional (default=False)
        Whether to return a Pandas series. If True, the index will be set to
        the observed years/months. If False, will return a 1d numpy array.

    dtype : type, optional (default=np.float64)
        The type to return for the array. Default is np.float64, which is used
        throughout the package as the default type.

    Returns
    -------
    rslt : array-like, shape=(n_samples,)
        The sunspots dataset. There are 2820 observations.

    Notes
    -----
    This is monthly data, so *m* should be set to 12 when using in a seasonal
    context.

    When ``as_series=True`` the month part of each index label is taken from
    ``calendar.month_abbr``, which follows the current locale. The labels
    shown in the example below are those produced under the default
    ("C"/English) locale.

    Examples
    --------
    >>> from pmdarima.datasets import load_sunspots
    >>> load_sunspots()
    array([58. , 62.6, 70. , ..., 55.8, 33.3, 33.4])

    >>> load_sunspots(True).head()
    Jan 1749    58.0
    Feb 1749    62.6
    Mar 1749    70.0
    Apr 1749    55.7
    May 1749    85.0
    dtype: float64

    References
    ----------
    .. [1] https://www.rdocumentation.org/packages/datasets/versions/3.6.1/topics/sunspots
    """  # noqa: E501
    # Reuse the previously parsed array from ``base._cache`` when present;
    # otherwise parse the bundled gzipped text file and store the raw result.
    rslt = base._cache.get('sunspots', None)
    if rslt is None:
        data_path = join(base.get_data_path(), 'sunspots.txt.gz')
        rslt = np.loadtxt(data_path).ravel()
        base._cache['sunspots'] = rslt

    # don't want to cache type conversion: ``astype`` returns a new array by
    # default, so the cached array keeps the dtype it was loaded with.
    rslt = rslt.astype(dtype)

    if not as_series:
        return rslt

    # Otherwise we want a series and have to build the "<Mon> <year>" index.
    # Entry 0 of ``calendar.month_abbr`` is an empty placeholder, so slice
    # months 1..12 once instead of indexing it for every label.
    month_names = calendar.month_abbr[1:13]

    # 235 years (1749 through 1983 inclusive) x 12 months = 2820 labels,
    # ordered year-major so they line up with the flattened data.
    index = [
        "%s %i" % (month, year)
        for year in range(1749, 1984)
        for month in month_names
    ]

    return pd.Series(rslt, index=index)