Source code for pyts.datasets.make

"""Functions to make datasets."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import numpy as np
from sklearn.utils import check_array, check_random_state
from sklearn.utils import shuffle as utils_shuffle


[docs]def make_cylinder_bell_funnel(
        n_samples=30, weights=None, shuffle=True, random_state=None,
        return_params=False
):
    r"""Make a Cylinder-Bell-Funnel dataset.

    The classes are coded with the following meaning:

        - 0: Cylinder
        - 1: Bell
        - 2: Funnel

    Parameters
    ----------
    n_samples : int (default = 30)
        The number of time series.

    weights : list of floats or None (default=None)
        The proportions of samples assigned to each class. If None, then
        classes are balanced. Note that if ``len(weights) == 2``,
        then the last class weight is automatically inferred.
        More than ``n_samples`` samples may be returned if the sum of
        ``weights`` exceeds 1.

    shuffle : bool (default = True)
        If True, shuffle the samples.

    return_params : bool (default = False)
        If True, a dictionary containing the parameters used to make the
        time series is returned.

    random_state : int, RandomState instance or None (default = None)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.

    Returns
    -------
    X : array, shape = (n_samples, 128)
        Generated time series.

    y : array, shape = (n_samples,)
        Generated class labels. Labels have the following meaning:

        - 0: Cylinder
        - 1: Bell
        - 2: Funnel

    params : dict
        The parameters used to generate the data. The keys are 'a',
        'b', 'eta' and 'epsilon' and the values are the corresponding
        values. Only returned if ``return_params=True``.

    Notes
    -----
    The time series are generated from the following distributions:

    .. math::

        c(t) = (6 + \eta) \cdot 1_{[a, b]}(t) + \epsilon(t)

        b(t) = (6 + \eta) \cdot 1_{[a, b]}(t) \cdot (t - a) / (b - a) +
        \epsilon(t)

        f(t) = (6 + \eta) \cdot 1_{[a, b]}(t) \cdot (b - t) / (b - a) +
        \epsilon(t)

    where :math:`t=1,\ldots,128`, :math:`a` is an integer-valued uniform random
    variable on the inverval :math:`[16, 32]`, :math:`b-a` is an integer-valued
    uniform distribution on the inveral :math:`[32, 96]`, :math:`\eta` and
    :math:`\epsilon(t)` are standard normal variables,
    :math:`{1}_{[a, b]}` is the characteristic function on the interval
    :math:`[a, b]`. :math:`c`, :math:`b`, and :math:`f` stand for "cylinder",
    "bell", and "funnel" respectively.

    References
    ----------
    .. [1] N. Saito, "Local feature extraction and its application
           using a library of bases". Ph.D. thesis, Department of
           Mathematics, Yale University, 1994.

    Examples
    --------
    >>> import numpy as np
    >>> from pyts.datasets import make_cylinder_bell_funnel
    >>> X, y = make_cylinder_bell_funnel()
    >>> X.shape
    (30, 128)
    >>> print(np.bincount(y))
    [10 10 10]

    """
    if not isinstance(n_samples, (int, np.integer)):
        raise TypeError("'n_samples' must be an integer.")
    if not n_samples > 0:
        raise ValueError("'n_samples' must be a positive integer.")
    if weights is None:
        weights = [1.0 / 3] * 3
        weights[-1] = 1.0 - sum(weights[:-1])
    else:
        weights = check_array(weights, ensure_2d=False)
        if len(weights) not in [2, 3]:
            raise ValueError("'weights' must be None or a list with 2 or "
                             "3 elements (got {}).".format(len(weights)))
        if len(weights) == 2:
            if sum(weights) > 1:
                raise ValueError(
                    "'sum(weights)' cannot be larger than 1 if "
                    "len(weights) == 2 (got {})".format(sum(weights)))
            weights = np.append(weights, 1.0 - sum(weights))
    rng = check_random_state(random_state)

    n_samples_per_class = [int(n_samples * weights[k]) for k in range(3)]
    for i in range(n_samples - sum(n_samples_per_class)):
        n_samples_per_class[i] += 1
    n_samples = sum(n_samples_per_class)

    y = ([0] * n_samples_per_class[0] +
         [1] * n_samples_per_class[1] +
         [2] * n_samples_per_class[2])
    y = np.asarray(y)

    a = rng.randint(16, 33, size=(n_samples, 1))
    b = rng.randint(32, 97, size=(n_samples, 1)) + a
    eta = rng.randn(n_samples, 1)
    epsilon = rng.randn(n_samples, 128)

    arange = np.tile(np.arange(1, 129), n_samples).reshape(n_samples, 128)
    additional_term = (6 + eta) * \
        np.logical_and(arange >= np.repeat(a, 128, axis=1),
                       arange <= np.repeat(b, 128, axis=1))
    i, j = n_samples_per_class[0], sum(n_samples_per_class[0:2])
    additional_term[i:j] *= (np.arange(1, 129) - a[i:j]) / (b[i:j] - a[i:j])
    additional_term[j:] *= (b[j:] - np.arange(1, 129)) / (b[j:] - a[j:])

    X = epsilon + additional_term

    if shuffle:
        X, y = utils_shuffle(X, y, random_state=rng)

    if return_params:
        params = {'a': a, 'b': b, 'eta': eta, 'epsilon': epsilon}
        return X, y, params
    else:
        return X, y
Source code for pyts.datasets.make

Navigation

Related Topics