Source code for pyts.datasets.make

"""Functions to make datasets."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import numpy as np
from sklearn.utils import check_array, check_random_state
from sklearn.utils import shuffle as utils_shuffle


[docs]def make_cylinder_bell_funnel( n_samples=30, weights=None, shuffle=True, random_state=None, return_params=False ): r"""Make a Cylinder-Bell-Funnel dataset. The classes are coded with the following meaning: - 0: Cylinder - 1: Bell - 2: Funnel Parameters ---------- n_samples : int (default = 30) The number of time series. weights : list of floats or None (default=None) The proportions of samples assigned to each class. If None, then classes are balanced. Note that if ``len(weights) == 2``, then the last class weight is automatically inferred. More than ``n_samples`` samples may be returned if the sum of ``weights`` exceeds 1. shuffle : bool (default = True) If True, shuffle the samples. return_params : bool (default = False) If True, a dictionary containing the parameters used to make the time series is returned. random_state : int, RandomState instance or None (default = None) Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. Returns ------- X : array, shape = (n_samples, 128) Generated time series. y : array, shape = (n_samples,) Generated class labels. Labels have the following meaning: - 0: Cylinder - 1: Bell - 2: Funnel params : dict The parameters used to generate the data. The keys are 'a', 'b', 'eta' and 'epsilon' and the values are the corresponding values. Only returned if ``return_params=True``. Notes ----- The time series are generated from the following distributions: .. math:: c(t) = (6 + \eta) \cdot 1_{[a, b]}(t) + \epsilon(t) b(t) = (6 + \eta) \cdot 1_{[a, b]}(t) \cdot (t - a) / (b - a) + \epsilon(t) f(t) = (6 + \eta) \cdot 1_{[a, b]}(t) \cdot (b - t) / (b - a) + \epsilon(t) where :math:`t=1,\ldots,128`, :math:`a` is an integer-valued uniform random variable on the inverval :math:`[16, 32]`, :math:`b-a` is an integer-valued uniform distribution on the inveral :math:`[32, 96]`, :math:`\eta` and :math:`\epsilon(t)` are standard normal variables, :math:`{1}_{[a, b]}` is the characteristic function on the interval :math:`[a, b]`. :math:`c`, :math:`b`, and :math:`f` stand for "cylinder", "bell", and "funnel" respectively. References ---------- .. [1] N. Saito, "Local feature extraction and its application using a library of bases". Ph.D. thesis, Department of Mathematics, Yale University, 1994. Examples -------- >>> import numpy as np >>> from pyts.datasets import make_cylinder_bell_funnel >>> X, y = make_cylinder_bell_funnel() >>> X.shape (30, 128) >>> print(np.bincount(y)) [10 10 10] """ if not isinstance(n_samples, (int, np.integer)): raise TypeError("'n_samples' must be an integer.") if not n_samples > 0: raise ValueError("'n_samples' must be a positive integer.") if weights is None: weights = [1.0 / 3] * 3 weights[-1] = 1.0 - sum(weights[:-1]) else: weights = check_array(weights, ensure_2d=False) if len(weights) not in [2, 3]: raise ValueError("'weights' must be None or a list with 2 or " "3 elements (got {}).".format(len(weights))) if len(weights) == 2: if sum(weights) > 1: raise ValueError( "'sum(weights)' cannot be larger than 1 if " "len(weights) == 2 (got {})".format(sum(weights))) weights = np.append(weights, 1.0 - sum(weights)) rng = check_random_state(random_state) n_samples_per_class = [int(n_samples * weights[k]) for k in range(3)] for i in range(n_samples - sum(n_samples_per_class)): n_samples_per_class[i] += 1 n_samples = sum(n_samples_per_class) y = ([0] * n_samples_per_class[0] + [1] * n_samples_per_class[1] + [2] * n_samples_per_class[2]) y = np.asarray(y) a = rng.randint(16, 33, size=(n_samples, 1)) b = rng.randint(32, 97, size=(n_samples, 1)) + a eta = rng.randn(n_samples, 1) epsilon = rng.randn(n_samples, 128) arange = np.tile(np.arange(1, 129), n_samples).reshape(n_samples, 128) additional_term = (6 + eta) * \ np.logical_and(arange >= np.repeat(a, 128, axis=1), arange <= np.repeat(b, 128, axis=1)) i, j = n_samples_per_class[0], sum(n_samples_per_class[0:2]) additional_term[i:j] *= (np.arange(1, 129) - a[i:j]) / (b[i:j] - a[i:j]) additional_term[j:] *= (b[j:] - np.arange(1, 129)) / (b[j:] - a[j:]) X = epsilon + additional_term if shuffle: X, y = utils_shuffle(X, y, random_state=rng) if return_params: params = {'a': a, 'b': b, 'eta': eta, 'epsilon': epsilon} return X, y, params else: return X, y