Source code for pyts.datasets.load

"""Functions to load datasets."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import os
from .uea import _load_uea_dataset
from .ucr import _load_ucr_dataset


def _load_dataset(name, archive, return_X_y):
    r"""Load and return dataset.

    Parameters
    ----------
    name : str
        Name of the dataset.

    archive : 'UCR' or 'UEA'
        Archive the dataset belongs to.

    return_X_y : bool
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    """  # noqa: E501
    module_path = os.path.dirname(__file__)
    folder = os.path.join(module_path, 'cached_datasets', archive, '')
    if archive == 'UCR':
        bunch = _load_ucr_dataset(name, folder)
    else:
        bunch = _load_uea_dataset(name, folder)
    if return_X_y:
        return (bunch.data_train, bunch.data_test,
                bunch.target_train, bunch.target_test)
    return bunch


[docs]def load_basic_motions(return_X_y=False): r"""Load and return the Basic Motions dataset. The data was generated as part of a student project where four students performed four activities whilst wearing a smart watch. The watch collects 3D accelerometer and a 3D gyroscope It consists of four classes, which are walking, resting, running and badminton. Participants were required to record motion a total of five times, and the data is sampled once every tenth of a second, for a ten second period. ================ ============== Training samples 40 Test samples 40 Dimensionality 6 Timestamps 100 Classes 4 ================ ============== Parameters ---------- return_X_y : bool (default = False) If True, return ``(data_train, data_test, target_train, target_test)`` instead of a Bunch object. Returns ------- data : Bunch Dictionary-like object, with attributes: data_train : array of floats The time series in the training set. data_test : array of floats The time series in the test set. target_train : array of integers The classification labels in the training set. target_test : array of integers The classification labels in the test set. DESCR : str The full description of the dataset. url : str The url of the dataset. (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True References ---------- .. [1] `UCR archive entry for the PigCVP dataset <http://www.timeseriesclassification.com/description.php?Dataset=BasicMotions>`_ Examples -------- >>> from pyts.datasets import load_basic_motions >>> bunch = load_basic_motions() >>> bunch.data_train.shape (40, 6, 100) >>> X_train, X_test, y_train, y_test = load_basic_motions(return_X_y=True) >>> X_train.shape (40, 6, 100) """ # noqa: E501 return _load_dataset('BasicMotions', 'UEA', return_X_y)
[docs]def load_coffee(return_X_y=False): r"""Load and return the Coffee dataset. Food spectrographs are used in chemometrics to classify food types, a task that has obvious applications in food safety and quality assurance. The coffee data set is a two class problem to distinguish between Robusta and Aribica coffee beans. ================ ============== Training samples 28 Test samples 28 Timestamps 286 Classes 2 ================ ============== Parameters ---------- return_X_y : bool (default = False) If True, return ``(data_train, data_test, target_train, target_test)`` instead of a Bunch object. Returns ------- data : Bunch Dictionary-like object, with attributes: data_train : array of floats The time series in the training set. data_test : array of floats The time series in the test set. target_train : array of integers The classification labels in the training set. target_test : array of integers The classification labels in the test set. DESCR : str The full description of the dataset. url : str The url of the dataset. (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True References ---------- .. [1] R. Briandet, E.K. Kemsley, and R.H. Wilson, "Discrimination of Arabica and Robusta in Instant Coffee by Fourier Transform Infrared Spectroscopy and Chemometrics". Journal of Agricultural and Food Chemistry (1996). .. [2] A. Bagnall, L. Davis, J. Hills and J. Lines, "Transformation Based Ensembles for Time Series Classification". SDM (2012). .. [3] `UCR archive entry for the PigCVP dataset <http://www.timeseriesclassification.com/description.php?Dataset=Coffee>`_ Examples -------- >>> from pyts.datasets import load_coffee >>> bunch = load_coffee() >>> bunch.data_train.shape (28, 286) >>> X_train, X_test, y_train, y_test = load_coffee(return_X_y=True) >>> X_train.shape (28, 286) """ # noqa: E501 return _load_dataset('Coffee', 'UCR', return_X_y)
[docs]def load_gunpoint(return_X_y=False): r"""Load and return the GunPoint dataset. This dataset involves one female actor and one male actor making a motion with their hand. The two classes are: Gun-Draw and Point: For Gun-Draw the actors have their hands by their sides. They draw a replicate gun from a hip-mounted holster, point it at a target for approximately one second, then return the gun to the holster, and their hands to their sides. For Point the actors have their gun by their sides. They point with their index fingers to a target for approximately one second, and then return their hands to their sides. For both classes, we tracked the centroid of the actor's right hands in both X- and Y-axes, which appear to be highly correlated. The data in the archive is just the X-axis. ================ ============== Training samples 50 Test samples 150 Timestamps 150 Classes 2 ================ ============== Parameters ---------- return_X_y : bool (default = False) If True, return ``(data_train, data_test, target_train, target_test)`` instead of a Bunch object. Returns ------- data : Bunch Dictionary-like object, with attributes: data_train : array of floats The time series in the training set. data_test : array of floats The time series in the test set. target_train : array of integers The classification labels in the training set. target_test : array of integers The classification labels in the test set. DESCR : str The full description of the dataset. url : str The url of the dataset. (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True References ---------- .. [1] `UCR archive entry for the PigCVP dataset <http://www.timeseriesclassification.com/description.php?Dataset=GunPoint>`_ Examples -------- >>> from pyts.datasets import load_gunpoint >>> bunch = load_gunpoint() >>> bunch.data_train.shape (50, 150) >>> X_train, X_test, y_train, y_test = load_gunpoint(return_X_y=True) >>> X_train.shape (50, 150) """ # noqa: E501 return _load_dataset('GunPoint', 'UCR', return_X_y)
[docs]def load_pig_central_venous_pressure(return_X_y=False): r"""Load and return the PigCVP dataset. In the test set, a class is represented by four examples, the second and third 2000 data points of the before time series and the second and third 2000 data points of the after time series. Data created by Mathieu Guillame-Bert et al. Data edited by Shaghayegh Gharghabi and Eamonn Keogh. ================ ============== Training samples 104 Test samples 208 Timestamps 2000 Classes 52 ================ ============== Parameters ---------- return_X_y : bool (default = False) If True, return ``(data_train, data_test, target_train, target_test)`` instead of a Bunch object. Returns ------- data : Bunch Dictionary-like object, with attributes: data_train : array of floats The time series in the training set. data_test : array of floats The time series in the test set. target_train : array of integers The classification labels in the training set. target_test : array of integers The classification labels in the test set. DESCR : str The full description of the dataset. url : str The url of the dataset. (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True References ---------- .. [1] M. Guillame-Bert and A. Dubrawski, "Classification of Time Sequences using Graphs of Temporal Constraints". Journal of Machine Learning Research, 2017. .. [2] `UCR archive entry for the PigCVP dataset <http://www.timeseriesclassification.com/description.php?Dataset=PigCVP>`_ Examples -------- >>> from pyts.datasets import load_pig_central_venous_pressure >>> bunch = load_pig_central_venous_pressure() >>> bunch.data_train.shape (104, 2000) >>> X_train, X_test, y_train, y_test = load_pig_central_venous_pressure( ... return_X_y=True) >>> X_train.shape (104, 2000) """ # noqa: E501 return _load_dataset('PigCVP', 'UCR', return_X_y)