Source code for pyts.datasets.load

"""Functions to load datasets."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import os
from .uea import _load_uea_dataset
from .ucr import _load_ucr_dataset


def _load_dataset(name, archive, return_X_y):
    r"""Load and return a dataset shipped in the ``cached_datasets`` folder.

    The dataset is read from the ``cached_datasets/<archive>/`` folder located
    next to this module, using the loader that matches ``archive``.

    Parameters
    ----------
    name : str
        Name of the dataset.

    archive : 'UCR' or 'UEA'
        Archive the dataset belongs to.

    return_X_y : bool
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    Raises
    ------
    ValueError
        If ``archive`` is neither 'UCR' nor 'UEA'.

    """  # noqa: E501
    # Reject unknown archives explicitly instead of silently treating every
    # value other than 'UCR' as 'UEA'.
    if archive == 'UCR':
        loader = _load_ucr_dataset
    elif archive == 'UEA':
        loader = _load_uea_dataset
    else:
        raise ValueError(
            "'archive' must be either 'UCR' or 'UEA' (got {!r}).".format(
                archive)
        )

    module_path = os.path.dirname(__file__)
    # The trailing empty component makes the folder path end with a separator.
    folder = os.path.join(module_path, 'cached_datasets', archive, '')
    bunch = loader(name, folder)

    if return_X_y:
        return (bunch.data_train, bunch.data_test,
                bunch.target_train, bunch.target_test)
    return bunch


def load_basic_motions(return_X_y=False):
    r"""Load and return the Basic Motions dataset.

    The data was generated as part of a student project where four students
    performed four activities whilst wearing a smart watch. The watch collects
    3D accelerometer and 3D gyroscope data. It consists of four classes, which
    are walking, resting, running and badminton. Participants were required to
    record motion a total of five times, and the data is sampled once every
    tenth of a second, for a ten second period.

    ================   ==============
    Training samples               40
    Test samples                   40
    Dimensionality                  6
    Timestamps                    100
    Classes                         4
    ================   ==============

    Parameters
    ----------
    return_X_y : bool (default = False)
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    References
    ----------
    .. [1] `UEA archive entry for the BasicMotions dataset
           <http://www.timeseriesclassification.com/description.php?Dataset=BasicMotions>`_

    Examples
    --------
    >>> from pyts.datasets import load_basic_motions
    >>> bunch = load_basic_motions()
    >>> bunch.data_train.shape
    (40, 6, 100)
    >>> X_train, X_test, y_train, y_test = load_basic_motions(return_X_y=True)
    >>> X_train.shape
    (40, 6, 100)

    """  # noqa: E501
    # BasicMotions is multivariate, so it is read from the UEA archive.
    return _load_dataset('BasicMotions', 'UEA', return_X_y)


def load_coffee(return_X_y=False):
    r"""Load and return the Coffee dataset.

    Food spectrographs are used in chemometrics to classify food types, a task
    that has obvious applications in food safety and quality assurance. The
    coffee data set is a two class problem to distinguish between Robusta and
    Arabica coffee beans.

    ================   ==============
    Training samples               28
    Test samples                   28
    Timestamps                    286
    Classes                         2
    ================   ==============

    Parameters
    ----------
    return_X_y : bool (default = False)
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    References
    ----------
    .. [1] R. Briandet, E.K. Kemsley, and R.H. Wilson, "Discrimination of
           Arabica and Robusta in Instant Coffee by Fourier Transform Infrared
           Spectroscopy and Chemometrics". Journal of Agricultural and Food
           Chemistry (1996).

    .. [2] A. Bagnall, L. Davis, J. Hills and J. Lines, "Transformation Based
           Ensembles for Time Series Classification". SDM (2012).

    .. [3] `UCR archive entry for the Coffee dataset
           <http://www.timeseriesclassification.com/description.php?Dataset=Coffee>`_

    Examples
    --------
    >>> from pyts.datasets import load_coffee
    >>> bunch = load_coffee()
    >>> bunch.data_train.shape
    (28, 286)
    >>> X_train, X_test, y_train, y_test = load_coffee(return_X_y=True)
    >>> X_train.shape
    (28, 286)

    """  # noqa: E501
    return _load_dataset('Coffee', 'UCR', return_X_y)


def load_gunpoint(return_X_y=False):
    r"""Load and return the GunPoint dataset.

    This dataset involves one female actor and one male actor making a motion
    with their hand. The two classes are: Gun-Draw and Point: For Gun-Draw the
    actors have their hands by their sides. They draw a replicate gun from a
    hip-mounted holster, point it at a target for approximately one second,
    then return the gun to the holster, and their hands to their sides. For
    Point the actors have their gun by their sides. They point with their index
    fingers to a target for approximately one second, and then return their
    hands to their sides. For both classes, we tracked the centroid of the
    actor's right hands in both X- and Y-axes, which appear to be highly
    correlated. The data in the archive is just the X-axis.

    ================   ==============
    Training samples               50
    Test samples                  150
    Timestamps                    150
    Classes                         2
    ================   ==============

    Parameters
    ----------
    return_X_y : bool (default = False)
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    References
    ----------
    .. [1] `UCR archive entry for the GunPoint dataset
           <http://www.timeseriesclassification.com/description.php?Dataset=GunPoint>`_

    Examples
    --------
    >>> from pyts.datasets import load_gunpoint
    >>> bunch = load_gunpoint()
    >>> bunch.data_train.shape
    (50, 150)
    >>> X_train, X_test, y_train, y_test = load_gunpoint(return_X_y=True)
    >>> X_train.shape
    (50, 150)

    """  # noqa: E501
    return _load_dataset('GunPoint', 'UCR', return_X_y)


def load_pig_central_venous_pressure(return_X_y=False):
    r"""Load and return the PigCVP (pig central venous pressure) dataset.

    In the test set, a class is represented by four examples, the second and
    third 2000 data points of the before time series and the second and third
    2000 data points of the after time series. Data created by Mathieu
    Guillame-Bert et al. Data edited by Shaghayegh Gharghabi and Eamonn Keogh.

    ================   ==============
    Training samples              104
    Test samples                  208
    Timestamps                   2000
    Classes                        52
    ================   ==============

    Parameters
    ----------
    return_X_y : bool (default = False)
        If True, return
        ``(data_train, data_test, target_train, target_test)`` instead of a
        Bunch object.

    Returns
    -------
    data : Bunch
        Dictionary-like object, with attributes:

        data_train : array of floats
            The time series in the training set.
        data_test : array of floats
            The time series in the test set.
        target_train : array of integers
            The classification labels in the training set.
        target_test : array of integers
            The classification labels in the test set.
        DESCR : str
            The full description of the dataset.
        url : str
            The url of the dataset.

    (data_train, data_test, target_train, target_test) : tuple if ``return_X_y`` is True

    References
    ----------
    .. [1] M. Guillame-Bert and A. Dubrawski, "Classification of Time Sequences
           using Graphs of Temporal Constraints". Journal of Machine Learning
           Research, 2017.

    .. [2] `UCR archive entry for the PigCVP dataset
           <http://www.timeseriesclassification.com/description.php?Dataset=PigCVP>`_

    Examples
    --------
    >>> from pyts.datasets import load_pig_central_venous_pressure
    >>> bunch = load_pig_central_venous_pressure()
    >>> bunch.data_train.shape
    (104, 2000)
    >>> X_train, X_test, y_train, y_test = load_pig_central_venous_pressure(
    ...    return_X_y=True)
    >>> X_train.shape
    (104, 2000)

    """  # noqa: E501
    return _load_dataset('PigCVP', 'UCR', return_X_y)
Source code for pyts.datasets.load

Navigation

Related Topics