# Source code for pyts.transformation.bag_of_patterns

"""Code for Bag-of-patterns representation for time series."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.validation import check_array, check_is_fitted
from ..bag_of_words import BagOfWords
from ..base import UnivariateTransformerMixin


class BagOfPatterns(BaseEstimator, UnivariateTransformerMixin):
    """Bag-of-patterns representation for time series.

    This algorithm uses a sliding window to extract subsequences from the
    time series and transforms each subsequence into a word using the
    Piecewise Aggregate Approximation and the Symbolic Aggregate approXimation
    algorithms. Thus it transforms each time series into a bag of words.
    Then it derives the frequencies of each word for each time series.

    Parameters
    ----------
    window_size : int or float (default = 0.5)
        Length of the sliding window. If float, it represents
        a percentage of the size of each time series and must be
        between 0 and 1.

    word_size : int or float (default = 0.5)
        Length of the words. If float, it represents
        a percentage of the length of the sliding window and must be
        between 0. and 1.

    n_bins : int (default = 4)
        The number of bins to produce. It must be between 2 and
        ``min(window_size, 26)``.

    strategy : 'uniform', 'quantile' or 'normal' (default = 'normal')
        Strategy used to define the widths of the bins:

        - 'uniform': All bins in each sample have identical widths
        - 'quantile': All bins in each sample have the same number of points
        - 'normal': Bin edges are quantiles from a standard normal distribution

    numerosity_reduction : bool (default = True)
        If True, delete sample-wise all but one occurrence of back to back
        identical occurrences of the same words.

    window_step : int or float (default = 1)
        Step of the sliding window. If float, it represents the percentage of
        the size of each time series and must be between 0 and 1. The step of
        sliding window will be computed as
        ``ceil(window_step * n_timestamps)``.

    norm_mean : bool (default = True)
        If True, center each subseries before scaling.

    norm_std : bool (default = True)
        If True, scale each subseries to unit variance.

    sparse : bool (default = True)
        Return a sparse matrix if True, else return an array.

    overlapping : bool (default = True)
        If True, time points may belong to two bins when decreasing the size
        of the subsequence with the Piecewise Aggregate Approximation
        algorithm. If False, each time point belongs to one single bin, but
        the size of the bins may vary.

    alphabet : None or array-like, shape = (n_bins,)
        Alphabet to use. If None, the first `n_bins` letters of the Latin
        alphabet are used.

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of feature indices to terms.

    References
    ----------
    .. [1] J. Lin, R. Khade and Y. Li, "Rotation-invariant similarity in time
           series using bag-of-patterns representation". Journal of Intelligent
           Information Systems, 39 (2), 287-315 (2012).

    Examples
    --------
    >>> import numpy as np
    >>> from pyts.transformation import BagOfPatterns
    >>> X = np.arange(12).reshape(2, 6)
    >>> bop = BagOfPatterns(window_size=4, word_size=4, sparse=False)
    >>> bop.fit_transform(X)
    array(...)
    >>> bop.set_params(numerosity_reduction=False)
    BagOfPatterns(...)
    >>> bop.fit_transform(X)
    array(...)

    """

    def __init__(self, window_size=0.5, word_size=0.5, n_bins=4,
                 strategy='normal', numerosity_reduction=True, window_step=1,
                 norm_mean=True, norm_std=True, sparse=True, overlapping=True,
                 alphabet=None):
        self.window_size = window_size
        self.word_size = word_size
        self.n_bins = n_bins
        self.strategy = strategy
        self.numerosity_reduction = numerosity_reduction
        self.window_step = window_step
        self.norm_mean = norm_mean
        self.norm_std = norm_std
        self.sparse = sparse
        self.overlapping = overlapping
        self.alphabet = alphabet

    def _bag_of_words(self, X):
        """Transform each time series into a bag of words.

        A fresh ``BagOfWords`` is built from the current hyper-parameters on
        every call so that changes made through ``set_params`` are honoured.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Input data.

        Returns
        -------
        X_bow
            Output of ``BagOfWords.transform(X)``.

        """
        bow = BagOfWords(
            window_size=self.window_size, word_size=self.word_size,
            n_bins=self.n_bins, strategy=self.strategy,
            numerosity_reduction=self.numerosity_reduction,
            window_step=self.window_step, norm_mean=self.norm_mean,
            norm_std=self.norm_std, overlapping=self.overlapping,
            alphabet=self.alphabet
        )
        return bow.transform(X)

    def _store_vectorizer(self, vectorizer):
        """Keep the fitted vectorizer and expose its inverted vocabulary."""
        # CountVectorizer maps term -> feature index; ``vocabulary_`` is
        # documented on this class as feature index -> term, hence the swap.
        self.vocabulary_ = {index: term for term, index in
                            vectorizer.vocabulary_.items()}
        self._vectorizer = vectorizer

    def _format_output(self, X_bop):
        """Return the word counts as a sparse matrix or a dense array."""
        if not self.sparse:
            # ``toarray()`` instead of the ``.A`` shorthand, which SciPy
            # marks as deprecated.
            return X_bop.toarray()
        return csr_matrix(X_bop)

    def fit(self, X, y=None):
        """Learn the dictionary.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Input data

        y
            Ignored

        Returns
        -------
        self : object

        """
        # Transform each time series into a bag of words
        X_bow = self._bag_of_words(X)

        # Learn the vocabulary
        # NOTE(review): CountVectorizer is used with its default settings
        # (tokenisation, lower-casing); confirm these suit very short words
        # and custom alphabets.
        vectorizer = CountVectorizer()
        vectorizer.fit(X_bow)
        self._store_vectorizer(vectorizer)
        return self

    def transform(self, X):
        """Derive word frequencies for each time series.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : sparse matrix if sparse=True else array, \
shape = (n_samples, n_words)
            Word frequencies.

        """
        X = check_array(X, dtype='float64')
        check_is_fitted(self, 'vocabulary_')

        # Transform each time series into a bag of words
        X_bow = self._bag_of_words(X)

        # Derive frequencies for each word in the vocabulary
        X_bop = self._vectorizer.transform(X_bow)
        return self._format_output(X_bop)

    def fit_transform(self, X, y=None):
        """Learn the dictionary and derive word frequencies in one pass.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        y
            Ignored

        Returns
        -------
        X_new : sparse matrix if sparse=True else array, \
shape = (n_samples, n_words)
            Word frequencies.

        """
        # Transform each time series into a bag of words
        X_bow = self._bag_of_words(X)

        # Derive frequencies of each word
        vectorizer = CountVectorizer()
        X_bop = vectorizer.fit_transform(X_bow)
        self._store_vectorizer(vectorizer)
        return self._format_output(X_bop)
# Source code for pyts.transformation.bag_of_patterns

# Navigation

# Related Topics