Source code for pyts.approximation.sax

"""Code for Symbolic Aggregate approXimation."""

# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array
from ..preprocessing import KBinsDiscretizer


class SymbolicAggregateApproximation(BaseEstimator, TransformerMixin):
    """Symbolic Aggregate approXimation.

    Parameters
    ----------
    n_bins : int (default = 4)
        The number of bins to produce. It must be between 2 and
        ``min(n_timestamps, 26)``.

    strategy : 'uniform', 'quantile' or 'normal' (default = 'quantile')
        Strategy used to define the widths of the bins:

        - 'uniform': All bins in each sample have identical widths
        - 'quantile': All bins in each sample have the same number of points
        - 'normal': Bin edges are quantiles from a standard normal distribution

    alphabet : None, 'ordinal' or array-like, shape = (n_bins,) (default = None)
        Alphabet to use. If None, the first `n_bins` letters of the Latin
        alphabet are used. If 'ordinal', integers are used.

    References
    ----------
    .. [1] J. Lin, E. Keogh, L. Wei, and S. Lonardi, "Experiencing SAX: a
           novel symbolic representation of time series". Data Mining and
           Knowledge Discovery, 15(2), 107-144 (2007).

    Examples
    --------
    >>> from pyts.approximation import SymbolicAggregateApproximation
    >>> X = [[0, 4, 2, 1, 7, 6, 3, 5],
    ...      [2, 5, 4, 5, 3, 4, 2, 3]]
    >>> transformer = SymbolicAggregateApproximation()
    >>> print(transformer.transform(X))
    [['a' 'c' 'b' 'a' 'd' 'd' 'b' 'c']
     ['a' 'd' 'c' 'd' 'b' 'c' 'a' 'b']]

    """

    def __init__(self, n_bins=4, strategy='quantile', alphabet=None):
        self.n_bins = n_bins
        self.strategy = strategy
        self.alphabet = alphabet

    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        Returns
        -------
        self : object

        """
        return self

    def transform(self, X):
        """Bin the data with the given alphabet.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : array, shape = (n_samples, n_timestamps)
            Binned data. If ``alphabet='ordinal'``, the output of the
            discretizer is returned unchanged; otherwise it is used to
            index into the alphabet.

        """
        X = check_array(X, dtype='float64')
        n_timestamps = X.shape[1]
        alphabet = self._check_params(n_timestamps)

        discretizer = KBinsDiscretizer(
            n_bins=self.n_bins, strategy=self.strategy)
        indices = discretizer.fit_transform(X)

        # '_check_params' returns the string 'ordinal' as a sentinel meaning
        # "keep the bin indices"; any other return value is an array of symbols.
        if isinstance(alphabet, str):
            return indices
        return alphabet[indices]

    def _check_params(self, n_timestamps):
        """Validate the hyper-parameters and build the alphabet.

        Parameters
        ----------
        n_timestamps : int
            Number of timestamps of the input data, used as an upper bound
            for ``n_bins``.

        Returns
        -------
        alphabet : 'ordinal' or array, shape = (n_bins,)
            The string 'ordinal' if ``alphabet='ordinal'``, otherwise the
            array of symbols to use.

        Raises
        ------
        TypeError
            If ``n_bins`` is not an integer or if ``alphabet`` is neither
            None, 'ordinal' nor a list, tuple or array.

        ValueError
            If ``n_bins`` is out of range, if ``strategy`` is unknown or if
            an array-like ``alphabet`` does not have shape (n_bins,).

        """
        if not isinstance(self.n_bins, (int, np.integer)):
            raise TypeError("'n_bins' must be an integer.")
        if not 2 <= self.n_bins <= min(n_timestamps, 26):
            raise ValueError(
                "'n_bins' must be greater than or equal to 2 and lower than "
                "or equal to min(n_timestamps, 26) (got {0})."
                .format(self.n_bins)
            )
        if self.strategy not in ['uniform', 'quantile', 'normal']:
            raise ValueError("'strategy' must be either 'uniform', 'quantile' "
                             "or 'normal' (got {0})".format(self.strategy))

        # Guard the comparison with isinstance: when 'alphabet' is a NumPy
        # array, ``alphabet == 'ordinal'`` is evaluated elementwise and using
        # the result in a boolean context raises "truth value of an array is
        # ambiguous" instead of accepting the array.
        is_ordinal = (isinstance(self.alphabet, str)
                      and self.alphabet == 'ordinal')
        is_array_like = isinstance(self.alphabet, (list, tuple, np.ndarray))
        if not (self.alphabet is None or is_ordinal or is_array_like):
            raise TypeError("'alphabet' must be None, 'ordinal' or array-like "
                            "with shape (n_bins,) (got {0})"
                            .format(self.alphabet))

        if self.alphabet is None:
            # 97 is ord('a'): use the first 'n_bins' lowercase Latin letters.
            alphabet = np.array([chr(i) for i in range(97, 97 + self.n_bins)])
        elif is_ordinal:
            alphabet = 'ordinal'
        else:
            alphabet = check_array(self.alphabet, ensure_2d=False, dtype=None)
            if alphabet.shape != (self.n_bins,):
                raise ValueError("If 'alphabet' is array-like, its shape "
                                 "must be equal to (n_bins,).")
        return alphabet