"""Code for transformers."""
# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause
from sklearn.base import BaseEstimator
from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer
from sklearn.preprocessing import (QuantileTransformer as
SklearnQuantileTransformer)
from sklearn.utils.validation import check_array
from ..base import UnivariateTransformerMixin
[docs]class PowerTransformer(BaseEstimator, UnivariateTransformerMixin):
"""Apply a power transform sample-wise to make data more Gaussian-like.
Power transforms are a family of parametric, monotonic transformations
that are applied to make data more Gaussian-like. This is useful for
modeling issues related to heteroscedasticity (non-constant variance),
or other situations where normality is desired.
Currently, PowerTransformer supports the Box-Cox transform and the
Yeo-Johnson transform. The optimal parameter for stabilizing variance and
minimizing skewness is estimated through maximum likelihood.
Box-Cox requires input data to be strictly positive, while Yeo-Johnson
supports both positive or negative data.
By default, zero-mean, unit-variance normalization is applied to the
transformed data.
Parameters
----------
method : 'yeo-johnson' or 'box-cox' (default = 'yeo-johnson')
The power transform method. Available methods are:
- 'yeo-johnson' [1]_, works with positive and negative values
- 'box-cox' [2]_, only works with strictly positive values
standardize : boolean (default = True)
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Notes
-----
NaNs are treated as missing values: disregarded in ``fit``, and maintained
in ``transform``.
References
----------
.. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
improve normality or symmetry." Biometrika, 87(4), pp.954-959,
(2000).
.. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
of the Royal Statistical Society B, 26, 211-252 (1964).
Examples
--------
>>> import numpy as np
>>> from pyts.preprocessing import PowerTransformer
>>> X = [[1, 3, 4], [2, 2, 5]]
>>> pt = PowerTransformer()
>>> print(pt.transform(X))
[[-1.316... 0.209... 1.106...]
[-0.707... -0.707... 1.414...]]
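
    The Box-Cox transform requires strictly positive input. A minimal
    sketch reusing the toy data ``X`` above (output omitted):

    >>> pt_boxcox = PowerTransformer(method='box-cox')
    >>> X_boxcox = pt_boxcox.transform(X)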
"""
    def __init__(self, method='yeo-johnson', standardize=True):
        self.method = method
        self.standardize = standardize
    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        Returns
        -------
        self : object

        """
        return self
    def transform(self, X):
        """Transform the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Transformed data.

        """
        X = check_array(X, dtype='float64', force_all_finite='allow-nan')
        transformer = SklearnPowerTransformer(
            method=self.method, standardize=self.standardize
        )
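        # scikit-learn transformers are feature-wise (they act on columns),
        # so the array is transposed before and after the call to apply the
        # transformation independently to each sample (each time series).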
        X_new = transformer.fit_transform(X.T).T
        return X_new


class QuantileTransformer(BaseEstimator, UnivariateTransformerMixin):
"""Transform samples using quantiles information.
This method transforms the samples to follow a uniform or a normal
distribution. Therefore, for a given sample, this transformation tends
to spread out the most frequent values. It also reduces the impact of
(marginal) outliers: this is therefore a robust preprocessing scheme.
The transformation is applied on each sample independently.
The cumulative distribution function of a feature is used to project the
original values. Note that this transform is non-linear.
Parameters
----------
n_quantiles : int, optional (default = 1000)
Number of quantiles to be computed. It corresponds to the number
of landmarks used to discretize the cumulative distribution function.
output_distribution : 'uniform' or 'normal' (default = 'uniform')
Marginal distribution for the transformed data. The choices are
'uniform' (default) or 'normal'.
subsample : int, optional (default = 1e5)
Maximum number of timestamps used to estimate the quantiles for
computational efficiency.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random. Note that this is used by subsampling and smoothing
noise.
Examples
--------
>>> from pyts.datasets import load_gunpoint
>>> from pyts.preprocessing import QuantileTransformer
>>> X, _, _, _ = load_gunpoint(return_X_y=True)
>>> qt = QuantileTransformer(n_quantiles=10)
>>> qt.transform(X)
array([...])
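
    A normal marginal distribution can be requested instead. A minimal
    sketch reusing the data loaded above (output omitted):

    >>> qt_normal = QuantileTransformer(n_quantiles=10,
    ...                                 output_distribution='normal')
    >>> X_new = qt_normal.transform(X)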
"""
    def __init__(self, n_quantiles=1000, output_distribution='uniform',
                 subsample=int(1e5), random_state=None):
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.subsample = subsample
        self.random_state = random_state
    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        Returns
        -------
        self : object

        """
        return self
    def transform(self, X):
        """Transform the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Transformed data.

        """
        X = check_array(X, dtype='float64')
        transformer = SklearnQuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            subsample=self.subsample,
            random_state=self.random_state
        )
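        # As in PowerTransformer.transform, transpose so that the
        # feature-wise scikit-learn transformer operates on each sample.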
        X_new = transformer.fit_transform(X.T).T
        return X_new
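

# A minimal usage sketch (not part of the pyts API): it assumes NumPy is
# installed and shows both transformers applied sample-wise to random data.
# The strictly positive input is required by the Box-Cox method.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.RandomState(42)
    X_demo = np.abs(rng.randn(5, 50)) + 1e-3  # strictly positive values

    pt = PowerTransformer(method='box-cox')
    print(pt.transform(X_demo).shape)  # (5, 50)

    qt = QuantileTransformer(n_quantiles=10)
    print(qt.transform(X_demo).shape)  # (5, 50)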