"""Code for transformers."""
# Author: Johann Faouzi <johann.faouzi@gmail.com>
# License: BSD-3-Clause
from sklearn.base import BaseEstimator
from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer
from sklearn.preprocessing import (QuantileTransformer as
SklearnQuantileTransformer)
from sklearn.utils.validation import check_array
from ..base import UnivariateTransformerMixin
[docs]class PowerTransformer(BaseEstimator, UnivariateTransformerMixin):
"""Apply a power transform sample-wise to make data more Gaussian-like.
Power transforms are a family of parametric, monotonic transformations
that are applied to make data more Gaussian-like. This is useful for
modeling issues related to heteroscedasticity (non-constant variance),
or other situations where normality is desired.
Currently, PowerTransformer supports the Box-Cox transform and the
Yeo-Johnson transform. The optimal parameter for stabilizing variance and
minimizing skewness is estimated through maximum likelihood.
Box-Cox requires input data to be strictly positive, while Yeo-Johnson
supports both positive or negative data.
By default, zero-mean, unit-variance normalization is applied to the
transformed data.
Parameters
----------
method : 'yeo-johnson' or 'box-cox' (default = 'yeo-johnson')
The power transform method. Available methods are:
- 'yeo-johnson' [1]_, works with positive and negative values
- 'box-cox' [2]_, only works with strictly positive values
standardize : boolean (default = True)
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Notes
-----
NaNs are treated as missing values: disregarded in ``fit``, and maintained
in ``transform``.
References
----------
.. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
improve normality or symmetry." Biometrika, 87(4), pp.954-959,
(2000).
.. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
of the Royal Statistical Society B, 26, 211-252 (1964).
Examples
--------
>>> import numpy as np
>>> from pyts.preprocessing import PowerTransformer
>>> X = [[1, 3, 4], [2, 2, 5]]
>>> pt = PowerTransformer()
>>> print(pt.transform(X))
[[-1.316... 0.209... 1.106...]
[-0.707... -0.707... 1.414...]]
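
    The Box-Cox transform requires strictly positive input. A minimal
    sketch reusing the toy data ``X`` above (output omitted):

    >>> pt_boxcox = PowerTransformer(method='box-cox')
    >>> X_boxcox = pt_boxcox.transform(X)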
"""
    def __init__(self, method='yeo-johnson', standardize=True):
        self.method = method
        self.standardize = standardize
    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        Returns
        -------
        self : object

        """
        return self
    def transform(self, X):
        """Transform the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Transformed data.

        """
        X = check_array(X, dtype='float64', force_all_finite='allow-nan')
        transformer = SklearnPowerTransformer(
            method=self.method, standardize=self.standardize
        )
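        # scikit-learn transformers are feature-wise (they act on columns),
        # so the array is transposed before and after the call to apply the
        # transformation independently to each sample (each time series).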
        X_new = transformer.fit_transform(X.T).T
        return X_new


class QuantileTransformer(BaseEstimator, UnivariateTransformerMixin):
"""Transform samples using quantiles information.
This method transforms the samples to follow a uniform or a normal
distribution. Therefore, for a given sample, this transformation tends
to spread out the most frequent values. It also reduces the impact of
(marginal) outliers: this is therefore a robust preprocessing scheme.
The transformation is applied on each sample independently.
The cumulative distribution function of a feature is used to project the
original values. Note that this transform is non-linear.
Parameters
----------
n_quantiles : int, optional (default = 1000)
Number of quantiles to be computed. It corresponds to the number
of landmarks used to discretize the cumulative distribution function.
output_distribution : 'uniform' or 'normal' (default = 'uniform')
Marginal distribution for the transformed data. The choices are
'uniform' (default) or 'normal'.
subsample : int, optional (default = 1e5)
Maximum number of timestamps used to estimate the quantiles for
computational efficiency.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random. Note that this is used by subsampling and smoothing
noise.
Examples
--------
>>> from pyts.datasets import load_gunpoint
>>> from pyts.preprocessing import QuantileTransformer
>>> X, _, _, _ = load_gunpoint(return_X_y=True)
>>> qt = QuantileTransformer(n_quantiles=10)
>>> qt.transform(X)
array([...])
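
    A normal marginal distribution can be requested instead. A minimal
    sketch reusing the data loaded above (output omitted):

    >>> qt_normal = QuantileTransformer(n_quantiles=10,
    ...                                 output_distribution='normal')
    >>> X_new = qt_normal.transform(X)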
"""
    def __init__(self, n_quantiles=1000, output_distribution='uniform',
                 subsample=int(1e5), random_state=None):
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.subsample = subsample
        self.random_state = random_state
    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        Returns
        -------
        self : object

        """
        return self
    def transform(self, X):
        """Transform the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to transform.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Transformed data.

        """
        X = check_array(X, dtype='float64')
        transformer = SklearnQuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            subsample=self.subsample,
            random_state=self.random_state
        )
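        # As in PowerTransformer.transform, transpose so that the
        # feature-wise scikit-learn transformer operates on each sample.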
        X_new = transformer.fit_transform(X.T).T
        return X_new
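

# A minimal usage sketch (not part of the pyts API): it assumes NumPy is
# installed and shows both transformers applied sample-wise to random data.
# The strictly positive input is required by the Box-Cox method.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.RandomState(42)
    X_demo = np.abs(rng.randn(5, 50)) + 1e-3  # strictly positive values

    pt = PowerTransformer(method='box-cox')
    print(pt.transform(X_demo).shape)  # (5, 50)

    qt = QuantileTransformer(n_quantiles=10)
    print(qt.transform(X_demo).shape)  # (5, 50)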