Commit 941ca696 authored by Andrew Quinn

Update PCA to object-oriented form, fix normalisation issue and add sanity check

parent 353f1495
Pipeline #25369 passed with stage in 1 minute and 57 seconds
@@ -334,64 +334,83 @@ def detect_artefacts(X, reject_dim='channels', reject_metric='std',
return out
def pca(X, npcs=10):
    """
    Compute a Principal Components Analysis on a given dataset using the SVD
    method.

    Parameters
    ----------
    X : ndarray
        Input data of shape [nsamples x nfeatures]. The second dimension is
        reduced by the PCA.

    Returns
    -------
    scores : ndarray [nsamples x npcs]
        The dimensionality-reduced data. This contains the value of each
        observation in the new PCA-space.
    weights : ndarray [nsamples x npcs]
        The dimensionality-reduced data. This contains the value of each
        observation in the new PCA-space.
    components : ndarray [npcs x nfeatures]
        The eigenvectors describing how each feature (node or connection) loads
        onto the latent PCA dimensions.
    loadings : ndarray [npcs x nfeatures]
        The components scaled by their contribution to the variance in the
        original data.
    explained_variance_ratio_ : ndarray [npcs]
        The proportion of variance explained by each PC.
    """
    U, s, V = np.linalg.svd(X - X.mean())

    # Variance explained metrics
    var = s ** 2 / X.shape[0]
    explained_variance = var[:npcs]
    explained_variance_ratio = explained_variance / var.sum()

    # The weights for each original variable in each principal component
    components = V[:npcs, :]  # Eigenvectors
    loadings = components * s[:npcs, None]  # Scaled by variance contributed to data

    # The new co-ordinates of each observation in PCA-space
    weights = s[:npcs] * U[:, :npcs]

    # Following the method in sklearn.decomposition.PCA.fit_transform,
    # the application of dimensionality reduction is done here
    U = U[:, :npcs]
    scores = U * s[:npcs]

    # Weights and scores should be identical
    assert(np.all(weights == scores))

    # scores - [nsamples x npcs]
    # weights - [nsamples x npcs]
    # components - [npcs x nfeatures] matrix of eigenvectors
    # explained_variance_ratio_ - [npcs]
    return scores, weights, components, loadings, explained_variance_ratio


class PCA:
    """The class structure here could be seriously cleaned up and merged into
    main sails. Longer term it would be good to use this for dimensionality
    reduction of sensor data, and for projecting A and associated metrics back
    into full space. Currently there is lots of redundancy, and the
    comments/links are for my own education whilst figuring this out.

    Can also be the basis for some of the methods in
    https://doi.org/10.1016/j.jneumeth.2016.12.016 if interesting...

    I'll update and merge into main sails.
    """
    def __init__(self, data, npcs):
        self.data_mean = data.mean(axis=0)
        self._pca_svd(data, npcs)

    def _pca_svd(self, data, npcs=10):
"""
Compute a Principal Components Analysis on a given dataset using the SVD
method.
Parameters
----------
X : ndarray
Input data of shape [nsamples x nfeatures]. The second dimension is
reduced by the PCA.
Returns
-------
scores : ndarray [nsamples x npcs]
The dimensionality-reduced data. This contains the value of each
observation in the new PCA-space.
components : ndarray [npcs x nfeatures]
The eigenvectors describing how each feature (node or connection) loads
onto the latent PCA dimensions.
loadings : ndarray [npcs x nfeatures]
The components scaled by their contribution to the variance in the
original data
explained_variance_ratio_ : ndarray [npcs]
The proportion of variance explained by each PC
"""
        self.npcs = npcs

        # Demean observations (store the feature-wise mean for later projections)
        self.data_mean = data.mean(axis=0)[None, :]

        # Compute the economy-sized SVD of the demeaned data; np.linalg.svd
        # returns the transposed V, with singular values in descending order
        self._U, self._s, self._VT = np.linalg.svd(data - self.data_mean, full_matrices=False)

        # Variance explained metrics
        var = self._s ** 2 / (data.shape[0] - 1)
        self.explained_variance_ = var[:npcs]
        self.explained_variance_ratio_ = var[:npcs] / var.sum()

        # The weights for each original variable in each principal component
        self.components = self._VT[:npcs, :]  # Eigenvectors
        self.loadings = self.components * self._s[:npcs, None]  # Scaled by variance contributed to data

        # The new co-ordinates of each observation in PCA-space
        self.scores = self._s[:npcs] * self._U[:, :npcs]
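        # Note: since data - data_mean = U @ diag(s) @ VT and the rows of VT
        # are orthonormal, the scores are equivalent to projecting the
        # demeaned data onto the retained components:
        #   self.scores = np.dot(data - self.data_mean, self.components.T)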
        # Check that the component scores are properly decorrelated
        try:
            C = np.corrcoef(self.scores.T)
            assert(np.sum(np.abs(C - np.diag(np.diag(C)))) < 1e-10)
        except AssertionError:
            print('Observations are not properly de-correlated by PCA. Data demeaning might have gone wrong.')
            raise
    def project_score(self, scores):
        """
        Project a set of scores back into the original data space.
        """
        return self.data_mean + np.dot(scores, self.components)
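A minimal usage sketch (hypothetical, not part of this commit; it assumes numpy is imported as np and that the PCA class above is in scope, with toy random data for illustration):

    # Hypothetical example - toy data, illustrative shapes only
    import numpy as np

    X = np.random.randn(200, 32)        # [nsamples x nfeatures]
    p = PCA(X, npcs=5)

    print(p.scores.shape)               # (200, 5) - observations in PCA-space
    print(p.components.shape)           # (5, 32)  - eigenvectors
    print(p.explained_variance_ratio_)  # proportion of variance per PC

    # Project the reduced scores back to data space; with npcs < nfeatures
    # this is a rank-npcs approximation of the original data
    X_hat = p.project_score(p.scores)
    assert X_hat.shape == X.shape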