fix: correct whitening in HilbertCPCCA
nicrie committed Sep 13, 2024
1 parent bcbaa9e commit c84c4a4
Showing 11 changed files with 495 additions and 283 deletions.
12 changes: 12 additions & 0 deletions tests/models/cross/test_hilbert_cpcca.py
@@ -52,6 +52,18 @@ def generate_well_conditioned_data(lazy=False):
    return X, Y


@pytest.mark.parametrize("use_pca", [True, False])
def test_singular_values(use_pca):
"""Test that the singular values of the Hilbert CCA are less than 1."""
X, Y = generate_well_conditioned_data()
cpcca = HilbertCPCCA(n_modes=2, alpha=0.0, use_pca=use_pca, n_pca_modes=2)
cpcca.fit(X, Y, "sample")
s_values = cpcca.data["singular_values"]

# Singular values are the canonical correlations, so they should be less than 1
assert np.all(s_values <= 1)


# Currently, netCDF4 does not support complex numbers, so skip this test
@pytest.mark.parametrize("engine", ["zarr"])
@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0])
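The truncated hunk above appears to parametrize a save/load test over the zarr engine, and its skip note is worth a concrete illustration: the Hilbert variants produce complex-valued results, zarr can round-trip complex dtypes, and the netCDF4 backend cannot. A hedged sketch of that difference follows; the tiny dataset and the store name scores.zarr are made up for illustration.

import numpy as np
import xarray as xr

ds = xr.Dataset({"scores": ("sample", np.array([1 + 2j, 3 - 1j]))})
ds.to_zarr("scores.zarr", mode="w")  # zarr stores complex dtypes natively
roundtrip = xr.open_dataset("scores.zarr", engine="zarr")
assert np.allclose(roundtrip["scores"].values, ds["scores"].values)

# ds.to_netcdf("scores.nc")  # would raise an error: netCDF4 has no complex type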
2 changes: 1 addition & 1 deletion tests/models/single/test_pop.py
@@ -12,7 +12,7 @@ def test_init():
    # Assert preprocessor has been initialized
    assert hasattr(pop, "_params")
    assert hasattr(pop, "preprocessor")
-    assert hasattr(pop, "whitener")
+    assert hasattr(pop, "pca")


def test_fit(mock_data_array):
146 changes: 146 additions & 0 deletions tests/preprocessing/test_pca.py
@@ -0,0 +1,146 @@
import numpy as np
import pytest
import xarray as xr

from xeofs.preprocessing import PCA

from ..utilities import (
    assert_expected_coords,
    assert_expected_dims,
    data_is_dask,
)

# =============================================================================
# GENERALLY VALID TEST CASES
# =============================================================================
N_SAMPLE_DIMS = [1]
N_FEATURE_DIMS = [1]
INDEX_POLICY = ["index"]
NAN_POLICY = ["no_nan"]
DASK_POLICY = ["no_dask", "dask"]
SEED = [0]

VALID_TEST_DATA = [
    (ns, nf, index, nan, dask)
    for ns in N_SAMPLE_DIMS
    for nf in N_FEATURE_DIMS
    for index in INDEX_POLICY
    for nan in NAN_POLICY
    for dask in DASK_POLICY
]


def generate_well_conditioned_data(lazy=False):
    t = np.linspace(0, 50, 200)
    std = 0.1
    X = np.sin(t)[:, None] + np.random.normal(0, std, size=(200, 3))
    X[:, 1] = X[:, 1] ** 3
    X[:, 2] = abs(X[:, 2]) ** (0.5)
    X = xr.DataArray(
        X,
        dims=["sample", "feature"],
        coords={"sample": np.arange(200), "feature": np.arange(3)},
        name="X",
    )
    X = X - X.mean("sample")
    if lazy:
        X = X.chunk({"sample": 5, "feature": -1})
    return X


# =============================================================================
# TESTS
# =============================================================================
@pytest.mark.parametrize("lazy", [False, True])
def test_fit(lazy):
data = generate_well_conditioned_data(lazy)

pca = PCA(n_modes=2)
pca.fit(data)


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_transform(lazy, use_pca):
data = generate_well_conditioned_data(lazy)

pca = PCA(n_modes=2, use_pca=use_pca)
pca.fit(data)

# Transform data
transformed_data = pca.transform(data)
transformed_data2 = pca.transform(data)
assert transformed_data.identical(transformed_data2)

assert isinstance(transformed_data, xr.DataArray)
assert transformed_data.ndim == 2
assert transformed_data.dims == ("sample", "feature")

# Consistent dask behaviour
is_dask_before = data_is_dask(data)
is_dask_after = data_is_dask(transformed_data)
assert is_dask_before == is_dask_after


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_fit_transform(lazy, use_pca):
data = generate_well_conditioned_data(lazy)

pca = PCA(n_modes=2, use_pca=use_pca)

# Transform data
transformed_data = pca.fit_transform(data)
transformed_data2 = pca.transform(data)
assert transformed_data.identical(transformed_data2)

assert isinstance(transformed_data, xr.DataArray)
assert transformed_data.ndim == 2
assert transformed_data.dims == ("sample", "feature")

# Consistent dask behaviour
is_dask_before = data_is_dask(data)
is_dask_after = data_is_dask(transformed_data)
assert is_dask_before == is_dask_after


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_invserse_transform_data(lazy, use_pca):
data = generate_well_conditioned_data(lazy)

pca = PCA(n_modes=2, use_pca=use_pca)
pca.fit(data)

transformed = pca.transform(data)
untransformed = pca.inverse_transform_data(transformed)

is_dask_before = data_is_dask(data)
is_dask_after = data_is_dask(untransformed)

# Unstacked data has dimensions of original data
assert_expected_dims(data, untransformed, policy="all")
# Unstacked data has coordinates of original data
assert_expected_coords(data, untransformed, policy="all")
# inverse transform should not change dask-ness
assert is_dask_before == is_dask_after


@pytest.mark.parametrize("n_modes", [1, 2, 3])
def test_transform_pca_n_modes(n_modes):
data = generate_well_conditioned_data()

pca = PCA(use_pca=True, n_modes=n_modes)
transformed = pca.fit_transform(data)

# PCA reduces dimensionality
assert transformed.shape[1] == n_modes


@pytest.mark.parametrize("use_pca", [True, False])
def test_transform_keep_coordinates(use_pca):
X = generate_well_conditioned_data()

pca = PCA(use_pca=use_pca, n_modes="all")
transformed = pca.fit_transform(X)

assert len(transformed.coords) == len(X.coords)
