40 changes: 40 additions & 0 deletions bluemath_tk/core/data/sample_data.py
@@ -0,0 +1,40 @@
import numpy as np
import xarray as xr


def get_2d_dataset():
# Define the coordinates
coord1 = np.linspace(-100, 100, 20)
coord2 = np.linspace(-100, 100, 20)
coord3 = np.arange(1, 50)

# Create a meshgrid
coord1, coord2, coord3 = np.meshgrid(coord1, coord2, coord3, indexing="ij")

# Build a 3D field as a sum of harmonics
X = (
np.sin(np.radians(coord1)) * np.cos(np.radians(coord2)) * np.sin(coord3)
+ np.sin(2 * np.radians(coord1))
* np.cos(2 * np.radians(coord2))
* np.sin(2 * coord3)
+ np.sin(3 * np.radians(coord1))
* np.cos(3 * np.radians(coord2))
* np.sin(3 * coord3)
)
# Derive a second variable as a nonlinear function of X
Y = -np.sin(X)

# Create an xarray dataset
ds = xr.Dataset(
{
"X": (["coord1", "coord2", "coord3"], X),
"Y": (["coord1", "coord2", "coord3"], Y),
},
coords={
"coord1": coord1[:, 0, 0],
"coord2": coord2[0, :, 0],
"coord3": coord3[0, 0, :],
},
)

return ds
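
A quick sanity check of the new helper, as a minimal sketch (the expected sizes follow directly from the linspace/arange calls above; the printed values are assumptions, not output from the PR):

    from bluemath_tk.core.data.sample_data import get_2d_dataset

    ds = get_2d_dataset()
    print(ds.sizes)            # expected: coord1: 20, coord2: 20, coord3: 49
    print(list(ds.data_vars))  # expected: ['X', 'Y']
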
118 changes: 92 additions & 26 deletions bluemath_tk/datamining/pca.py
@@ -51,27 +51,59 @@ class PCA(BaseReduction):
The value to replace NaNs in the dataset.
num_cols_for_vars : int
The number of columns for variables.
eofs : xr.Dataset
The Empirical Orthogonal Functions (EOFs).
explained_variance_ratio : np.ndarray
The explained variance ratio.
cumulative_explained_variance_ratio : np.ndarray
The cumulative explained variance ratio.

Methods
-------
fit(
data: xr.Dataset,
vars_to_stack: List[str],
coords_to_stack: List[str],
pca_dim_for_rows: str,
window_in_pca_dim_for_rows: List[int] = [0],
value_to_replace_nans: float = None,
) -> None
transform(data: xr.Dataset) -> xr.Dataset
fit_transform(
data: xr.Dataset,
vars_to_stack: List[str],
coords_to_stack: List[str],
pca_dim_for_rows: str,
window_in_pca_dim_for_rows: List[int] = [0],
value_to_replace_nans: float = None,
) -> xr.Dataset
inverse_transform(PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset
_generate_stacked_data
Generate stacked data matrix.
_preprocess_data
Preprocess data for PCA.
_reshape_EOFs
Reshape EOFs to the original data shape.
_reshape_data
Reshape data to the original data shape.
fit
Fit PCA model to data.
transform
Transform data using the fitted PCA model.
fit_transform
Fit and transform data using PCA model.
inverse_transform
Inverse transform data using the fitted PCA model.

Examples
--------
>>> from bluemath_tk.core.data.sample_data import get_2d_dataset
>>> from bluemath_tk.datamining.pca import PCA
>>> ds = get_2d_dataset()
>>> pca = PCA(n_components=5)
>>> pca.fit(
... data=ds,
... vars_to_stack=["X", "Y"],
... coords_to_stack=["coord1", "coord2"],
... pca_dim_for_rows="coord3",
... )
>>> pcs = pca.transform(
... data=ds,
... )
>>> reconstructed_ds = pca.inverse_transform(PCs=pcs)
>>> eofs = pca.eofs
>>> explained_variance_ratio = pca.explained_variance_ratio
>>> cumulative_explained_variance_ratio = pca.cumulative_explained_variance_ratio

References
----------
[1] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

[2] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html

[3] https://www.sciencedirect.com/science/article/abs/pii/S0378383911000676
"""

def __init__(
@@ -97,6 +129,16 @@ def __init__(
If n_components is not an integer when it is greater than or equal to 1.
"""

initial_msg = f"""
-------------------------------------------------------------------
| Initializing PCA reduction model with the following parameters:
| - n_components: {n_components}
| - is_incremental: {is_incremental}
| For more information, please refer to the documentation.
-------------------------------------------------------------------
"""
print(initial_msg)

super().__init__()
self.set_logger_name(name=self.__class__.__name__)
if n_components <= 0:
@@ -144,6 +186,22 @@ def stacked_data_matrix(self) -> np.ndarray:
def standarized_stacked_data_matrix(self) -> np.ndarray:
return self._standarized_stacked_data_matrix

@property
def eofs(self) -> xr.Dataset:
return self._reshape_EOFs(destandarize=True)

@property
def explained_variance_ratio(self) -> np.ndarray:
return self.pca.explained_variance_ratio_

@property
def cumulative_explained_variance_ratio(self) -> np.ndarray:
return (
np.cumsum(self.explained_variance_ratio)
/ np.sum(self.explained_variance_ratio)
* 100.0
)

def _generate_stacked_data(self, data: xr.Dataset) -> np.ndarray:
"""
Generate stacked data matrix.
@@ -378,14 +436,16 @@ def fit(
self.is_fitted = True
self.logger.info("PCA model fitted successfully")

def transform(self, data: xr.Dataset) -> xr.Dataset:
def transform(self, data: xr.Dataset, after_fitting: bool = False) -> xr.Dataset:
"""
Transform data using the fitted PCA model.

Parameters
----------
data : xr.Dataset
The data to transform.
after_fitting : bool, optional
If True, reuse the standardized data matrix already built during fitting instead of preprocessing `data` again. Default is False.

Returns
-------
@@ -395,11 +455,17 @@ def transform(self, data: xr.Dataset) -> xr.Dataset:

if self.is_fitted is False:
raise PCAError("PCA model must be fitted before transforming data")
self.logger.info("Transforming data using PCA model")
processed_data = self._preprocess_data(
data=data[self.vars_to_stack], is_fit=False
)

if not after_fitting:
self.logger.info("Transforming data using PCA model")
processed_data = self._preprocess_data(
data=data[self.vars_to_stack], is_fit=False
)
else:
processed_data = self.standarized_stacked_data_matrix.copy()

transformed_data = self.pca.transform(X=processed_data)

return xr.Dataset(
{
"PCs": ((self.pca_dim_for_rows, "n_component"), transformed_data),
Expand All @@ -408,7 +474,7 @@ def transform(self, data: xr.Dataset) -> xr.Dataset:
self.pca_dim_for_rows: data[self.pca_dim_for_rows],
"n_component": np.arange(self.pca.n_components_),
},
)
).squeeze()  # squeeze drops singleton dims, e.g. the window dimension when unused

def fit_transform(
self,
@@ -451,8 +517,7 @@ def fit_transform(
window_in_pca_dim_for_rows=window_in_pca_dim_for_rows,
value_to_replace_nans=value_to_replace_nans,
)
# TODO: JAVI - Add a flag to use the already processed data??
return self.transform(data=data)
return self.transform(data=data, after_fitting=True)

def inverse_transform(self, PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset:
"""
@@ -471,6 +536,7 @@ def inverse_transform(self, PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset:

if self.is_fitted is False:
raise PCAError("PCA model must be fitted before inverse transforming data")

if isinstance(PCs, xr.Dataset):
X = PCs["PCs"].values
elif isinstance(PCs, np.ndarray):
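
Taken together, the new `after_fitting` flag and the `eofs` / variance-ratio properties shorten the fit-and-project round trip: `fit_transform` now reuses the standardized matrix built during `fit` instead of preprocessing the data twice. A minimal sketch of the intended flow, based on the docstring example in this diff (the inline comments are assumptions about the outputs):

    from bluemath_tk.core.data.sample_data import get_2d_dataset
    from bluemath_tk.datamining.pca import PCA

    ds = get_2d_dataset()
    pca = PCA(n_components=5)

    # Internally calls transform(data=ds, after_fitting=True),
    # so the standardized matrix from fit() is reused
    pcs = pca.fit_transform(
        data=ds,
        vars_to_stack=["X", "Y"],
        coords_to_stack=["coord1", "coord2"],
        pca_dim_for_rows="coord3",
    )

    # New convenience properties
    print(pca.explained_variance_ratio)             # per-component ratios
    print(pca.cumulative_explained_variance_ratio)  # cumulative, in percent
    eofs = pca.eofs                                 # EOFs reshaped to the original grid
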
6 changes: 3 additions & 3 deletions bluemath_tk/interpolation/rbf.py
@@ -120,11 +120,11 @@ class RBF(BaseInterpolation):

Methods
-------
fit(...) :
fit :
Fits the model to the data.
predict(...) :
predict :
Predicts the data for the provided dataset.
fit_predict(...) :
fit_predict :
Fits the model to the subset and predicts the interpolated dataset.

Notes
4 changes: 2 additions & 2 deletions bluemath_tk/wrappers/_base_wrappers.py
@@ -1,7 +1,7 @@
import os
import copy
import itertools
from typing import List
from typing import List, Union
import subprocess
import numpy as np
import xarray as xr
@@ -616,7 +616,7 @@ def join_postprocessed_files(

def postprocess_cases(
self, cases_to_postprocess: List[int] = None
) -> xr.Dataset or List[xr.Dataset]: # type: ignore
) -> Union[xr.Dataset, List[xr.Dataset]]:
"""
Postprocess the model output.

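
Why the `Union` change is more than cosmetic: `xr.Dataset or List[xr.Dataset]` is an ordinary runtime expression that short-circuits to its first truthy operand, so the old annotation silently collapsed to `xr.Dataset` and needed the `# type: ignore`. A small illustration (a sketch, not code from the PR):

    from typing import List, Union

    import xarray as xr

    # `or` collapses to the first truthy operand when the annotation is evaluated:
    print(xr.Dataset or List[xr.Dataset])  # <class 'xarray.core.dataset.Dataset'>

    # Union keeps both alternatives visible to type checkers:
    PostprocessReturn = Union[xr.Dataset, List[xr.Dataset]]
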
9 changes: 9 additions & 0 deletions mkdocs.yml
@@ -28,6 +28,13 @@ plugins:
markdown_extensions:
- admonition
- codehilite
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
- pymdownx.superfences

theme:
name: readthedocs
@@ -36,6 +43,8 @@ theme:
hljs_languages:
- yaml
- rust
features:
- content.code.copy

extra:
code-copy: true
41 changes: 1 addition & 40 deletions tests/datamining/test_pca.py
@@ -1,47 +1,8 @@
import unittest
import numpy as np
import xarray as xr
from bluemath_tk.core.data.sample_data import get_2d_dataset
from bluemath_tk.datamining.pca import PCA


def get_2d_dataset():
# Define the coordinates
coord1 = np.linspace(-100, 100, 20)
coord2 = np.linspace(-100, 100, 20)
coord3 = np.arange(1, 50)

# Create a meshgrid
coord1, coord2, coord3 = np.meshgrid(coord1, coord2, coord3, indexing="ij")

# Create a 3D dataset
X = (
np.sin(np.radians(coord1)) * np.cos(np.radians(coord2)) * np.sin(coord3)
+ np.sin(2 * np.radians(coord1))
* np.cos(2 * np.radians(coord2))
* np.sin(2 * coord3)
+ np.sin(3 * np.radians(coord1))
* np.cos(3 * np.radians(coord2))
* np.sin(3 * coord3)
)
# Create a 3D dataset
Y = -np.sin(X)

# Create an xarray dataset
ds = xr.Dataset(
{
"X": (["coord1", "coord2", "coord3"], X),
"Y": (["coord1", "coord2", "coord3"], Y),
},
coords={
"coord1": coord1[:, 0, 0],
"coord2": coord2[0, :, 0],
"coord3": coord3[0, 0, :],
},
)

return ds


class TestPCA(unittest.TestCase):
def setUp(self):
self.ds = get_2d_dataset()