40 changes: 40 additions & 0 deletions bluemath_tk/core/data/sample_data.py
@@ -0,0 +1,40 @@
import numpy as np
import xarray as xr


def get_2d_dataset():
# Define the coordinates
coord1 = np.linspace(-100, 100, 20)
coord2 = np.linspace(-100, 100, 20)
coord3 = np.arange(1, 50)

# Create a meshgrid
coord1, coord2, coord3 = np.meshgrid(coord1, coord2, coord3, indexing="ij")

# Build a 3D field as a sum of harmonics
X = (
np.sin(np.radians(coord1)) * np.cos(np.radians(coord2)) * np.sin(coord3)
+ np.sin(2 * np.radians(coord1))
* np.cos(2 * np.radians(coord2))
* np.sin(2 * coord3)
+ np.sin(3 * np.radians(coord1))
* np.cos(3 * np.radians(coord2))
* np.sin(3 * coord3)
)
# Derive a second variable as a nonlinear function of X
Y = -np.sin(X)

# Create an xarray dataset
ds = xr.Dataset(
{
"X": (["coord1", "coord2", "coord3"], X),
"Y": (["coord1", "coord2", "coord3"], Y),
},
coords={
"coord1": coord1[:, 0, 0],
"coord2": coord2[0, :, 0],
"coord3": coord3[0, 0, :],
},
)

return ds
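
A quick sanity check of the new helper, as a minimal sketch (the expected sizes follow directly from the linspace/arange calls above; the printed values are assumptions, not output from the PR):

    from bluemath_tk.core.data.sample_data import get_2d_dataset

    ds = get_2d_dataset()
    print(ds.sizes)            # expected: coord1: 20, coord2: 20, coord3: 49
    print(list(ds.data_vars))  # expected: ['X', 'Y']
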
118 changes: 92 additions & 26 deletions bluemath_tk/datamining/pca.py
@@ -51,27 +51,59 @@ class PCA(BaseReduction):
The value to replace NaNs in the dataset.
num_cols_for_vars : int
The number of columns for variables.
eofs : xr.Dataset
The Empirical Orthogonal Functions (EOFs).
explained_variance_ratio : np.ndarray
The explained variance ratio.
cumulative_explained_variance_ratio : np.ndarray
The cumulative explained variance ratio.

Methods
-------
fit(
data: xr.Dataset,
vars_to_stack: List[str],
coords_to_stack: List[str],
pca_dim_for_rows: str,
window_in_pca_dim_for_rows: List[int] = [0],
value_to_replace_nans: float = None,
) -> None
transform(data: xr.Dataset) -> xr.Dataset
fit_transform(
data: xr.Dataset,
vars_to_stack: List[str],
coords_to_stack: List[str],
pca_dim_for_rows: str,
window_in_pca_dim_for_rows: List[int] = [0],
value_to_replace_nans: float = None,
) -> xr.Dataset
inverse_transform(PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset
_generate_stacked_data
Generate stacked data matrix.
_preprocess_data
Preprocess data for PCA.
_reshape_EOFs
Reshape EOFs to the original data shape.
_reshape_data
Reshape data to the original data shape.
fit
Fit PCA model to data.
transform
Transform data using the fitted PCA model.
fit_transform
Fit and transform data using PCA model.
inverse_transform
Inverse transform data using the fitted PCA model.

Examples
--------
>>> from bluemath_tk.core.data.sample_data import get_2d_dataset
>>> from bluemath_tk.datamining.pca import PCA
>>> ds = get_2d_dataset()
>>> pca = PCA(n_components=5)
>>> pca.fit(
... data=ds,
... vars_to_stack=["X", "Y"],
... coords_to_stack=["coord1", "coord2"],
... pca_dim_for_rows="coord3",
... )
>>> pcs = pca.transform(
... data=ds,
... )
>>> reconstructed_ds = pca.inverse_transform(PCs=pcs)
>>> eofs = pca.eofs
>>> explained_variance_ratio = pca.explained_variance_ratio
>>> cumulative_explained_variance_ratio = pca.cumulative_explained_variance_ratio

References
----------
[1] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

[2] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html

[3] https://www.sciencedirect.com/science/article/abs/pii/S0378383911000676
"""

def __init__(
@@ -97,6 +129,16 @@ def __init__(
If n_components is not an integer when it is greater than or equal to 1.
"""

initial_msg = f"""
-------------------------------------------------------------------
| Initializing PCA reduction model with the following parameters:
| - n_components: {n_components}
| - is_incremental: {is_incremental}
| For more information, please refer to the documentation.
-------------------------------------------------------------------
"""
print(initial_msg)

super().__init__()
self.set_logger_name(name=self.__class__.__name__)
if n_components <= 0:
@@ -144,6 +186,22 @@ def stacked_data_matrix(self) -> np.ndarray:
def standarized_stacked_data_matrix(self) -> np.ndarray:
return self._standarized_stacked_data_matrix

@property
def eofs(self) -> xr.Dataset:
return self._reshape_EOFs(destandarize=True)

@property
def explained_variance_ratio(self) -> np.ndarray:
return self.pca.explained_variance_ratio_

@property
def cumulative_explained_variance_ratio(self) -> np.ndarray:
return (
np.cumsum(self.explained_variance_ratio)
/ np.sum(self.explained_variance_ratio)
* 100.0
)

def _generate_stacked_data(self, data: xr.Dataset) -> np.ndarray:
"""
Generate stacked data matrix.
@@ -378,14 +436,16 @@ def fit(
self.is_fitted = True
self.logger.info("PCA model fitted successfully")

def transform(self, data: xr.Dataset) -> xr.Dataset:
def transform(self, data: xr.Dataset, after_fitting: bool = False) -> xr.Dataset:
"""
Transform data using the fitted PCA model.

Parameters
----------
data : xr.Dataset
The data to transform.
after_fitting : bool, optional
If True, reuse the standardized data matrix already built during fitting instead of preprocessing `data` again. Default is False.

Returns
-------
@@ -395,11 +455,17 @@ def transform(self, data: xr.Dataset) -> xr.Dataset:

if self.is_fitted is False:
raise PCAError("PCA model must be fitted before transforming data")
self.logger.info("Transforming data using PCA model")
processed_data = self._preprocess_data(
data=data[self.vars_to_stack], is_fit=False
)

if not after_fitting:
self.logger.info("Transforming data using PCA model")
processed_data = self._preprocess_data(
data=data[self.vars_to_stack], is_fit=False
)
else:
processed_data = self.standarized_stacked_data_matrix.copy()

transformed_data = self.pca.transform(X=processed_data)

return xr.Dataset(
{
"PCs": ((self.pca_dim_for_rows, "n_component"), transformed_data),
Expand All @@ -408,7 +474,7 @@ def transform(self, data: xr.Dataset) -> xr.Dataset:
self.pca_dim_for_rows: data[self.pca_dim_for_rows],
"n_component": np.arange(self.pca.n_components_),
},
)
).squeeze()  # squeeze drops singleton dims, e.g. the window dimension when unused

def fit_transform(
self,
@@ -451,8 +517,7 @@ def fit_transform(
window_in_pca_dim_for_rows=window_in_pca_dim_for_rows,
value_to_replace_nans=value_to_replace_nans,
)
# TODO: JAVI - Add a flag to use the already processed data??
return self.transform(data=data)
return self.transform(data=data, after_fitting=True)

def inverse_transform(self, PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset:
"""
@@ -471,6 +536,7 @@ def inverse_transform(self, PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset:

if self.is_fitted is False:
raise PCAError("PCA model must be fitted before inverse transforming data")

if isinstance(PCs, xr.Dataset):
X = PCs["PCs"].values
elif isinstance(PCs, np.ndarray):
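
Taken together, the new `after_fitting` flag and the `eofs` / variance-ratio properties shorten the fit-and-project round trip: `fit_transform` now reuses the standardized matrix built during `fit` instead of preprocessing the data twice. A minimal sketch of the intended flow, based on the docstring example in this diff (the inline comments are assumptions about the outputs):

    from bluemath_tk.core.data.sample_data import get_2d_dataset
    from bluemath_tk.datamining.pca import PCA

    ds = get_2d_dataset()
    pca = PCA(n_components=5)

    # Internally calls transform(data=ds, after_fitting=True),
    # so the standardized matrix from fit() is reused
    pcs = pca.fit_transform(
        data=ds,
        vars_to_stack=["X", "Y"],
        coords_to_stack=["coord1", "coord2"],
        pca_dim_for_rows="coord3",
    )

    # New convenience properties
    print(pca.explained_variance_ratio)             # per-component ratios
    print(pca.cumulative_explained_variance_ratio)  # cumulative, in percent
    eofs = pca.eofs                                 # EOFs reshaped to the original grid
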
6 changes: 3 additions & 3 deletions bluemath_tk/interpolation/rbf.py
@@ -120,11 +120,11 @@ class RBF(BaseInterpolation):

Methods
-------
fit(...) :
fit :
Fits the model to the data.
predict(...) :
predict :
Predicts the data for the provided dataset.
fit_predict(...) :
fit_predict :
Fits the model to the subset and predicts the interpolated dataset.

Notes
4 changes: 2 additions & 2 deletions bluemath_tk/wrappers/_base_wrappers.py
@@ -1,7 +1,7 @@
import os
import copy
import itertools
from typing import List
from typing import List, Union
import subprocess
import numpy as np
import xarray as xr
@@ -616,7 +616,7 @@ def join_postprocessed_files(

def postprocess_cases(
self, cases_to_postprocess: List[int] = None
) -> xr.Dataset or List[xr.Dataset]: # type: ignore
) -> Union[xr.Dataset, List[xr.Dataset]]:
"""
Postprocess the model output.

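
Why the `Union` change is more than cosmetic: `xr.Dataset or List[xr.Dataset]` is an ordinary runtime expression that short-circuits to its first truthy operand, so the old annotation silently collapsed to `xr.Dataset` and needed the `# type: ignore`. A small illustration (a sketch, not code from the PR):

    from typing import List, Union

    import xarray as xr

    # `or` collapses to the first truthy operand when the annotation is evaluated:
    print(xr.Dataset or List[xr.Dataset])  # <class 'xarray.core.dataset.Dataset'>

    # Union keeps both alternatives visible to type checkers:
    PostprocessReturn = Union[xr.Dataset, List[xr.Dataset]]
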
9 changes: 9 additions & 0 deletions mkdocs.yml
@@ -28,6 +28,13 @@ plugins:
markdown_extensions:
- admonition
- codehilite
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
- pymdownx.superfences

theme:
name: readthedocs
@@ -36,6 +43,8 @@ theme:
hljs_languages:
- yaml
- rust
features:
- content.code.copy

extra:
code-copy: true
41 changes: 1 addition & 40 deletions tests/datamining/test_pca.py
@@ -1,47 +1,8 @@
import unittest
import numpy as np
import xarray as xr
from bluemath_tk.core.data.sample_data import get_2d_dataset
from bluemath_tk.datamining.pca import PCA


def get_2d_dataset():
# Define the coordinates
coord1 = np.linspace(-100, 100, 20)
coord2 = np.linspace(-100, 100, 20)
coord3 = np.arange(1, 50)

# Create a meshgrid
coord1, coord2, coord3 = np.meshgrid(coord1, coord2, coord3, indexing="ij")

# Create a 3D dataset
X = (
np.sin(np.radians(coord1)) * np.cos(np.radians(coord2)) * np.sin(coord3)
+ np.sin(2 * np.radians(coord1))
* np.cos(2 * np.radians(coord2))
* np.sin(2 * coord3)
+ np.sin(3 * np.radians(coord1))
* np.cos(3 * np.radians(coord2))
* np.sin(3 * coord3)
)
# Create a 3D dataset
Y = -np.sin(X)

# Create an xarray dataset
ds = xr.Dataset(
{
"X": (["coord1", "coord2", "coord3"], X),
"Y": (["coord1", "coord2", "coord3"], Y),
},
coords={
"coord1": coord1[:, 0, 0],
"coord2": coord2[0, :, 0],
"coord3": coord3[0, 0, :],
},
)

return ds


class TestPCA(unittest.TestCase):
def setUp(self):
self.ds = get_2d_dataset()