5 changes: 5 additions & 0 deletions bluemath_tk/core/decorators.py
@@ -130,6 +130,7 @@ def wrapper(
min_number_of_points: int = None,
max_number_of_iterations: int = 10,
normalize_data: bool = False,
regression_guided: Dict[str, Dict[str, Any]] = {},
):
if data is None:
raise ValueError("Data cannot be None")
@@ -149,6 +150,8 @@
raise ValueError("Maximum number of iterations must be integer and > 0")
if not isinstance(normalize_data, bool):
raise TypeError("Normalize data must be a boolean")
if not isinstance(regression_guided, dict):
raise TypeError("regression_guided must be a dictionary")
return func(
self,
data,
@@ -157,6 +160,7 @@
min_number_of_points,
max_number_of_iterations,
normalize_data,
regression_guided,
)

return wrapper
@@ -384,6 +388,7 @@ def wrapper(
self,
data: xr.Dataset,
fit_params: Dict[str, Dict[str, Any]] = {},
regression_guided: Dict[str, Dict[str, Any]] = {},
variable_to_sort_bmus: str = None,
):
if not isinstance(data, xr.Dataset):
59 changes: 55 additions & 4 deletions bluemath_tk/datamining/kma.py
@@ -1,8 +1,9 @@
from typing import List, Tuple
from typing import List, Tuple, Any, Dict

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

from ..core.decorators import validate_data_kma
from ._base_datamining import BaseClustering
@@ -132,6 +133,7 @@ def __init__(
self.normalized_centroids: pd.DataFrame = pd.DataFrame()
self.centroid_real_indices: np.array = np.array([])
self.is_fitted: bool = False
self.regression_guided: dict = {}

@property
def kma(self) -> KMeans:
@@ -176,6 +178,42 @@ def data_to_fit(self) -> pd.DataFrame:

return self._data_to_fit


def add_regression_guided(self, data: pd.DataFrame, vars: List[str], alpha: List[float]) -> pd.DataFrame:
"""
Blend regression-guided variables into the KMA clustering features.

The guiding variables listed in vars are predicted from the remaining
features with a linear regression; the predictions replace the raw guiding
columns weighted by alpha, while the other features are down-weighted by
1 - sum(alpha).
"""

# Stack guiding variables into (time, n_vars) array
X = data.drop(columns=vars)
Y = np.stack([data[var].values for var in vars], axis=1)

# Normalize input features
X_std = X.std().replace(0, 1)
X_norm = X / X_std

# Add intercept column to input
X_design = np.column_stack((np.ones(len(X)), X_norm.values))

# Normalize guiding targets
Y_std = np.nanstd(Y, axis=0)
Y_std[Y_std == 0] = 1.0

# Fit regression model to predict guiding vars from input
model = LinearRegression(fit_intercept=False).fit(X_design, Y / Y_std)
Y_pred = model.predict(X_design) * Y_std # De-normalize predictions

# Weight columns by input alpha
X_weight = 1.0 - np.sum(alpha)
X_scaled = X_weight * X.values
Y_scaled = Y_pred * alpha

df = pd.DataFrame(np.hstack([X_scaled, Y_scaled]), index=data.index)
df.columns = list(X.columns) + vars

return df
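# --- Usage sketch (not part of the diff) ---------------------------------
# A minimal, hedged example of calling the new helper on its own. The frame,
# column names, and weights are illustrative, and the KMA constructor
# argument (num_clusters) is assumed from how the class is used elsewhere.
import numpy as np
import pandas as pd

from bluemath_tk.datamining.kma import KMA

rng = np.random.default_rng(0)
example = pd.DataFrame(
    {
        "pc1": rng.normal(size=100),
        "pc2": rng.normal(size=100),
        "hs": rng.uniform(0.5, 3.0, size=100),  # guiding variable
    }
)

kma = KMA(num_clusters=4)
weighted = kma.add_regression_guided(data=example, vars=["hs"], alpha=[0.3])
# "pc1" and "pc2" are scaled by 1 - 0.3 = 0.7, while "hs" now holds its
# regression prediction scaled by 0.3; the result feeds the K-Means fit.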

@validate_data_kma
def fit(
self,
@@ -185,6 +223,7 @@ def fit(
min_number_of_points: int = None,
max_number_of_iterations: int = 10,
normalize_data: bool = False,
regression_guided: Dict[str, Dict[str, Any]] = {},
) -> None:
"""
Fit the K-Means algorithm to the provided data.
@@ -215,15 +254,24 @@ def fit(
Default is 10.
normalize_data : bool, optional
A flag to normalize the data. Default is False.
regression_guided : dict, optional
A dictionary with the guiding variable names under "vars" and their
relative weights under "alpha", used for regression-guided clustering.
Default is {}.
"""


if regression_guided:
data = self.add_regression_guided(
data=data,
vars=regression_guided.get("vars", None),
alpha=regression_guided.get("alpha", None),
)

super().fit(
data=data,
directional_variables=directional_variables,
custom_scale_factor=custom_scale_factor,
normalize_data=normalize_data,
)

# Fit K-Means algorithm
if min_number_of_points is not None:
stable_kma_child = False
@@ -255,6 +303,7 @@
self.centroids = self.denormalize(
normalized_data=self.normalized_centroids, scale_factor=self.scale_factor
)

for directional_variable in self.directional_variables:
self.centroids[directional_variable] = self.get_degrees_from_uv(
xu=self.centroids[f"{directional_variable}_u"].values,
@@ -299,6 +348,7 @@ def fit_predict(
min_number_of_points: int = None,
max_number_of_iterations: int = 10,
normalize_data: bool = False,
regression_guided: Dict[str, Dict[str, Any]] = {},
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Fit the K-Means algorithm to the provided data and predict the nearest centroid
@@ -330,14 +380,15 @@
A tuple containing the nearest centroid index for each data point,
and the nearest centroids.
"""

self.fit(
data=data,
directional_variables=directional_variables,
custom_scale_factor=custom_scale_factor,
min_number_of_points=min_number_of_points,
max_number_of_iterations=max_number_of_iterations,
normalize_data=normalize_data,
regression_guided=regression_guided,
)

return self.predict(data=data)
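For a fuller picture, here is a hedged end-to-end sketch of the new regression_guided option on KMA.fit_predict. The data, column names, and weights are illustrative, and the KMA constructor argument is assumed rather than taken from this diff:

import numpy as np
import pandas as pd

from bluemath_tk.datamining.kma import KMA

rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "u": rng.normal(size=200),
        "v": rng.normal(size=200),
        "hs": rng.uniform(0.5, 4.0, size=200),  # guiding variable
    }
)

kma = KMA(num_clusters=6)  # constructor argument assumed
bmus, nearest_centroids = kma.fit_predict(
    data=df,
    directional_variables=[],
    custom_scale_factor={},
    regression_guided={"vars": ["hs"], "alpha": [0.4]},
)
# The clustering features are down-weighted by 1 - 0.4 = 0.6, and the guided
# "hs" column enters through its regression prediction weighted by 0.4.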
20 changes: 17 additions & 3 deletions bluemath_tk/predictor/xwt.py
@@ -1,3 +1,5 @@
from typing import List

import logging
import warnings
from datetime import datetime, timedelta
@@ -301,7 +303,7 @@ def get_conditioned_probabilities(self) -> pd.DataFrame:
)

return df_cond_probs

@validate_data_xwt
def fit(
self,
@@ -345,9 +347,21 @@ def fit(
kma: KMA = self.steps.get("kma")
self.num_clusters = kma.num_clusters
# TODO: standardize PCs by first PC variance
# pca.pcs_df / pca.pcs.stds.isel(n_component=0).values

data_to_kma = pca.pcs_df

if "regression_guided" in fit_params.get("kma", {}):
guiding_vars = fit_params["kma"]["regression_guided"].get("vars", [])

if guiding_vars:
guiding_data = pd.DataFrame(
{var: data[var].values for var in guiding_vars},
index=data.time.values
)
data_to_kma = pd.concat([data_to_kma, guiding_data], axis=1)

kma_bmus, _kma_bmus_df = kma.fit_predict(
data=pca.pcs_df,
data=data_to_kma,
**fit_params.get("kma", {}),
)
self.kma_bmus = kma_bmus + 1 # TODO: Check if this is necessary!!!
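Finally, a sketch of how the guided clustering would be requested through XWT.fit via fit_params. The variable name "Hs" and the weight 0.5 are hypothetical, and the surrounding XWT construction is omitted:

# The "kma" entry of fit_params is forwarded to KMA.fit_predict, so the new
# regression_guided dictionary can be nested there; its "vars" must exist as
# variables of the xr.Dataset passed to fit.
fit_params = {
    "kma": {
        "regression_guided": {
            "vars": ["Hs"],   # read from the input dataset and appended to the PCs
            "alpha": [0.5],   # relative weight given to the guided columns
        },
    },
}
# xwt.fit(data=dataset, fit_params=fit_params)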