Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions bluemath_tk/core/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def wrapper(
data: pd.DataFrame,
directional_variables: List[str] = [],
custom_scale_factor: dict = {},
first_centroid_seed: int = None,
):
if data is None:
raise ValueError("Data cannot be None")
Expand All @@ -80,6 +81,15 @@ def wrapper(
raise TypeError("Directional variables must be a list")
if not isinstance(custom_scale_factor, dict):
raise TypeError("Custom scale factor must be a dict")
if first_centroid_seed is not None:
if (
not isinstance(first_centroid_seed, int)
or first_centroid_seed < 0
or first_centroid_seed > data.shape[0]
):
raise ValueError(
"First centroid seed must be an integer >= 0 and < num of data points"
)
return func(self, data, directional_variables, custom_scale_factor)

return wrapper
Expand Down
6 changes: 4 additions & 2 deletions bluemath_tk/datamining/kma.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class KMA(BaseClustering):
def __init__(
self,
num_clusters: int,
seed: int = 0,
seed: int = None,
init: str = "k-means++",
n_init: str = "auto",
algorithm: str = "lloyd",
Expand Down Expand Up @@ -129,7 +129,9 @@ def __init__(
self.num_clusters = int(num_clusters)
else:
raise ValueError("Variable num_clusters must be > 0")
if seed >= 0:
if seed is None:
self.seed = np.random.randint(0, 100)
elif seed >= 0:
self.seed = int(seed)
else:
raise ValueError("Variable seed must be >= 0")
Expand Down
41 changes: 27 additions & 14 deletions bluemath_tk/datamining/mda.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,13 @@ class MDA(BaseClustering):

Methods
-------
fit(data, directional_variables, custom_scale_factor)
fit(data, directional_variables, custom_scale_factor, first_centroid_seed)
Fit the MDA algorithm to the provided data.
predict(data)
Predict the nearest centroid for the provided data.
fit_predict(data, directional_variables, custom_scale_factor)
fit_predict(data, directional_variables, custom_scale_factor, first_centroid_seed)
Fits the MDA model to the data and predicts the nearest centroids.

Notes
-----
- This class is designed to perform the MDA algorithm.

Examples
--------
>>> import numpy as np
Expand Down Expand Up @@ -145,7 +141,7 @@ def _normalized_distance(

Returns
-------
dist : np.ndarray
np.ndarray
An array of squared Euclidean distances between the two arrays for each row.

Raises
Expand All @@ -157,7 +153,7 @@ def _normalized_distance(
-----
- IMPORTANT: Data is assumed to be normalized before calling this function.
- The function assumes that the data_variables, directional_variables, and scale_factor
attributes have been set.
attributes have been set.
- The function calculates the squared sum of differences for each row.
- DEPRECATED: directional_distance calculation.
distance = np.absolute(array_to_compare[:, ix] - all_rest_data[:, ix])
Expand Down Expand Up @@ -196,9 +192,9 @@ def _nearest_indices_to_centroids(

Returns
-------
nearest_indices_array : np.ndarray
np.ndarray
An array containing the index of the nearest data point to centroids.
normalized_data.iloc[nearest_indices_array] : pd.DataFrame
pd.DataFrame
A DataFrame containing the nearest data points to centroids.

Raises
Expand Down Expand Up @@ -240,9 +236,9 @@ def _nearest_indices(

Returns
-------
nearest_indices_array : np.ndarray
np.ndarray
An array containing the index of the nearest centroid to the data.
self.centroids.iloc[nearest_indices_array] : pd.DataFrame
pd.DataFrame
A DataFrame containing the nearest centroids to the data.

Raises
Expand Down Expand Up @@ -277,6 +273,7 @@ def fit(
data: pd.DataFrame,
directional_variables: List[str] = [],
custom_scale_factor: dict = {},
first_centroid_seed: int = None,
) -> None:
"""
Fit the Maximum Dissimilarity Algorithm (MDA) to the provided data.
Expand All @@ -296,11 +293,15 @@ def fit(
custom_scale_factor : dict, optional
A dictionary specifying custom scale factors for normalization.
Default is {}.
first_centroid_seed : int, optional
Positional (0-based) row index of the data point to use as the first centroid in the MDA algorithm.
Default is None.

Notes
-----
- The function assumes that the data is validated by the `validate_data_mda`
decorator before execution.
decorator before execution.
- When first_centroid_seed is not provided, the data point with the maximum summed normalized value is used as the first centroid.
"""

self._data = data.copy()
Expand Down Expand Up @@ -328,7 +329,14 @@ def fit(
# [DEPRECATED] Select the point with the maximum value in the first column of pandas dataframe
# seed = self.normalized_data[self.normalized_data.columns[0]].idxmax()
# Select the point with the maximum summed value
seed = np.argmax(self.normalized_data.sum(axis=1).values)
if first_centroid_seed is not None:
seed = first_centroid_seed
self.logger.info(f"Using specified seed={seed} as first centroid.")
else:
seed = np.argmax(self.normalized_data.sum(axis=1).values)
self.logger.info(
f"Using max calculated value seed={seed} as first centroid."
)

# Initialize centroids subset
subset = np.array(
Expand Down Expand Up @@ -426,6 +434,7 @@ def fit_predict(
data: pd.DataFrame,
directional_variables: List[str] = [],
custom_scale_factor: dict = {},
first_centroid_seed: int = None,
) -> Tuple[np.ndarray, pd.DataFrame]:
"""
Fits the MDA model to the data and predicts the nearest centroids.
Expand All @@ -440,6 +449,9 @@ def fit_predict(
custom_scale_factor : dict, optional
A dictionary specifying custom scale factors for normalization.
Default is {}.
first_centroid_seed : int, optional
Positional (0-based) row index of the data point to use as the first centroid in the MDA algorithm.
Default is None.

Returns
-------
Expand All @@ -451,6 +463,7 @@ def fit_predict(
data=data,
directional_variables=directional_variables,
custom_scale_factor=custom_scale_factor,
first_centroid_seed=first_centroid_seed,
)

return self.predict(data=data)
1 change: 1 addition & 0 deletions tests/datamining/test_mda.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def test_fit(self):
data=self.df,
directional_variables=["Dir"],
custom_scale_factor={"Dir": [0, 360]},
first_centroid_seed=10,
)
self.assertIsInstance(self.mda.centroids, pd.DataFrame)
self.assertEqual(self.mda.centroids.shape[0], 10)
Expand Down
Loading