GeoOcean · tausiaj · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/bluemath_tk/core/decorators.py b/bluemath_tk/core/decorators.py
@@ -71,6 +71,7 @@ def wrapper(
         data: pd.DataFrame,
         directional_variables: List[str] = [],
         custom_scale_factor: dict = {},
+        first_centroid_seed: int = None,
     ):
         if data is None:
             raise ValueError("Data cannot be None")
@@ -80,6 +81,15 @@ def wrapper(
             raise TypeError("Directional variables must be a list")
         if not isinstance(custom_scale_factor, dict):
             raise TypeError("Custom scale factor must be a dict")
+        if first_centroid_seed is not None:
+            if (
+                not isinstance(first_centroid_seed, int)
+                or first_centroid_seed < 0
+                or first_centroid_seed > data.shape[0]
+            ):
+                raise ValueError(
+                    "First centroid seed must be an integer >= 0 and < num of data points"
+                )
         return func(self, data, directional_variables, custom_scale_factor)
 
     return wrapper

diff --git a/bluemath_tk/datamining/kma.py b/bluemath_tk/datamining/kma.py
@@ -92,7 +92,7 @@ class KMA(BaseClustering):
     def __init__(
         self,
         num_clusters: int,
-        seed: int = 0,
+        seed: int = None,
         init: str = "k-means++",
         n_init: str = "auto",
         algorithm: str = "lloyd",
@@ -129,7 +129,9 @@ def __init__(
             self.num_clusters = int(num_clusters)
         else:
             raise ValueError("Variable num_clusters must be > 0")
-        if seed >= 0:
+        if seed is None:
+            self.seed = np.random.randint(0, 100)
+        elif seed >= 0:
             self.seed = int(seed)
         else:
             raise ValueError("Variable seed must be >= 0")

diff --git a/bluemath_tk/datamining/mda.py b/bluemath_tk/datamining/mda.py
@@ -52,17 +52,13 @@ class MDA(BaseClustering):
 
     Methods
     -------
-    fit(data, directional_variables, custom_scale_factor)
+    fit(data, directional_variables, custom_scale_factor, first_centroid_seed)
         Fit the MDA algorithm to the provided data.
     predict(data)
         Predict the nearest centroid for the provided data.
-    fit_predict(data, directional_variables, custom_scale_factor)
+    fit_predict(data, directional_variables, custom_scale_factor, first_centroid_seed)
         Fits the MDA model to the data and predicts the nearest centroids.
 
-    Notes
-    -----
-    - This class is designed to perform the MDA algorithm.
-
     Examples
     --------
     >>> import numpy as np
@@ -145,7 +141,7 @@ def _normalized_distance(
 
         Returns
         -------
-        dist : np.ndarray
+        np.ndarray
             An array of squared Euclidean distances between the two arrays for each row.
 
         Raises
@@ -157,7 +153,7 @@ def _normalized_distance(
         -----
         - IMPORTANT: Data is assumed to be normalized before calling this function.
         - The function assumes that the data_variables, directional_variables, and scale_factor
-        attributes have been set.
+            attributes have been set.
         - The function calculates the squared sum of differences for each row.
         - DEPRECATED: directional_distance calculation.
             distance = np.absolute(array_to_compare[:, ix] - all_rest_data[:, ix])
@@ -196,9 +192,9 @@ def _nearest_indices_to_centroids(
 
         Returns
         -------
-        nearest_indices_array : np.ndarray
+        np.ndarray
             An array containing the index of the nearest data point to centroids.
-        normalized_data.iloc[nearest_indices_array] : pd.DataFrame
+        pd.DataFrame
             A DataFrame containing the nearest data points to centroids.
 
         Raises
@@ -240,9 +236,9 @@ def _nearest_indices(
 
         Returns
         -------
-        nearest_indices_array : np.ndarray
+        np.ndarray
             An array containing the index of the nearest centroid to the data.
-        self.centroids.iloc[nearest_indices_array] : pd.DataFrame
+        pd.DataFrame
             A DataFrame containing the nearest centroids to the data.
 
         Raises
@@ -277,6 +273,7 @@ def fit(
         data: pd.DataFrame,
         directional_variables: List[str] = [],
         custom_scale_factor: dict = {},
+        first_centroid_seed: int = None,
     ) -> None:
         """
         Fit the Maximum Dissimilarity Algorithm (MDA) to the provided data.
@@ -296,11 +293,15 @@ def fit(
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
             Default is {}.
+        first_centroid_seed : int, optional
+            Index of the data point (row of `data`) to use as the first centroid.
+            Default is None.
 
         Notes
         -----
         - The function assumes that the data is validated by the `validate_data_mda`
-        decorator before execution.
+            decorator before execution.
+        - When first_centroid_seed is None, the point with the maximum summed normalized value is used as the first centroid.
         """
 
         self._data = data.copy()
@@ -328,7 +329,14 @@ def fit(
         # [DEPRECATED] Select the point with the maximum value in the first column of pandas dataframe
         # seed = self.normalized_data[self.normalized_data.columns[0]].idxmax()
         # Select the point with the maximum summed value
-        seed = np.argmax(self.normalized_data.sum(axis=1).values)
+        if first_centroid_seed is not None:
+            seed = first_centroid_seed
+            self.logger.info(f"Using specified seed={seed} as first centroid.")
+        else:
+            seed = np.argmax(self.normalized_data.sum(axis=1).values)
+            self.logger.info(
+                f"Using max calculated value seed={seed} as first centroid."
+            )
 
         # Initialize centroids subset
         subset = np.array(
@@ -426,6 +434,7 @@ def fit_predict(
         data: pd.DataFrame,
         directional_variables: List[str] = [],
         custom_scale_factor: dict = {},
+        first_centroid_seed: int = None,
     ) -> Tuple[np.ndarray, pd.DataFrame]:
         """
         Fits the MDA model to the data and predicts the nearest centroids.
@@ -440,6 +449,9 @@ def fit_predict(
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
             Default is {}.
+        first_centroid_seed : int, optional
+            Index of the data point (row of `data`) to use as the first centroid.
+            Default is None.
 
         Returns
         -------
@@ -451,6 +463,7 @@ def fit_predict(
             data=data,
             directional_variables=directional_variables,
             custom_scale_factor=custom_scale_factor,
+            first_centroid_seed=first_centroid_seed,
         )
 
         return self.predict(data=data)
diff --git a/tests/datamining/test_mda.py b/tests/datamining/test_mda.py
@@ -20,6 +20,7 @@ def test_fit(self):
             data=self.df,
             directional_variables=["Dir"],
             custom_scale_factor={"Dir": [0, 360]},
+            first_centroid_seed=10,
         )
         self.assertIsInstance(self.mda.centroids, pd.DataFrame)
         self.assertEqual(self.mda.centroids.shape[0], 10)