deepmodeling · anyangml · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/lambench/metrics/post_process.py b/lambench/metrics/post_process.py
@@ -16,6 +16,7 @@
     exp_average,
     aggregated_nve_md_results,
     aggregated_inference_efficiency_results,
+    aggregated_diatomics_results,
     get_leaderboard_models,
 )
 
@@ -173,6 +174,10 @@ def process_applicability_task_for_one_model(model: BaseLargeAtomModel):
             applicability_results[record.task_name] = (
                 aggregated_inference_efficiency_results(record.metrics)
             )
+        elif record.task_name == "homonuclear_diatomics":
+            applicability_results[record.task_name] = aggregated_diatomics_results(
+                record.metrics
+            )
     return applicability_results
 
 

diff --git a/lambench/metrics/utils.py b/lambench/metrics/utils.py
@@ -151,6 +151,61 @@ def aggregated_inference_efficiency_results(
     }
 
 
+def aggregated_diatomics_results(results: dict[str, dict]) -> dict[str, float]:
+    """
+    Aggregate per-molecule diatomics results into summary applicability metrics.
+
+    `results` maps molecule name -> metrics dict. None (a record stored without metrics)
+    or an empty mapping means "no data"; None entries and None fields are skipped.
+
+    Leaderboard metric (lower is better; None when no molecule reports a roughness):
+        combined_roughness: avg_roughness × (1 + avg(min_pos_err / r_range))
+                            Multiplicative penalty couples smoothness with position accuracy.
+                            Ratio expected in [0, 1] (not clamped here); penalty is 0 without it.
+
+    Stored diagnostic metrics (not used for ranking; None when their inputs are missing):
+        avg_roughness:          arithmetic mean of per-molecule RMSE(d²residual/dr²) (eV/Å²).
+        avg_min_position_error: arithmetic mean of per-molecule |r_model_min - r_dft_min| (Å).
+                                Molecules without exactly one minimum are expected to carry r_range.
+        avg_rmse:               arithmetic mean of per-molecule energy RMSE (eV).
+    """
+    roughness_values = []
+    normalized_pos_err_values = []
+    position_error_values = []
+    rmse_values = []
+
+    for mol_results in (results or {}).values():  # None metrics -> treated as no data
+        if mol_results is None:
+            continue
+        if mol_results.get("roughness") is not None:
+            roughness_values.append(mol_results["roughness"])
+        r_range = mol_results.get("r_range")
+        min_pos_err = mol_results.get("min_position_error")
+        if min_pos_err is not None and r_range is not None and r_range > 0:
+            normalized_pos_err_values.append(min_pos_err / r_range)
+            position_error_values.append(min_pos_err)
+        if mol_results.get("rmse") is not None:
+            rmse_values.append(mol_results["rmse"])
+
+    if not roughness_values:
+        return dict.fromkeys(
+            ("combined_roughness", "avg_roughness", "avg_min_position_error", "avg_rmse")
+        )
+
+    avg_roughness = float(np.mean(roughness_values))
+    avg_norm_pos_err = (
+        float(np.mean(normalized_pos_err_values)) if normalized_pos_err_values else 0.0
+    )
+    return {
+        "combined_roughness": float(avg_roughness * (1 + avg_norm_pos_err)),
+        "avg_roughness": avg_roughness,
+        "avg_min_position_error": float(np.mean(position_error_values))
+        if position_error_values
+        else None,
+        "avg_rmse": float(np.mean(rmse_values)) if rmse_values else None,
+    }
+
+
 ####################################
 # Visualization utility functions #
 ####################################

diff --git a/lambench/metrics/vishelper/metrics_calculations.py b/lambench/metrics/vishelper/metrics_calculations.py
@@ -197,6 +197,21 @@ def _calculate_instability_error(self, cell: dict, lambda_0: float = 5e-4) -> fl
             else:
                 return np.clip(np.log10(slope / lambda_0), a_min=0, a_max=None)
 
+    def calculate_diatomics_roughness_results(self) -> dict[str, float]:
+        """
+        Map each model to its homonuclear-diatomics leaderboard score.
+
+        The score is the fetched ``combined_roughness`` (lower is better); models
+        whose fetched entry is None or has no ``combined_roughness`` are left out.
+        """
+        scores = {}
+        for name, summary in self.fetcher.fetch_diatomics_results().items():
+            score = None if summary is None else summary.get("combined_roughness")
+            if score is None:
+                continue
+            scores[name] = score
+        return scores
+
     def calculate_efficiency_results(self) -> dict[str, float]:
         efficiency_results = self.fetcher.fetch_inference_efficiency_results()
         # filter out models with missing efficiency results
@@ -223,6 +238,7 @@ def summarize_final_rankings(self):
         )
         stability_results = self.calculate_stability_results()
         efficiency_results = self.calculate_efficiency_results()
+        roughness_results = self.calculate_diatomics_roughness_results()
         if not generalizability_ood or not generalizability_downstream:
             logging.warning(
                 "Missing data for generalizability metrics (ood or downstream)"
@@ -255,6 +271,9 @@ def summarize_final_rankings(self):
             "Applicability-Efficiency ↑": [
                 efficiency_results[model] for model in shared_models
             ],
+            "Applicability-Roughness ↓": [
+                roughness_results.get(model) for model in shared_models
+            ],
         }
 
         # Create DataFrame with models as index
@@ -273,8 +292,9 @@ def summarize_final_rankings(self):
                 "Generalizability-PC Error ↓",
                 "Applicability-Instability ↓",
                 "Applicability-Efficiency ↑",
+                "Applicability-Roughness ↓",
             ],
-            ascending=[True, True, True, False],
+            ascending=[True, True, True, False, True],
         )
         print(
             "Final Rankings:\n",

diff --git a/lambench/metrics/vishelper/results_fetcher.py b/lambench/metrics/vishelper/results_fetcher.py
@@ -12,6 +12,7 @@
     get_domain_to_direct_task_mapping,
     get_leaderboard_models,
     aggregated_inference_efficiency_results,
+    aggregated_diatomics_results,
 )
 from lambench.models.basemodel import BaseLargeAtomModel
 import pandas as pd
@@ -127,6 +128,24 @@ def fetch_inference_efficiency_results(self) -> dict[str, dict[str, float]]:
             )
         return results
 
+    def fetch_diatomics_results(self) -> dict[str, dict]:
+        """Aggregated diatomics results per leaderboard model, keyed by pretty name."""
+        task = "homonuclear_diatomics"
+        summaries = {}
+        for lam in self.leaderboard_models:
+            records = CalculatorRecord.query(model_name=lam.model_name, task_name=task)
+            if len(records) == 1:
+                summaries[lam.model_metadata.pretty_name] = (
+                    aggregated_diatomics_results(records[0].metrics)
+                )
+                continue
+            # Zero or several records: report it and leave the model out.
+            logging.warning(
+                f"Expected one record for {lam.model_name} and homonuclear_diatomics, "
+                f"but got {len(records)}"
+            )
+        return summaries
+
     def fetch_downstream_results(self) -> pd.DataFrame:
         """Returns downstream task results as a DataFrame with models as rows and task metrics as columns."""
 

diff --git a/lambench/models/ase_models.py b/lambench/models/ase_models.py
@@ -332,6 +332,10 @@ def evaluate(
                 from lambench.tasks.calculator.surface.surface import run_inference
 
                 assert task.test_data is not None
+                return {"metrics": run_inference(self, task.test_data)}
+            elif task.task_name == "homonuclear_diatomics":
+                from lambench.tasks.calculator.diatomics.diatomics import run_inference
+
                 return {"metrics": run_inference(self, task.test_data)}
             else:
                 raise NotImplementedError(f"Task {task.task_name} is not implemented.")

diff --git a/lambench/tasks/calculator/calculator_tasks.yml b/lambench/tasks/calculator/calculator_tasks.yml
@@ -51,3 +51,6 @@ interface:
 surface:
   test_data: /bohr/lambench-surfaces-43ll/v1/surface
   calculator_params: null
+homonuclear_diatomics:
+  test_data: null
+  calculator_params: null