Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lambench/metrics/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
exp_average,
aggregated_nve_md_results,
aggregated_inference_efficiency_results,
aggregated_diatomics_results,
get_leaderboard_models,
)

Expand Down Expand Up @@ -173,6 +174,10 @@ def process_applicability_task_for_one_model(model: BaseLargeAtomModel):
applicability_results[record.task_name] = (
aggregated_inference_efficiency_results(record.metrics)
)
elif record.task_name == "homonuclear_diatomics":
applicability_results[record.task_name] = aggregated_diatomics_results(
record.metrics
)
return applicability_results


Expand Down
55 changes: 55 additions & 0 deletions lambench/metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,61 @@ def aggregated_inference_efficiency_results(
}


def aggregated_diatomics_results(results: dict[str, dict]) -> dict[str, float]:
    """
    Aggregate per-molecule diatomics results into summary applicability metrics.

    Leaderboard metric:
        combined_roughness: avg_roughness × (1 + avg(min_pos_err / r_range))
            Multiplicative penalty couples smoothness with position accuracy.
            min_pos_err / r_range ∈ [0, 1], no free parameters.

    Stored diagnostic metrics (not used for ranking):
        avg_roughness: arithmetic mean of per-molecule RMSE(d²residual/dr²) (eV/Ų).
        avg_min_position_error: arithmetic mean of per-molecule |r_model_min - r_dft_min| (Å).
            Molecules without exactly one minimum contribute r_range as penalty;
            that penalty is not applied here, it is expected to already be
            reflected in the incoming ``min_position_error`` values.
        avg_rmse: arithmetic mean of per-molecule energy RMSE (eV).

    Args:
        results: Mapping from molecule identifier to that molecule's metrics
            dict. The keys read from each metrics dict are ``roughness``,
            ``r_range``, ``min_position_error`` and ``rmse``; any of them may be
            missing or ``None``. A molecule entry may itself be ``None`` and is
            then skipped. ``results`` may also be ``None`` or empty (e.g. a
            record whose metrics were never populated), which is treated as
            "no molecules".

    Returns:
        A dict with the keys ``combined_roughness``, ``avg_roughness``,
        ``avg_min_position_error`` and ``avg_rmse``. All four values are
        ``None`` when no molecule reports a roughness. Otherwise
        ``avg_min_position_error`` / ``avg_rmse`` are ``None`` only when no
        molecule contributed a value for them.
    """

    def _mean_or_none(values):
        # np.mean of an empty sequence yields NaN plus a RuntimeWarning;
        # report "no data" explicitly as None instead.
        return float(np.mean(values)) if values else None

    roughness_values = []
    normalized_pos_err_values = []
    position_error_values = []
    rmse_values = []

    # `results or {}` makes a missing metrics payload behave like an empty one
    # instead of raising AttributeError on `.values()`.
    for mol_results in (results or {}).values():
        if mol_results is None:
            continue

        roughness = mol_results.get("roughness")
        if roughness is not None:
            roughness_values.append(roughness)

        r_range = mol_results.get("r_range")
        min_pos_err = mol_results.get("min_position_error")
        # A position error is only usable when it can be normalised by a
        # strictly positive scan range (avoids division by zero).
        if min_pos_err is not None and r_range is not None and r_range > 0:
            normalized_pos_err_values.append(min_pos_err / r_range)
            position_error_values.append(min_pos_err)

        rmse = mol_results.get("rmse")
        if rmse is not None:
            rmse_values.append(rmse)

    # Without any roughness value the leaderboard metric is undefined.
    if not roughness_values:
        return {
            "combined_roughness": None,
            "avg_roughness": None,
            "avg_min_position_error": None,
            "avg_rmse": None,
        }

    avg_roughness = float(np.mean(roughness_values))
    # With no usable position errors the penalty factor degenerates to 1.
    avg_norm_pos_err = _mean_or_none(normalized_pos_err_values) or 0.0
    return {
        "combined_roughness": float(avg_roughness * (1 + avg_norm_pos_err)),
        "avg_roughness": avg_roughness,
        "avg_min_position_error": _mean_or_none(position_error_values),
        "avg_rmse": _mean_or_none(rmse_values),
    }


####################################
# Visualization utility functions #
####################################
Expand Down
22 changes: 21 additions & 1 deletion lambench/metrics/vishelper/metrics_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,21 @@ def _calculate_instability_error(self, cell: dict, lambda_0: float = 5e-4) -> fl
else:
return np.clip(np.log10(slope / lambda_0), a_min=0, a_max=None)

def calculate_diatomics_roughness_results(self) -> dict[str, float]:
    """
    Build the per-model leaderboard score for the homonuclear diatomics task.

    The score of a model is the ``combined_roughness`` entry of the metrics
    returned by ``self.fetcher.fetch_diatomics_results()``
    (avg_roughness × (1 + avg(min_pos_err/r_range)), lower is better).
    The other entries of those metrics are diagnostics and are not part of
    the returned mapping.

    Returns:
        Mapping from model name to its ``combined_roughness``. Models whose
        metrics are ``None`` or whose ``combined_roughness`` is ``None`` or
        absent are left out.
    """
    scores = {}
    for model_name, model_metrics in self.fetcher.fetch_diatomics_results().items():
        # No metrics at all for this model: nothing to rank.
        if model_metrics is None:
            continue
        combined = model_metrics.get("combined_roughness")
        # Metrics exist but the leaderboard value could not be computed.
        if combined is None:
            continue
        scores[model_name] = combined
    return scores

def calculate_efficiency_results(self) -> dict[str, float]:
efficiency_results = self.fetcher.fetch_inference_efficiency_results()
# filter out models with missing efficiency results
Expand All @@ -223,6 +238,7 @@ def summarize_final_rankings(self):
)
stability_results = self.calculate_stability_results()
efficiency_results = self.calculate_efficiency_results()
roughness_results = self.calculate_diatomics_roughness_results()
if not generalizability_ood or not generalizability_downstream:
logging.warning(
"Missing data for generalizability metrics (ood or downstream)"
Expand Down Expand Up @@ -255,6 +271,9 @@ def summarize_final_rankings(self):
"Applicability-Efficiency ↑": [
efficiency_results[model] for model in shared_models
],
"Applicability-Roughness ↓": [
roughness_results.get(model) for model in shared_models
],
}

# Create DataFrame with models as index
Expand All @@ -273,8 +292,9 @@ def summarize_final_rankings(self):
"Generalizability-PC Error ↓",
"Applicability-Instability ↓",
"Applicability-Efficiency ↑",
"Applicability-Roughness ↓",
],
ascending=[True, True, True, False],
ascending=[True, True, True, False, True],
)
print(
"Final Rankings:\n",
Expand Down
19 changes: 19 additions & 0 deletions lambench/metrics/vishelper/results_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
get_domain_to_direct_task_mapping,
get_leaderboard_models,
aggregated_inference_efficiency_results,
aggregated_diatomics_results,
)
from lambench.models.basemodel import BaseLargeAtomModel
import pandas as pd
Expand Down Expand Up @@ -127,6 +128,24 @@ def fetch_inference_efficiency_results(self) -> dict[str, dict[str, float]]:
)
return results

def fetch_diatomics_results(self) -> dict[str, dict]:
    """
    Collect aggregated homonuclear-diatomics metrics for every leaderboard model.

    For each entry of ``self.leaderboard_models`` the ``homonuclear_diatomics``
    record is queried through ``CalculatorRecord.query`` and its ``metrics``
    are passed to ``aggregated_diatomics_results``.

    Returns:
        Mapping from the model's ``model_metadata.pretty_name`` to its
        aggregated metrics. A model is omitted (and a warning is logged)
        unless the query returns exactly one record.
    """
    aggregated = {}
    for lb_model in self.leaderboard_models:
        records = CalculatorRecord.query(
            model_name=lb_model.model_name, task_name="homonuclear_diatomics"
        )
        n_records = len(records)
        if n_records == 1:
            aggregated[lb_model.model_metadata.pretty_name] = (
                aggregated_diatomics_results(records[0].metrics)
            )
        else:
            # Zero or duplicate records are ambiguous; skip rather than guess.
            logging.warning(
                f"Expected one record for {lb_model.model_name} and homonuclear_diatomics, "
                f"but got {n_records}"
            )
    return aggregated

def fetch_downstream_results(self) -> pd.DataFrame:
"""Returns downstream task results as a DataFrame with models as rows and task metrics as columns."""

Expand Down
4 changes: 4 additions & 0 deletions lambench/models/ase_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ def evaluate(
from lambench.tasks.calculator.surface.surface import run_inference

assert task.test_data is not None
return {"metrics": run_inference(self, task.test_data)}
elif task.task_name == "homonuclear_diatomics":
from lambench.tasks.calculator.diatomics.diatomics import run_inference

return {"metrics": run_inference(self, task.test_data)}
else:
raise NotImplementedError(f"Task {task.task_name} is not implemented.")
Expand Down
3 changes: 3 additions & 0 deletions lambench/tasks/calculator/calculator_tasks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,6 @@ interface:
surface:
test_data: /bohr/lambench-surfaces-43ll/v1/surface
calculator_params: null
homonuclear_diatomics:
test_data: null
calculator_params: null
Loading
Loading