|
| 1 | +from typing import Literal |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +import numpy.typing as npt |
| 5 | + |
| 6 | +from python_mg._lib_name import Lexicon |
| 7 | + |
| 8 | + |
def grammar_f1(
    preds: npt.NDArray[np.float64],
    correct: npt.NDArray[np.bool_],
) -> dict[str, npt.NDArray[np.float64]]:
    """Compute grammaticality precision, recall and F1 from log-probabilities.

    Args:
        preds: Log-probabilities over the vocabulary; the last axis is the
            vocabulary axis. Leading axes (batch, position, ...) are arbitrary.
        correct: Boolean mask of the same shape as ``preds``; ``True`` marks
            vocabulary items that are grammatical continuations.

    Returns:
        Dict with keys ``"f1"``, ``"precision"`` and ``"recall"``, each an
        array of ``preds.shape`` minus the last axis. F1 is defined as 0
        where precision and recall are both 0 (rather than NaN).

    Raises:
        ValueError: If ``preds`` and ``correct`` have different shapes.
    """
    if preds.shape != correct.shape:
        raise ValueError("correct and preds must have matching shapes")

    # Log-scores of grammatical items only; ungrammatical slots get -inf so
    # they contribute zero mass under exp. Computed once and reused below.
    masked_good = np.where(correct, preds, -np.inf)

    # Precision: total probability mass assigned to grammatical continuations.
    precision: npt.NDArray[np.float64] = np.exp(
        np.logaddexp.reduce(masked_good, axis=-1)
    )

    # Log of the combined mass on ungrammatical continuations; keepdims so it
    # broadcasts against the per-item scores on the vocabulary axis.
    total_bad: npt.NDArray[np.float64] = np.logaddexp.reduce(
        np.where(~correct, preds, -np.inf), axis=-1, keepdims=True
    )
    # A grammatical item is "recalled" when its individual score beats the
    # combined mass of all ungrammatical items.
    better_than_bad = np.where(masked_good > total_bad, 1.0, 0.0)

    recall = np.where(correct, better_than_bad, 0.0).sum(axis=-1) / correct.sum(
        axis=-1
    )

    # Harmonic mean, with the conventional F1 = 0 when precision + recall == 0.
    # The plain expression produced NaN (0/0) there.
    denom = precision + recall
    f1 = np.divide(
        2 * precision * recall,
        denom,
        out=np.zeros_like(denom),
        where=denom > 0,
    )

    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
| 36 | + |
| 37 | + |
def grammar_f1_from_strings(
    lexicon: Lexicon,
    tokens: npt.NDArray[np.int_],
    preds: npt.NDArray[np.float64],
    category: str,
    min_log_prob: float | None = -128.0,
    move_prob: float = 0.5,
    max_steps: int | None = 64,
    n_beams: int | None = 256,
    reduction: Literal["none", "sentence_mean"] = "sentence_mean",
) -> dict[str, npt.NDArray[np.float64]]:
    """Score token sequences for grammaticality F1 against a lexicon.

    The lexicon enumerates which vocabulary items are legal continuations
    at each position; ``grammar_f1`` then scores ``preds`` against that
    boolean mask.

    Args:
        lexicon: Grammar used to enumerate legal continuations.
        tokens: Integer token ids; the sequence runs along the last axis.
        preds: Per-position log-probabilities to be scored.
        category: Syntactic category the sequences must parse as.
        min_log_prob: Pruning threshold forwarded to the lexicon search.
        move_prob: Move probability forwarded to the lexicon search.
        max_steps: Step limit forwarded to the lexicon search.
        n_beams: Beam count forwarded to the lexicon search.
        reduction: ``"sentence_mean"`` averages each metric over the
            non-special positions of every sentence; ``"none"`` returns the
            per-position values unchanged.

    Returns:
        Dict with keys ``"f1"``, ``"precision"`` and ``"recall"``.

    Raises:
        ValueError: If ``reduction`` is not one of the accepted values.
    """
    # Drop the final position: there is no next-token prediction for it.
    legal_mask = lexicon.token_continuations(
        tokens,
        category,
        min_log_prob=min_log_prob,
        move_prob=move_prob,
        max_steps=max_steps,
        n_beams=n_beams,
    )[..., :-1, :]

    metrics = grammar_f1(preds, legal_mask)

    if reduction == "sentence_mean":
        # Exclude positions whose input token id is 1 or 2 from the mean
        # (presumably special tokens such as padding/EOS -- confirm against
        # the tokenizer's vocabulary).
        prefix = tokens[..., :-1]
        keep = (prefix != 2) & (prefix != 1)
        n_kept = keep.sum(axis=-1)
        metrics = {
            name: np.where(keep, values, 0.0).sum(axis=-1) / n_kept
            for name, values in metrics.items()
        }
    elif reduction != "none":
        raise ValueError(f'"{reduction}" is not a valid reduction')

    return metrics
0 commit comments