refactor: Use dynamic reverse lookup in english_to_lambda (#9)

gemini-25-pro-collab · web-flow · commit 949a83f53179 · 2026-03-28T02:27:06.000+08:00
* docs: Clean up and format v0.2 grammar

* refactor: Remove duplicate atoms from extended vocabulary

* refactor: Dynamically generate reverse lookup in english_to_lambda
diff --git a/src/lambda_lang.py b/src/lambda_lang.py
@@ -434,29 +434,47 @@ def english_to_lambda(text: str) -> str:
     # Build comprehensive reverse lookup (English -> Lambda)
     rev = {}
     
-    # Add all categories
-    for cat in ["entities", "verbs", "modifiers", "time", "quantifiers"]:
-        for k, v in ATOMS.get(cat, {}).items():
-            for word in v["en"].lower().replace("/", " ").split():
-                word = word.strip("()")
-                if word and word not in rev:
-                    rev[word] = k
-    
-    # Add extended vocabulary
-    for k, v in ATOMS.get("extended", {}).items():
-        for word in v["en"].lower().replace("/", " ").split():
-            word = word.strip("()")
-            if word and word not in rev:
-                rev[word] = k
-    
-    # Add domain atoms with prefixes
+    def _iter_words(en_value: str):
+        """Yield normalized tokens from an English description."""
+        normalized = en_value.lower().replace("/", " ").replace("-", " ")
+        for part in normalized.split():
+            word = re.sub(r"[^a-z0-9']", "", part.strip("()")).strip("'")
+            if word:
+                yield word
+    
+    def _add_mapping(word: str, token: str):
+        if word and word not in rev:
+            rev[word] = token
+    
+    # Iterate all categories defined in ATOMS (except metadata and domains)
+    for category, entries in ATOMS.items():
+        if category in {"version", "changelog", "domains"}:
+            continue
+        if not isinstance(entries, dict):
+            continue
+        for atom, data in entries.items():
+            if atom == "_meta" or not isinstance(data, dict):
+                continue
+            en_value = data.get("en")
+            if not isinstance(en_value, str):
+                continue
+            for word in _iter_words(en_value):
+                _add_mapping(word, atom)
+    
+    # Include domain-specific atoms with domain prefixes
     for domain_code, domain_data in ATOMS.get("domains", {}).items():
-        domain_prefix = {"cd": "c", "vb": "v", "sc": "s", "emo": "e", "soc": "o"}.get(domain_code, domain_code)
-        for atom, atom_data in domain_data.get("atoms", {}).items():
-            for word in atom_data["en"].lower().replace("/", " ").split():
-                word = word.strip("()")
-                if word and word not in rev:
-                    rev[word] = f"{domain_prefix}:{atom}"
+        atoms = domain_data.get("atoms", {})
+        if not isinstance(atoms, dict):
+            continue
+        for atom, atom_data in atoms.items():
+            if atom == "_meta" or not isinstance(atom_data, dict):
+                continue
+            en_value = atom_data.get("en")
+            if not isinstance(en_value, str):
+                continue
+            token = f"{domain_code}:{atom}"
+            for word in _iter_words(en_value):
+                _add_mapping(word, token)
     
     # Add common word mappings (these override domain atoms when more specific)
     rev.update({