Skip to content

Commit 949a83f

Browse files
refactor: Use dynamic reverse lookup in english_to_lambda (#9)
* docs: Clean up and format v0.2 grammar
* refactor: Remove duplicate atoms from extended vocabulary
* refactor: Dynamically generate reverse lookup in english_to_lambda
1 parent 170ebc6 commit 949a83f

1 file changed

Lines changed: 40 additions & 22 deletions

File tree

src/lambda_lang.py

Lines changed: 40 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -434,29 +434,47 @@ def english_to_lambda(text: str) -> str:
434434
# Build comprehensive reverse lookup (English -> Lambda)
435435
rev = {}
436436

437-
# Add all categories
438-
for cat in ["entities", "verbs", "modifiers", "time", "quantifiers"]:
439-
for k, v in ATOMS.get(cat, {}).items():
440-
for word in v["en"].lower().replace("/", " ").split():
441-
word = word.strip("()")
442-
if word and word not in rev:
443-
rev[word] = k
444-
445-
# Add extended vocabulary
446-
for k, v in ATOMS.get("extended", {}).items():
447-
for word in v["en"].lower().replace("/", " ").split():
448-
word = word.strip("()")
449-
if word and word not in rev:
450-
rev[word] = k
451-
452-
# Add domain atoms with prefixes
437+
def _iter_words(en_value: str):
438+
"""Yield normalized tokens from an English description."""
439+
normalized = en_value.lower().replace("/", " ").replace("-", " ")
440+
for part in normalized.split():
441+
word = re.sub(r"[^a-z0-9']", "", part.strip("()")).strip("'")
442+
if word:
443+
yield word
444+
445+
def _add_mapping(word: str, token: str):
446+
if word and word not in rev:
447+
rev[word] = token
448+
449+
# Iterate all categories defined in ATOMS (except metadata and domains)
450+
for category, entries in ATOMS.items():
451+
if category in {"version", "changelog", "domains"}:
452+
continue
453+
if not isinstance(entries, dict):
454+
continue
455+
for atom, data in entries.items():
456+
if atom == "_meta" or not isinstance(data, dict):
457+
continue
458+
en_value = data.get("en")
459+
if not isinstance(en_value, str):
460+
continue
461+
for word in _iter_words(en_value):
462+
_add_mapping(word, atom)
463+
464+
# Include domain-specific atoms with domain prefixes
453465
for domain_code, domain_data in ATOMS.get("domains", {}).items():
454-
domain_prefix = {"cd": "c", "vb": "v", "sc": "s", "emo": "e", "soc": "o"}.get(domain_code, domain_code)
455-
for atom, atom_data in domain_data.get("atoms", {}).items():
456-
for word in atom_data["en"].lower().replace("/", " ").split():
457-
word = word.strip("()")
458-
if word and word not in rev:
459-
rev[word] = f"{domain_prefix}:{atom}"
466+
atoms = domain_data.get("atoms", {})
467+
if not isinstance(atoms, dict):
468+
continue
469+
for atom, atom_data in atoms.items():
470+
if atom == "_meta" or not isinstance(atom_data, dict):
471+
continue
472+
en_value = atom_data.get("en")
473+
if not isinstance(en_value, str):
474+
continue
475+
token = f"{domain_code}:{atom}"
476+
for word in _iter_words(en_value):
477+
_add_mapping(word, token)
460478

461479
# Add common word mappings (these override domain atoms when more specific)
462480
rev.update({

0 commit comments

Comments
 (0)