Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion dalla_data_processing/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,15 @@ def pack(
tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
except ImportError:
logger.error("Missing rbpe package")
logger.error("Install with: pip install rbpe")
logger.error(
"rbpe is not included in the default installation due to "
"dependency conflicts with camel-tools (transformers version requirements)"
)
logger.error("Install separately with: pip install rbpe")
logger.error(
"Note: Installing rbpe may require a separate environment "
"if you also use dedup/stem/quality features"
)
sys.exit(1)
else:
try:
Expand Down
20 changes: 20 additions & 0 deletions dalla_data_processing/quality/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from types import MethodType
from typing import Any

from camel_tools.data.catalogue import Catalogue
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.disambig.mle import MLEDisambiguator
from datasets import Dataset
Expand Down Expand Up @@ -53,6 +54,25 @@ def __init__(self, timeout: int = 3600, model: str = "mle", use_gpu: bool = Fals

def _init_disambiguator(self):
"""Initialize and configure the disambiguator with caching."""
# Install the required CAMeL Tools data packages: the msa-r13 packages are always fetched, the BERT package only when the BERT model is selected
logger.info("Checking CAMeL Tools data packages...")
catalogue = Catalogue.load_catalogue()

try:
catalogue.download_package("morphology-db-msa-r13")
catalogue.download_package("disambig-mle-calima-msa-r13")
logger.info("msa-r13 packages installed")
except Exception as e:
logger.warning(f"Package installation warning: {e}")

# Install BERT package if using BERT model
if self.model == "bert":
try:
catalogue.download_package("disambig-bert-unfactored-all")
logger.info("BERT package installed")
except Exception as e:
logger.warning(f"BERT package installation warning: {e}")

if self.model == "mle":
self.disambiguator = MLEDisambiguator.pretrained()
logger.info("MLE disambiguator loaded")
Expand Down
31 changes: 22 additions & 9 deletions dalla_data_processing/stemming/stemmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,12 +473,19 @@ def stem_dataset(
catalogue = Catalogue.load_catalogue()
try:
catalogue.download_package("morphology-db-msa-r13")
if model == "mle":
catalogue.download_package("disambig-mle-calima-msa-r13")
# For BERT, let it download automatically when pretrained() is called
logger.info("CAMeL Tools data packages ready")
catalogue.download_package("disambig-mle-calima-msa-r13")
logger.info("msa-r13 packages installed")
except Exception as e:
logger.warning(f"Could not verify CAMeL packages: {e}")
logger.warning(f"Package installation warning: {e}")

if model == "bert":
try:
catalogue.download_package("disambig-bert-unfactored-all")
logger.info("BERT package installed")
except Exception as e:
logger.warning(f"BERT package installation warning: {e}")

logger.info("CAMeL Tools data packages ready")

logger.info("Loading additional words lists...")
words_dir = os.path.join(os.path.dirname(__file__), "data")
Expand Down Expand Up @@ -597,15 +604,21 @@ def stem(
if not all(isinstance(t, str) for t in text_list):
raise TypeError("All items in text list must be strings")

# Initialize disambiguator (cached globally if possible)
logger.info(f"Initializing {model.upper()} disambiguator...")
catalogue = Catalogue.load_catalogue()
try:
catalogue.download_package("morphology-db-msa-r13")
if model == "mle":
catalogue.download_package("disambig-mle-calima-msa-r13")
catalogue.download_package("disambig-mle-calima-msa-r13")
logger.info("msa-r13 packages installed")
except Exception as e:
logger.warning(f"Could not verify CAMeL packages: {e}")
logger.warning(f"Package installation warning: {e}")

if model == "bert":
try:
catalogue.download_package("disambig-bert-unfactored-all")
logger.info("BERT package installed")
except Exception as e:
logger.warning(f"BERT package installation warning: {e}")

if model == "mle":
disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
Expand Down
11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [
{name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
]
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.12,<3.13"
keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
classifiers = [
"Intended Audience :: Developers",
Expand Down Expand Up @@ -39,23 +39,24 @@ dev = [
"pre-commit>=3.0.0",
]
dedup = [
"camel-tools>=1.5.0",
"camel-tools==1.5.7",
]
dedup-native = [
"cffi>=1.15.0",
]
stem = [
"camel-tools>=1.5.0",
"camel-tools==1.5.7",
]
quality = [
"camel-tools>=1.5.0",
"camel-tools==1.5.7",
]
readability = [
"textstat>=0.7.0",
]
pack = [
"sentencepiece>=0.2.0",
"rbpe",
# "rbpe", # excluded due to transformers version conflict with camel-tools
# Users who need rbpe should install it separately: pip install rbpe
"pyyaml",
]
all = [
Expand Down
Loading