@@ -93,6 +93,25 @@ RUN apt-get install -y git-lfs && \
     apt-get install -y xvfb && \
     /tmp/clean-layer.sh

+RUN uv pip install --system --force-reinstall "nltk==3.9.1"
+RUN mkdir -p /usr/share/nltk_data && \
+    # NLTK Downloader no longer continues smoothly after an error, so we explicitly list
+    # the corpuses that work
+    python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
+    basque_grammars biocreative_ppi bllip_wsj_no_aux \
+    book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \
+    comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \
+    europarl_raw floresta gazetteers genesis gutenberg \
+    ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \
+    masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \
+    mte_teip5 names nps_chat omw opinion_lexicon paradigms \
+    pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \
+    pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
+    sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
+    state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
+    twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
+    vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe
+
 # Download base easyocr models.
 # https://github.com/JaidedAI/EasyOCR#usage
 RUN mkdir -p /root/.EasyOCR/model && \
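The download step places the corpora under /usr/share/nltk_data, which is already on NLTK's default data search path on Linux, so downstream code should find them without setting NLTK_DATA. A minimal sanity check, assuming an image built from this Dockerfile (the resource names are taken from the download list above):

```python
# Sketch of a post-build check, assuming /usr/share/nltk_data is populated
# by the RUN step above and is on nltk.data.path (the Linux default).
import nltk
from nltk.corpus import brown, stopwords

# Resolve a couple of resources explicitly; raises LookupError if missing.
print(nltk.data.find("corpora/stopwords"))
print(nltk.data.find("tokenizers/punkt_tab"))

# Read from corpora included in the download list.
print(brown.words()[:10])
print(stopwords.words("english")[:10])

# word_tokenize relies on punkt/punkt_tab, both of which are downloaded.
print(nltk.word_tokenize("NLTK data is baked into the image."))
```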