Skip to content

Commit 23423fb

Browse files
Peter Johnson
authored and committed
Create pkl (words) for context up to n=8; add bz2 compression
1 parent cfb4e3b commit 23423fb

11 files changed

Lines changed: 53 additions & 27 deletions

evaluation_function/models/shannon_words_ngram.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -28,36 +28,36 @@ def log(msg):
2828
# If creating when deployed:
2929
#FILE = Path(tempfile.gettempdir()) / "ngram_counts.pkl"
3030
# If creating locally, to be copied when deployed:
31-
FILE = MODEL_DIR / "ngram_counts.pkl"
31+
FILE_BASE = MODEL_DIR / "ngram_counts.pkl.bz2"
3232

3333
def get_counts(n=3, dev=False):
3434
print(f"Loading/building n-gram counts for n={n}...")
35-
if os.path.exists(FILE):
35+
FILE_NAME = FILE_BASE.with_name(FILE_BASE.stem + f"_{n:02d}" + "".join(FILE_BASE.suffixes))
36+
if os.path.exists(FILE_NAME):
3637
try:
37-
with open(FILE, "rb") as f:
38+
with bz2.BZ2File(FILE_NAME, "rb") as f:
3839
cache = pickle.load(f)
3940
if not isinstance(cache, dict):
4041
raise RuntimeError(f"Loaded cache is {type(cache)}, not dict — contents: {str(cache)[:300]}")
41-
if n not in cache:
42-
raise RuntimeError(f"Loaded keys={list(cache.keys())[:10]} (len={len(cache)}) — expected {n}")
42+
#if n not in cache:
43+
# raise RuntimeError(f"Loaded keys={list(cache.keys())[:10]} (len={len(cache)}) — expected {n}")
4344
except Exception as e:
44-
raise RuntimeError(f"Failed to load {FILE}: {e}")
45+
raise RuntimeError(f"Failed to load {FILE_NAME}: {e}")
4546
elif dev: # from here the deployed version will not work because the corpora are not bundled (to save space)
4647
cache = {}
4748
print(f"Building n-gram counts from NLTK corpora (dev mode)")
4849
try:
4950
if n not in cache:
50-
print(f"Building n={n} counts...")
51-
cache[n] = build_counts(n, START, END) # only works if NLTK corpora are available
52-
print(f"Saving n-gram counts to {FILE}...")
53-
with open(FILE, "wb") as f:
54-
pickle.dump(cache,f)
51+
print(f"Starting building counts up to n={n} ...")
52+
build_counts(FILE_BASE, list(range(1, n + 1)), START, END) # only works if NLTK corpora are available
53+
with bz2.BZ2File(FILE_NAME, "rb") as f:
54+
cache = pickle.load(f)
5555
except Exception as e:
5656
raise RuntimeError(f"Failed to rebuild or save n-gram counts {e}")
5757
else:
5858
raise FileNotFoundError(f"N-gram counts file not found at {FILE}, and dev mode is off so counts not generated.")
5959
print(f"Loaded cache is {type(cache)}, — contents: {str(cache)[:300]}")
60-
counts = cache[n]
60+
counts = cache
6161
if n == 1:
6262
counts.setdefault((), {}) # CHANGE: ensure unigram context exists
6363
counts[()].pop(END, None) # CHANGE: drop </s> if present (old caches)

evaluation_function/models/storage/ngram_counts.pkl

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:c5d8b1c820f7c17ec499651e125e48f3e02c0cc8bf524edf69259541ca7bbe4e
3+
size 451494
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:be05894ea4878520850161b7ba864b1f68f1780f571232f0d7c3e618f3232462
3+
size 3711212
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:bc26a3940cf9ab4ab85f48900218320997a7f184cd8e2e345dce39e8e2f64c17
3+
size 14369999
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:600106349b271d69ee3dfc5d9168f03e78239788d7bb30ea0a4e2c0c3d7db0b1
3+
size 31263714
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:b5e5e2f72d6740d8572ae5a8d41d6a39f511f7e809c9249a42678b78cf990081
3+
size 39222247
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:fc16262131b93f909597bf950815d7f85703645182b5a2fd8be30c1b3b0f74db
3+
size 41750737
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:741e9fda02f42409da78d59c2959b3d6c273a451953d5ebeb8a870736f9e8d84
3+
size 43868262
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:c865ca124783d95c67ed7bbd593d805fc1f5f235ee02ef7285adb8781b6d6dbb
3+
size 44082806

0 commit comments

Comments
 (0)