Skip to content

Commit 8f5c9b6

Browse files
committed
feat: Add explicit memory cleanup for sampling contexts
Implements `close()` and `__del__` for LlamaTokenDataArray and expands LlamaSamplingContext cleanup. Ensures NumPy views and internal C-references are properly released to allow Python GC to reclaim memory.
1 parent c6c85b1 commit 8f5c9b6

1 file changed

Lines changed: 55 additions & 1 deletion

File tree

llama_cpp/_internals.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,31 @@ def copy_logits(self, logits: npt.NDArray[np.single]):
772772
self.candidates.sorted = False
773773
self.candidates.selected = -1
774774

775+
def close(self):
    """
    Drop every NumPy buffer and C-structure reference held by this object.

    Each attribute listed below is rebound to ``None``, in order; nothing
    else is touched, so calling this more than once is harmless.
    """
    released_attrs = (
        # Main structured NumPy buffer holding token data (id, logit, prob)
        "candidates_data",
        # Cached NumPy field views over that buffer
        "_id_view",
        "_logit_view",
        "_p_view",
        # Precomputed default token id array
        "_default_ids",
        # Cleared last so that no stale pointer references remain
        "candidates",
    )
    for attr_name in released_attrs:
        setattr(self, attr_name, None)
793+
def __del__(self):
    """
    Finalizer fallback: run ``close()`` in case it was never called explicitly.

    Any ``Exception`` raised by ``close()`` is deliberately discarded so that
    object finalization stays silent.
    """
    try:
        self.close()
    except Exception:
        # Best-effort cleanup only; there is nothing useful to do on failure.
        return
799+
775800

776801
# Python wrappers over common/sampling structs
777802
# common/common.h common_params_sampling
@@ -1254,16 +1279,45 @@ def sample(
12541279

12551280
def close(self):
    """
    Release all sampling-related resources and break references
    to large buffers to allow Python GC to reclaim memory.

    Call this when the sampling context is no longer needed, especially in
    long-running services, to prevent memory retention.

    The method is idempotent: a second call (for example from ``__del__``
    after an explicit ``close()``) is a no-op. It also tolerates attributes
    that were never assigned, so it is safe on a partially constructed object.
    """
    # Free the grammar sampler if it was initialized; its close() is
    # expected to release the underlying llama.cpp sampler memory.
    grammar_sampler = getattr(self, "grammar_sampler", None)
    if grammar_sampler:
        grammar_sampler.close()
    self.grammar_sampler = None

    # Free the sampler chain and all attached C samplers.
    sampler_chain = getattr(self, "sampler_chain", None)
    if sampler_chain:
        sampler_chain.close()
    self.sampler_chain = None

    # Release the large token data buffer used during sampling.
    # Important for high-vocab models to avoid memory retention.
    cur_p = getattr(self, "_cur_p", None)
    if cur_p is not None:
        try:
            cur_p.close()
        except Exception:
            # Best effort: a failure while releasing the buffer must not
            # prevent the remaining references below from being dropped.
            pass
    self._cur_p = None

    # Clear the token history container to drop references. The None check
    # matters: after a first close() ``prev`` is None and has no clear().
    prev = getattr(self, "prev", None)
    if prev is not None:
        prev.clear()
    self.prev = None

    # Remove the NumPy view pointing to the llama logits buffer.
    self._logits_view = None

    # Break references to small C structs used in grammar rejection sampling.
    self._single_token = None
    self._single_array = None
1320+
12671321
def __del__(self):
12681322
try:
12691323
self.close()

0 commit comments

Comments
 (0)