@@ -772,6 +772,31 @@ def copy_logits(self, logits: npt.NDArray[np.single]):
772772 self .candidates .sorted = False
773773 self .candidates .selected = - 1
774774
def close(self):
    """
    Drop every NumPy buffer and C-structure reference held by this object.

    Each attribute listed below is rebound to ``None``; nothing is freed
    explicitly here, the objects are simply no longer referenced from this
    instance.
    """
    released_attrs = (
        "candidates_data",  # main structured NumPy buffer (id, logit, prob)
        "_id_view",         # cached NumPy field views
        "_logit_view",
        "_p_view",
        "_default_ids",     # precomputed default token id array
        "candidates",       # C structure; cleared so no stale pointer remains
    )
    for attr_name in released_attrs:
        setattr(self, attr_name, None)
792+
def __del__(self):
    """Finalizer fallback: run close() when it was not called explicitly."""
    try:
        self.close()
    except Exception:
        # Cleanup here is best-effort; errors raised by close() are
        # deliberately discarded.
        return
799+
775800
776801# Python wrappers over common/sampling structs
777802# common/common.h common_params_sampling
@@ -1254,16 +1279,45 @@ def sample(
12541279
def close(self):
    """
    Release all sampling-related resources and break references
    to large buffers to allow Python GC to reclaim memory.

    Closes the grammar sampler, the sampler chain and the current
    token-data array (when present), clears the token history, and resets
    the cached logits view and the single-token helper structs to ``None``.

    The method is idempotent and tolerates partially initialised
    instances: every resource is looked up with a ``None`` fallback, so a
    second call (for example an explicit ``close()`` followed by the one
    issued from ``__del__``) or a call after a failed ``__init__`` does
    not raise.
    """
    # Free the grammar sampler if it was initialized.
    grammar_sampler = getattr(self, "grammar_sampler", None)
    if grammar_sampler:
        grammar_sampler.close()
        self.grammar_sampler = None

    # Free the sampler chain.
    sampler_chain = getattr(self, "sampler_chain", None)
    if sampler_chain:
        sampler_chain.close()
        self.sampler_chain = None

    # Release the token data buffer used during sampling.
    cur_p = getattr(self, "_cur_p", None)
    if cur_p is not None:
        try:
            cur_p.close()
        except Exception:
            # Best-effort: a failure while releasing the buffer must not
            # prevent the remaining references from being dropped.
            pass
        self._cur_p = None

    # Clear the token history to drop references. The ``is not None`` guard
    # matters: after a first close() the attribute still exists but is None,
    # and calling .clear() on it would raise AttributeError.
    prev = getattr(self, "prev", None)
    if prev is not None:
        prev.clear()
        self.prev = None

    # Remove the NumPy view of the logits buffer.
    self._logits_view = None

    # Break references to the small C structs used in grammar rejection
    # sampling.
    self._single_token = None
    self._single_array = None
1320+
12671321 def __del__ (self ):
12681322 try :
12691323 self .close ()
0 commit comments