perf(eval): skip unnecessary logit array copies during native sampling

JamePeng · JamePeng · commit 615e45a47f47 · 2026-05-23T13:58:10.000+08:00
- Introduce the `copy_logits` parameter to `Llama.eval()` to control
  whether C-level logits are copied into the Python `self.scores` array.
- Automatically disable `copy_logits` during the generation loop unless
  Python-side hooks (`logits_processor`, `stopping_criteria`) or
  `logits_all` explicitly require them.
- Skip logit copies entirely for intermediate prompt evaluations (e.g.,
  before hybrid checkpoints).
- Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch
  the final token's logits when copying is required.

In a PDF-reading summarization workload, this reduced the end-to-end completion
time from 41.32s to 25.93s, a ~37.2% improvement. The main generation hot path
also improved noticeably:

- `_create_completion`: 41.32s -> 25.93s
- `generate`: 37.82s -> below the top sampled entries
- `eval`: 35.14s -> 21.96s
- logits retrieval/copy path: 29.89s `get_logits()` -> 18.68s `get_logits_ith()`
- `decode`: 3.89s -> 2.25s
- `detokenize`: 2.60s -> 1.33s
- `sample`: 2.35s -> 2.03s

This significantly reduces CPU overhead and memory bandwidth during generation,
as the native `llama.cpp` sampler reads directly from the C context without
needing to expose the `n_vocab` array to Python on every token.

Signed-off-by: JamePeng <jame_peng@sina.com>
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -1035,11 +1035,20 @@ def eval(
             tokens: Sequence[int],
             active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
             control_vector: Optional[Dict[str, Any]] = None,
+            copy_logits: bool = True,
     ):
         """Evaluate a list of tokens.
 
         Args:
-            tokens: The list of tokens to evaluate.
+            tokens: The token ids to evaluate.
+            active_loras: Optional LoRA adapters to apply for this evaluation.
+                Each item should contain a ``name`` and an optional ``scale``.
+            control_vector: Optional control vector configuration to apply during
+                this evaluation.
+            copy_logits: Whether to copy the final logits into ``self.scores`` when
+                ``logits_all`` is disabled. Set to ``False`` for native sampler paths
+                that sample directly from the llama context and do not need
+                Python-side logits.
         """
         n_eval = len(tokens)
         if n_eval == 0:
@@ -1246,9 +1255,11 @@ def eval(
                         if self.verbose:
                             print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr)
 
-        # Save the final logit if not in _logits_all mode
-        if not self._logits_all:
-            logits_ptr = self._ctx.get_logits()
+        # Save the final logits only when Python-side logits are required.
+        # Native sampler can sample directly from ctx, so normal generation does not
+        # need to copy n_vocab floats into self.scores on every token.
+        if not self._logits_all and copy_logits:
+            logits_ptr = self._ctx.get_logits_ith(-1)
             logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,))
             self.scores[0, :] = logits_view
 
@@ -1666,6 +1677,14 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
 
         self._sampling_ctx = LlamaSamplingContext(params, self._model)
 
+        # Native sampler samples directly from ctx. Python-side logits are only needed
+        # for compatibility hooks that explicitly consume self._scores.
+        copy_logits = (
+            self._logits_all
+            or logits_processor is not None
+            or stopping_criteria is not None
+        )
+
         sample_idx = self.n_tokens + len(tokens) - 1
         tokens = list(tokens)
 
@@ -1685,8 +1704,13 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
                         body_tokens = tokens[:-1]
                         last_token = [tokens[-1]]
 
-                        # 1. Evaluate up to N-1
-                        self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector)
+                        # 1. Evaluate up to N-1 without copying logits.
+                        self.eval(
+                            body_tokens,
+                            active_loras=active_loras,
+                            control_vector=control_vector,
+                            copy_logits=False,
+                        )
 
                         # 2. Save the N-1 state snapshot
                         current_history = self._input_ids[:self.n_tokens].tolist()
@@ -1695,11 +1719,21 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
                             tokens=current_history,
                             seq_id=0
                         )
-                        # 3. Evaluate the final token to refresh logits
-                        self.eval(last_token, active_loras=active_loras, control_vector=control_vector)
+                        # 3. Evaluate final token. Copy logits only if Python-side hooks need them.
+                        self.eval(
+                            last_token,
+                            active_loras=active_loras,
+                            control_vector=control_vector,
+                            copy_logits=copy_logits,
+                        )
                     else:
                         # Standard evaluation or single-token generation step
-                        self.eval(tokens, active_loras=active_loras, control_vector=control_vector)
+                        self.eval(
+                            tokens,
+                            active_loras=active_loras,
+                            control_vector=control_vector,
+                            copy_logits=copy_logits,
+                        )
 
                 # Sample loop
                 while sample_idx < self.n_tokens: