Skip to content

Commit 615e45a

Browse files
committed
perf(eval): skip unnecessary logit array copies during native sampling
- Introduce the `copy_logits` parameter to `Llama.eval()` to control whether C-level logits are copied into the Python `self.scores` array.
- Automatically disable `copy_logits` during the generation loop unless Python-side hooks (`logits_processor`, `stopping_criteria`) or `logits_all` explicitly require the logits.
- Skip logit copies entirely for intermediate prompt evaluations (e.g., before hybrid checkpoints).
- Update logit retrieval to use `get_logits_ith(-1)` to accurately fetch the final token's logits when copying is required.

In a PDF-reading summarization workload, this reduced the end-to-end completion time from 41.32s to 25.93s, a ~37.2% improvement. The main generation hot path also improved noticeably:

- `_create_completion`: 41.32s -> 25.93s
- `generate`: 37.82s -> dropped below the top sampled entries
- `eval`: 35.14s -> 21.96s
- logits retrieval/copy path: 29.89s (`get_logits()`) -> 18.68s (`get_logits_ith()`)
- `decode`: 3.89s -> 2.25s
- `detokenize`: 2.60s -> 1.33s
- `sample`: 2.35s -> 2.03s

This significantly reduces CPU overhead and memory bandwidth during generation, as the native `llama.cpp` sampler reads directly from the C context without needing to expose the `n_vocab`-sized logits array to Python on every token.

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 7e0cd12 commit 615e45a

1 file changed

Lines changed: 43 additions & 9 deletions

File tree

llama_cpp/llama.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,11 +1035,20 @@ def eval(
10351035
tokens: Sequence[int],
10361036
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
10371037
control_vector: Optional[Dict[str, Any]] = None,
1038+
copy_logits: bool = True,
10381039
):
10391040
"""Evaluate a list of tokens.
10401041
10411042
Args:
1042-
tokens: The list of tokens to evaluate.
1043+
tokens: The token ids to evaluate.
1044+
active_loras: Optional LoRA adapters to apply for this evaluation.
1045+
Each item should contain a ``name`` and an optional ``scale``.
1046+
control_vector: Optional control vector configuration to apply during
1047+
this evaluation.
1048+
copy_logits: Whether to copy the final logits into ``self.scores`` when
1049+
``logits_all`` is disabled. Set to ``False`` for native sampler paths
1050+
that sample directly from the llama context and do not need
1051+
Python-side logits.
10431052
"""
10441053
n_eval = len(tokens)
10451054
if n_eval == 0:
@@ -1246,9 +1255,11 @@ def eval(
12461255
if self.verbose:
12471256
print(f"Llama.eval: [Periodic Checkpoint] HybridCheckpoint save failed at pos {current_pos}, skipping update", file=sys.stderr)
12481257

1249-
# Save the final logit if not in _logits_all mode
1250-
if not self._logits_all:
1251-
logits_ptr = self._ctx.get_logits()
1258+
# Save the final logits only when Python-side logits are required.
1259+
# Native sampler can sample directly from ctx, so normal generation does not
1260+
# need to copy n_vocab floats into self.scores on every token.
1261+
if not self._logits_all and copy_logits:
1262+
logits_ptr = self._ctx.get_logits_ith(-1)
12521263
logits_view = np.ctypeslib.as_array(logits_ptr, shape=(self._n_vocab,))
12531264
self.scores[0, :] = logits_view
12541265

@@ -1666,6 +1677,14 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
16661677

16671678
self._sampling_ctx = LlamaSamplingContext(params, self._model)
16681679

1680+
# Native sampler samples directly from ctx. Python-side logits are only needed
1681+
# for compatibility hooks that explicitly consume self._scores.
1682+
copy_logits = (
1683+
self._logits_all
1684+
or logits_processor is not None
1685+
or stopping_criteria is not None
1686+
)
1687+
16691688
sample_idx = self.n_tokens + len(tokens) - 1
16701689
tokens = list(tokens)
16711690

@@ -1685,8 +1704,13 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
16851704
body_tokens = tokens[:-1]
16861705
last_token = [tokens[-1]]
16871706

1688-
# 1. Evaluate up to N-1
1689-
self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector)
1707+
# 1. Evaluate up to N-1 without copying logits.
1708+
self.eval(
1709+
body_tokens,
1710+
active_loras=active_loras,
1711+
control_vector=control_vector,
1712+
copy_logits=False,
1713+
)
16901714

16911715
# 2. Save the N-1 state snapshot
16921716
current_history = self._input_ids[:self.n_tokens].tolist()
@@ -1695,11 +1719,21 @@ def adapter(token_data_array: llama_cpp_lib.llama_token_data_array):
16951719
tokens=current_history,
16961720
seq_id=0
16971721
)
1698-
# 3. Evaluate the final token to refresh logits
1699-
self.eval(last_token, active_loras=active_loras, control_vector=control_vector)
1722+
# 3. Evaluate final token. Copy logits only if Python-side hooks need them.
1723+
self.eval(
1724+
last_token,
1725+
active_loras=active_loras,
1726+
control_vector=control_vector,
1727+
copy_logits=copy_logits,
1728+
)
17001729
else:
17011730
# Standard evaluation or single-token generation step
1702-
self.eval(tokens, active_loras=active_loras, control_vector=control_vector)
1731+
self.eval(
1732+
tokens,
1733+
active_loras=active_loras,
1734+
control_vector=control_vector,
1735+
copy_logits=copy_logits,
1736+
)
17031737

17041738
# Sample loop
17051739
while sample_idx < self.n_tokens:

0 commit comments

Comments
 (0)