Commit f9b5313

docs(readme): add documentation for Assistant Prefill features

- Also slightly updated the `huggingface_hub` installation instructions for accuracy.

Signed-off-by: JamePeng <jame_peng@sina.com>

Parent: 3ead090

3 files changed: 46 additions & 10 deletions

README.md (41 additions & 5 deletions)
@@ -344,20 +344,23 @@ By default `llama-cpp-python` generates completions in an OpenAI compatible form
 
 Text completion is available through the [`__call__`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__) and [`create_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion) methods of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
 
-### Pulling models from Hugging Face Hub
+### Pulling models from [Hugging Face Hub](https://huggingface.co/models)
 
 You can download `Llama` models in `gguf` format directly from Hugging Face using the [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) method.
-You'll need to install the `huggingface-hub` package to use this feature (`pip install huggingface-hub`).
+
+You'll need to install the `huggingface_hub` package to use this feature (`pip install --upgrade huggingface_hub`).
+
 
 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
     verbose=False
 )
 ```
 
-By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
+By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`hf`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
 
 ### Chat Completion
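The hunk above replaces a wildcard `filename` with an exact one. For wildcard filenames, `from_pretrained` resolves the pattern against the repo's file listing with glob-style matching, roughly as in this sketch (the file list below is illustrative, not a real repo listing):

```python
from fnmatch import fnmatch

# Hypothetical file listing of a GGUF repo (illustrative only)
repo_files = [
    "qwen2.5-0.5b-instruct-q4_k_m.gguf",
    "qwen2.5-0.5b-instruct-q8_0.gguf",
    "README.md",
]

def resolve_filename(files, pattern):
    """Return the files matching a glob-style pattern."""
    return [f for f in files if fnmatch(f, pattern)]

print(resolve_filename(repo_files, "*q8_0.gguf"))
# ['qwen2.5-0.5b-instruct-q8_0.gguf']
```

An exact filename, as in the updated example, simply matches itself, which avoids surprises when a repo later gains more files that happen to match a pattern.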

@@ -521,6 +524,39 @@ llm = Llama.from_pretrained(
 
 ---
 
+## Continuing Assistant Responses (Prefill)
+
+`llama-cpp-python` supports native **Assistant Prefill** for seamless message continuation: pass the `assistant_prefill=True` parameter to `create_chat_completion`.
+
+This renders the first `N-1` messages of the conversation using the model's standard Jinja chat template (preserving exact control tokens) and appends your partial assistant text directly to the prompt.
+
+```python
+from llama_cpp import Llama
+
+llm = Llama(model_path="path/to/model.gguf")
+
+# An interrupted/partial conversation
+messages = [
+    {"role": "user", "content": "What are the first 5 planets in the solar system?"},
+    {"role": "assistant", "content": "The first 5 planets in our solar system are:\n1. Mercury\n2."}
+]
+
+# Continue the generation from the partial assistant message
+response = llm.create_chat_completion(
+    messages=messages,
+    max_tokens=50,
+    assistant_prefill=True  # <--- Enables seamless continuation
+)
+
+prefilled_text = messages[-1]["content"]
+# The model continues from the prefill, e.g. " Venus\n3. Earth..."
+generated_text = response["choices"][0]["message"]["content"]
+
+print(prefilled_text + generated_text)
+```
+
+---
+
 ## Dynamic LoRA Routing & Control Vectors (Multi-Tenant Serving)
 
 Historically, `llama-cpp-python` only supported "static loading" where a LoRA was permanently baked into the context during initialization. Switching personas required reloading the entire model or duplicating it in VRAM.
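The prefill mechanism the new README section describes can be pictured with a small self-contained sketch: render all but the last message with full control tokens, then append the partial assistant text *without* its end-of-turn token, so generation resumes mid-message. The ChatML-style template here is illustrative only, not the library's actual Jinja renderer:

```python
def build_prefill_prompt(messages):
    """Render a conversation ChatML-style, leaving the trailing
    partial assistant message open so generation continues it."""
    prompt = ""
    # Render the first N-1 messages with full control tokens
    for m in messages[:-1]:
        prompt += f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n"
    # Append the partial last message with no end-of-turn token
    last = messages[-1]
    prompt += f"<|im_start|>{last['role']}\n{last['content']}"
    return prompt

messages = [
    {"role": "user", "content": "What are the first 5 planets in the solar system?"},
    {"role": "assistant", "content": "The first 5 planets in our solar system are:\n1. Mercury\n2."},
]
print(build_prefill_prompt(messages))
```

Because the prompt ends exactly at the partial text, the model's next tokens continue the list rather than starting a fresh assistant turn.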

llama_cpp/llama.py (3 additions & 3 deletions)
@@ -3141,8 +3141,8 @@ def from_pretrained(
     **kwargs: Any,
 ) -> "Llama":
     """Create a Llama model from a pretrained model name or path.
-    This method requires the huggingface-hub package.
-    You can install it with `pip install huggingface-hub`.
+    This method requires the huggingface_hub package.
+    You can install it with `pip install --upgrade huggingface_hub`.
 
     Args:
         repo_id: The model repo id.

@@ -3160,7 +3160,7 @@ def from_pretrained(
     except ImportError:
         raise ImportError(
             "Llama.from_pretrained requires the huggingface-hub package. "
-            "You can install it with `pip install huggingface-hub`."
+            "You can install it with `pip install --upgrade huggingface_hub`."
         )
 
     validate_repo_id(repo_id)
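The pattern this hunk touches, importing an optional dependency and re-raising with install instructions, can be sketched generically. The helper name below is hypothetical, not part of llama-cpp-python:

```python
import importlib

def require_optional(module_name, pip_name):
    """Import an optional dependency, or raise an ImportError
    that tells the user how to install it."""
    try:
        return importlib.import_module(module_name)
    except ImportError:
        raise ImportError(
            f"This feature requires the {module_name} package. "
            f"You can install it with `pip install --upgrade {pip_name}`."
        )

json_mod = require_optional("json", "json")  # stdlib module: import succeeds
```

Deferring the import into the method body keeps `huggingface_hub` optional: users who never call `from_pretrained` never need it installed.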

llama_cpp/llama_chat_format.py (2 additions & 2 deletions)
@@ -3758,8 +3758,8 @@ def from_pretrained(
         from huggingface_hub.utils import validate_repo_id  # type: ignore
     except ImportError:
         raise ImportError(
-            "Llama.from_pretrained requires the huggingface-hub package. "
-            "You can install it with `pip install huggingface-hub`."
+            "Llama.from_pretrained requires the huggingface_hub package. "
+            "You can install it with `pip install --upgrade huggingface_hub`."
         )
 
     validate_repo_id(repo_id)
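After the import succeeds, `validate_repo_id` rejects malformed `"namespace/name"` ids before any network call. A rough stand-in for that check might look like the following; the regex is illustrative only and does not reproduce `huggingface_hub`'s actual rules:

```python
import re

def check_repo_id(repo_id):
    """Rough sanity check for a Hugging Face repo id ("namespace/name").
    Illustrative only -- huggingface_hub's validate_repo_id enforces
    the real rules."""
    if not re.fullmatch(r"[\w.\-]+/[\w.\-]+", repo_id):
        raise ValueError(f"Not a valid repo id: {repo_id!r}")
    return repo_id

check_repo_id("Qwen/Qwen2.5-0.5B-Instruct-GGUF")  # passes
```

Failing fast on a bad id gives a clear error instead of a confusing 404 from the Hub later on.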
