refactor(internals): align model metadata wrappers with llama.cpp API

JamePeng · JamePeng · commit 4a6c311364ca · 2026-05-28T00:55:17.000+08:00
- Use `llama_vocab_n_tokens()` instead of the old vocab size helper.
 - Add Python wrappers for model description, size, chat template, and
  trained RoPE frequency scaling.
 - Clarify model capability helpers with docstrings matching llama.cpp
  semantics.
 - Rename `desc()` and `size()` to `model_desc()` and `model_size()` to
  make their scope explicit.
 - Drop the unused `get_tensor()` stub since llama.cpp does not expose it.
 - Route rerank template lookup through `LlamaModel.model_chat_template()` for
  consistency with the internal model abstraction.

Signed-off-by: JamePeng <jame_peng@sina.com>
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -102,7 +102,7 @@ def vocab_type(self) -> int:
         return llama_cpp.llama_vocab_type(self.model)
 
     def n_vocab(self) -> int:
-        return llama_cpp.llama_n_vocab(self.vocab)
+        return llama_cpp.llama_vocab_n_tokens(self.vocab)
 
     def n_ctx_train(self) -> int:
         return llama_cpp.llama_model_n_ctx_train(self.model)
@@ -131,41 +131,76 @@ def n_head_kv(self) -> int:
     def n_swa(self) -> int:
         return llama_cpp.llama_model_n_swa(self.model)
 
+    def rope_freq_scale_train(self) -> float:
+        """
+        Return the RoPE frequency scaling factor the model was trained with.
+        """
+        return llama_cpp.llama_model_rope_freq_scale_train(self.model)
+
+    def model_desc(self) -> str:
+        """
+        Return the model-type description that llama.cpp writes into a 256-byte buffer.
+        """
+        desc_buf = (ctypes.c_char * 256)()
+        llama_cpp.llama_model_desc(self.model, desc_buf, len(desc_buf))
+        return desc_buf.value.decode("utf-8")
+
+    def model_size(self) -> int:
+        """
+        Return the total size of all the tensors in the model, in bytes.
+        """
+        return llama_cpp.llama_model_size(self.model)
+
+    def model_chat_template(self, name: bytes) -> "Optional[str]":
+        """
+        Get a chat template (the default one if name is None); returns None if not available
+        """
+        template = llama_cpp.llama_model_chat_template(self.model, name)
+        return template.decode("utf-8") if template is not None else None
+
     def n_params(self) -> int:
+        """
+        Return the total number of parameters in the model.
+        """
         return llama_cpp.llama_model_n_params(self.model)
 
     def has_encoder(self) -> bool:
+        """
+        Return True if the model contains an encoder that requires a llama_encode() call.
+        """
         return llama_cpp.llama_model_has_encoder(self.model)
 
     def has_decoder(self) -> bool:
+        """
+        Return True if the model contains a decoder that requires a llama_decode() call.
+        """
         return llama_cpp.llama_model_has_decoder(self.model)
 
     def decoder_start_token(self) -> int:
+        """
+        For encoder-decoder models, return the id of the token that must be provided to the
+        decoder to start generating the output sequence. For other models, return -1.
+        """
         return llama_cpp.llama_model_decoder_start_token(self.model)
 
     def is_recurrent(self) -> bool:
+        """
+        Return True if the model is recurrent (like Mamba, RWKV, etc.).
+        """
         return llama_cpp.llama_model_is_recurrent(self.model)
 
     def is_hybrid(self) -> bool:
+        """
+        Return True if the model is hybrid (like Jamba, Granite, etc.).
+        """
         return llama_cpp.llama_model_is_hybrid(self.model)
 
     def is_diffusion(self) -> bool:
+        """
+        Return True if the model is diffusion-based (like LLaDA, Dream, etc.).
+        """
         return llama_cpp.llama_model_is_diffusion(self.model)
 
-    def rope_freq_scale_train(self) -> float:
-        return llama_cpp.llama_model_rope_freq_scale_train(self.model)
-
-    def desc(self) -> str:
-        buf = ctypes.create_string_buffer(1024)
-        llama_cpp.llama_model_desc(self.model, buf, 1024)
-        return buf.value.decode("utf-8")
-
-    def size(self) -> int:
-        return llama_cpp.llama_model_size(self.model)
-
-    def get_tensor(self, name: str) -> ctypes.c_void_p:
-        raise NotImplementedError("get_tensor is not implemented in llama.cpp")
-
     # Vocab
 
     def token_get_text(self, token: int) -> str:
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -696,13 +696,20 @@ def __init__(
 
         try:
             self.metadata = self._model.metadata()
+            self.model_desc = self._model.model_desc()
+            # The total size of all the tensors in the model in bytes
+            self.model_size = self._model.model_size()
+
         except Exception as e:
             self.metadata = {}
             if self.verbose:
                 print(f"Failed to load metadata: {e}", file=sys.stderr)
 
         if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+            print(f"Model desc: {self.model_desc}, "
+                  f"Model size: {self.model_size / (1024 * 1024):.2f} MB, "
+                  f"Model metadata: {self.metadata}",
+                  file=sys.stderr)
 
         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
diff --git a/llama_cpp/llama_embedding.py b/llama_cpp/llama_embedding.py
@@ -303,9 +303,7 @@ def rank(self, query: str, documents: List[str]) -> List[float]:
 
         # 1. Attempt to retrieve the built-in 'rerank' chat template from model metadata.
         # Modern GGUF models often include a template for formatting query/document pairs.
-        rerank_template = llama_cpp.llama_model_chat_template(self._model.model, b"rerank")
-        if rerank_template:
-            rerank_template = rerank_template.decode("utf-8")
+        rerank_template = self._model.model_chat_template(b"rerank")
 
         batch_inputs: List[List[int]] = []