70 changes: 31 additions & 39 deletions examples/bench.py
@@ -1,10 +1,10 @@
import infinicore
from transformers import AutoTokenizer
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.base_config import BaseConfig
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.tokenizer_utils import InfiniLMTokenizer
import argparse
import sys
import time
@@ -42,9 +42,9 @@ def get_test_cases(
input_len_list: list[int],
output_len_list: list[int],
):
"""Generate cases ordered by ascending KV cache memory usage."""
model_path = os.path.expanduser(model_path)

"""Generate cases ordered by ascending KV cache memory usage."""
# Load model config to derive attention dimensions
config = read_json_file(os.path.join(model_path, "config.json"))
head_dim = config.get(
@@ -92,19 +92,23 @@ def get_test_cases(
return case_dict


# Load benchmark prompt from file
with open("examples/bench_prompt.md", "r") as f:
prompt = f.read()


def repeat_prompt(input_ids: list[int], target_length: int):
"""Repeat or truncate input_ids to match target_length."""
num = len(input_ids)
repeat_times = (target_length + num - 1) // num
return (input_ids * repeat_times)[:target_length]


class TestModel:
"""Benchmark model wrapper for performance testing."""

model: infinicore.nn.Module
tokenizer: AutoTokenizer
tokenizer: InfiniLMTokenizer
input_ids_list: list[int]

def __init__(
@@ -118,8 +122,9 @@ def __init__(
attn_backend="default",
) -> None:
model_path = os.path.expanduser(model_path)

# ---------------------------------------------------------------------------- #
# Create model,
# Create Model
# ---------------------------------------------------------------------------- #
model = InferEngine(
model_path,
@@ -132,47 +137,30 @@ def __init__(
)

# ---------------------------------------------------------------------------- #
# Load weights
# Load Weights
# ---------------------------------------------------------------------------- #
if not skip_load:
load_model_state_dict_by_file(model, model_path, dtype=model.dtype)

# ---------------------------------------------------------------------------- #
# Create tokenizer
# Initialize Tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if tokenizer.pad_token is None:
if tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
self.tokenizer = InfiniLMTokenizer(model_path)

# ---------------------------------------------------------------------------- #
# Token encoding
# Encode Prompt
# ---------------------------------------------------------------------------- #
input_content = [
tokenizer.apply_chat_template(
self.tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
]

# print(input_content, end="", flush=True)
# Support Transformers >= 5.0 for batch_encode_plus deprecation
encoding = tokenizer(
input_content,
padding=True,
truncation=True,
max_length=8192,
)

input_ids_list = encoding["input_ids"]
input_ids_list = self.tokenizer.encode(input_content)

self.model = model
self.tokenizer = tokenizer
self.input_ids_list = input_ids_list

def run(
@@ -184,11 +172,12 @@ def run(
top_p=1.0,
temperature=1.0,
):
"""Run a single benchmark test case."""
input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
input_ids_list = [input_ids] * batch_size

# ---------------------------------------------------------------------------- #
# Autoregressive generation
# Autoregressive Generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)

@@ -211,6 +200,7 @@ def run(
numpy_output_ids = np.array(
[output_id.to_numpy()[0] for output_id in output_ids]
)
# Use InfiniLMTokenizer for decoding
print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True))

print(
@@ -224,8 +214,9 @@ def run(
device_str = cfg.get_device_str(cfg.device)

_PAGED_KV_BLOCK_SIZE = cfg.block_size

# -------------------------------------------------------- #
# Parse arguments
# Parse Arguments
# -------------------------------------------------------- #
model_path = cfg.model

@@ -252,8 +243,9 @@ def run(
output_len = [output_len]

cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)

# -------------------------------------------------------- #
# Test
# Initialize Test Configuration
# -------------------------------------------------------- #
if enable_paged_attn:
paged_kv_block_size = _PAGED_KV_BLOCK_SIZE
@@ -290,7 +282,7 @@ def run(
if cfg.warmup:
warmup_steps = 1

# warmup cache capacity
# Warmup cache capacity
warmup_cache_len = 128
warmup_batch = len(test.input_ids_list)

@@ -316,7 +308,7 @@ def run(
_ = test.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=5, # decode kernel warmup
max_new_tokens=5, # Decode kernel warmup
temperature=cfg.temperature,
top_k=cfg.top_k,
top_p=cfg.top_p,
@@ -327,12 +319,12 @@

print("=================== warmup done ====================")

# reset cache back to benchmark config
# Reset cache back to benchmark config
if cache_config is not None:
test.model.reset_cache(cache_config)

# ---------------------------------------------------------------------------- #
# Warmup done
# Run Benchmarks
# ---------------------------------------------------------------------------- #

for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
@@ -343,15 +335,15 @@
output_len = case["output_len"]

if not enable_paged_attn:
# reset cache if static kvcache is used
# Reset cache if static KV cache is used
initial_capacity = input_len + output_len
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
)
)

# run test one case
# Run test for one case
test.run(
batch_size=batch_size,
input_len=input_len,
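The change above replaces the per-script AutoTokenizer setup (pad-token fallback, truncation arguments, version-dependent encoding calls) with a single InfiniLMTokenizer from infinilm.tokenizer_utils; test_infer.py below does the same. That wrapper's implementation is not part of this diff, so the sketch that follows is only a hypothetical illustration of the interface the updated scripts rely on (a constructor taking the model path, plus apply_chat_template, encode, and decode); the class name and internals are invented for illustration.

# SketchInfiniLMTokenizer: illustrative only; the real InfiniLMTokenizer in
# infinilm.tokenizer_utils may be implemented quite differently.
from transformers import AutoTokenizer


class SketchInfiniLMTokenizer:
    """One place for pad-token handling, chat templates, and batch encode/decode."""

    def __init__(self, model_path: str):
        self._tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        if self._tok.pad_token is None and self._tok.eos_token is not None:
            # Reuse EOS as the padding token so batch encoding works out of the box.
            self._tok.pad_token = self._tok.eos_token

    def apply_chat_template(self, conversation, add_generation_prompt=True, tokenize=False):
        return self._tok.apply_chat_template(
            conversation, add_generation_prompt=add_generation_prompt, tokenize=tokenize
        )

    def encode(self, texts):
        # Batch encode: one list of token ids per input string.
        return self._tok(texts)["input_ids"]

    def decode(self, token_ids, skip_special_tokens=True):
        return self._tok.decode(token_ids, skip_special_tokens=skip_special_tokens)

With an interface of this shape, bench.py keeps three call sites: construct the tokenizer from model_path, encode the chat-templated prompt, and decode the generated ids.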
102 changes: 29 additions & 73 deletions examples/test_infer.py
@@ -1,18 +1,16 @@
import infinicore
import transformers
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
import argparse
import sys
import time
import os
import numpy as np
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from packaging import version

from infinilm.base_config import BaseConfig
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.tokenizer_utils import InfiniLMTokenizer

from PIL import Image
import torch
@@ -37,6 +35,7 @@ def test(
image_path=None,
):
model_path = os.path.expanduser(model_path)

# ---------------------------------------------------------------------------- #
# Create Model
# ---------------------------------------------------------------------------- #
@@ -51,16 +50,18 @@ def test(
attention_backend=attn_backend,
kv_cache_dtype=cfg.kv_cache_dtype,
)

# ---------------------------------------------------------------------------- #
# Load Weights
# ---------------------------------------------------------------------------- #
load_model_state_dict_by_file(model, model_path, dtype=model.dtype)

# ---------------------------------------------------------------------------- #
# create tokenizer
# Initialize Tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = InfiniLMTokenizer(model_path)

# Initialize processor for multimodal models
processor = None
if image_path is not None:
if model.model_type == "minicpmv":
@@ -69,33 +70,14 @@ def test(
processor = AutoProcessor.from_pretrained(
model_path, trust_remote_code=True
)
tokenizer = processor.tokenizer

if "llama" == model.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)

# ---------------------------------------------------------------------------- #
# tokenize
# Tokenize Inputs
# ---------------------------------------------------------------------------- #
# prompt = "山东最高的山是?"
if isinstance(prompts, str):
prompts = [prompts]

# Handle image prompts for multimodal models
if image_path is not None:
updated_prompts = []
for prompt in prompts:
@@ -104,18 +86,17 @@ def test(
updated_prompts.append(prompt)
prompts = updated_prompts

if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]
else:
input_contents = prompts
# Apply chat template or use raw prompts
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]

# Process multimodal inputs or encode text
pixel_values = None
image_bound = None
tgt_sizes = None
@@ -139,39 +120,14 @@ def test(
else:
raise ValueError(f"Unsupported multimodal model_type: {model.model_type}")
else:
if hasattr(tokenizer, "batch_encode_plus"):
input_ids_list = tokenizer.batch_encode_plus(input_contents)["input_ids"]
elif hasattr(tokenizer, "_encode_plus"):
input_ids_list = tokenizer._encode_plus(input_contents)["input_ids"]
else:
input_ids_list = tokenizer(input_contents)[
"input_ids"
] # List: [[1, 1128, 526, 366, 29892]]

# input_ids_list = tokenizer.batch_encode_plus(input_contents)[
# "input_ids"
# ] # List: [[1, 1128, 526, 366, 29892]]
if version.parse(transformers.__version__) < version.parse("5.0.0"):
# Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used.
input_ids_list = [
tokenizer.encode_plus(
text, truncation=True, max_length=2048, add_special_tokens=True
)["input_ids"]
for text in input_contents
]
else:
input_ids_list = [
tokenizer._encode_plus(
text, truncation=True, max_length=2048, add_special_tokens=True
)["input_ids"]
for text in input_contents
]
# Use InfiniLMTokenizer for encoding
input_ids_list = tokenizer.encode(input_contents)

# ---------------------------------------------------------------------------- #
# Create KVCache
# Create KV Cache
# ---------------------------------------------------------------------------- #
if enable_paged_attn:
batch_size = 1 if prompts is str else len(prompts)
batch_size = 1 if isinstance(prompts, str) else len(prompts)
max_total_tokens = max_new_tokens + len(input_ids_list[0])
cache_config = PagedKVCacheConfig(
num_blocks=(
@@ -181,7 +137,7 @@ def test(
block_size=_PAGED_KV_BLOCK_SIZE,
)
else:
batch_size = 1 if prompts is str else len(prompts)
batch_size = 1 if isinstance(prompts, str) else len(prompts)
initial_capacity = max_new_tokens + len(input_ids_list[0])
cache_config = StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
@@ -223,7 +179,7 @@ def test(
)
pixel_values_infini = infinicore.from_torch(pixel_values_tensor)

# 2. tgt_sizes
# 2. Target sizes
all_tgt_sizes = [
tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)
]
@@ -232,7 +188,7 @@

tgt_sizes_infini = infinicore.from_torch(tgt_sizes_tensor)

# 3. image_bound
# 3. Image bounds
batch_size = len(image_bound)
max_ranges = max(len(b) for b in image_bound)

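One behavioral fix in the hunks above deserves a note: batch_size = 1 if prompts is str else len(prompts) compares prompts against the str type object itself, which is never true for an actual string instance, so a bare string would fall through to len(prompts) and be counted character by character; isinstance(prompts, str) is the correct test. The sketch below restates the corrected cache-config selection. The constructor keyword arguments mirror the calls visible in this diff; the paged num_blocks formula is an assumption, since the real expression is truncated in the diff view.

# Sketch of the KV-cache config selection as test_infer.py now performs it.
# The num_blocks sizing is assumed; the actual expression is cut off above.
from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig


def make_cache_config(prompts, input_ids_list, max_new_tokens, enable_paged_attn, block_size=16):
    # isinstance(), not "is str": a lone prompt string is one request,
    # not one request per character.
    batch_size = 1 if isinstance(prompts, str) else len(prompts)
    max_total_tokens = max_new_tokens + len(input_ids_list[0])

    if enable_paged_attn:
        # Assumed sizing: enough fixed-size blocks to hold every sequence in the batch.
        num_blocks = batch_size * ((max_total_tokens + block_size - 1) // block_size)
        return PagedKVCacheConfig(num_blocks=num_blocks, block_size=block_size)

    # Static cache: pre-allocate prompt plus generation length for each sequence.
    return StaticKVCacheConfig(max_batch_size=batch_size, max_cache_len=max_total_tokens)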