70 changes: 31 additions & 39 deletions examples/bench.py
@@ -1,10 +1,10 @@
import infinicore
from transformers import AutoTokenizer
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.base_config import BaseConfig
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.tokenizer_utils import InfiniLMTokenizer
import argparse
import sys
import time
@@ -42,9 +42,9 @@ def get_test_cases(
input_len_list: list[int],
output_len_list: list[int],
):
"""Generate cases ordered by ascending KV cache memory usage."""
model_path = os.path.expanduser(model_path)

"""Generate cases ordered by ascending KV cache memory usage."""
# Load model config to derive attention dimensions
config = read_json_file(os.path.join(model_path, "config.json"))
head_dim = config.get(
@@ -92,19 +92,23 @@ def get_test_cases(
return case_dict


# Load benchmark prompt from file
with open("examples/bench_prompt.md", "r") as f:
prompt = f.read()


def repeat_prompt(input_ids: list[int], target_length: int):
"""Repeat or truncate input_ids to match target_length."""
num = len(input_ids)
repeat_times = (target_length + num - 1) // num
return (input_ids * repeat_times)[:target_length]


class TestModel:
"""Benchmark model wrapper for performance testing."""

model: infinicore.nn.Module
tokenizer: AutoTokenizer
tokenizer: InfiniLMTokenizer
input_ids_list: list[int]

def __init__(
@@ -118,8 +122,9 @@ def __init__(
attn_backend="default",
) -> None:
model_path = os.path.expanduser(model_path)

# ---------------------------------------------------------------------------- #
# Create model,
# Create Model
# ---------------------------------------------------------------------------- #
model = InferEngine(
model_path,
@@ -132,47 +137,30 @@ def __init__(
)

# ---------------------------------------------------------------------------- #
# Load weights
# Load Weights
# ---------------------------------------------------------------------------- #
if not skip_load:
load_model_state_dict_by_file(model, model_path, dtype=model.dtype)

# ---------------------------------------------------------------------------- #
# Create tokenizer
# Initialize Tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if tokenizer.pad_token is None:
if tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
self.tokenizer = InfiniLMTokenizer(model_path)

# ---------------------------------------------------------------------------- #
# Token encoding
# Encode Prompt
# ---------------------------------------------------------------------------- #
input_content = [
tokenizer.apply_chat_template(
self.tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
]

# print(input_content, end="", flush=True)
# Support Transformers >= 5.0 for batch_encode_plus deprecation
encoding = tokenizer(
input_content,
padding=True,
truncation=True,
max_length=8192,
)

input_ids_list = encoding["input_ids"]
input_ids_list = self.tokenizer.encode(input_content)

self.model = model
self.tokenizer = tokenizer
self.input_ids_list = input_ids_list

def run(
@@ -184,11 +172,12 @@ def run(
top_p=1.0,
temperature=1.0,
):
"""Run a single benchmark test case."""
input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
input_ids_list = [input_ids] * batch_size

# ---------------------------------------------------------------------------- #
# Autoregressive generation
# Autoregressive Generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)

@@ -211,6 +200,7 @@ def run(
numpy_output_ids = np.array(
[output_id.to_numpy()[0] for output_id in output_ids]
)
# Use InfiniLMTokenizer for decoding
print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True))

print(
@@ -224,8 +214,9 @@ def run(
device_str = cfg.get_device_str(cfg.device)

_PAGED_KV_BLOCK_SIZE = cfg.block_size

# -------------------------------------------------------- #
# Parse arguments
# Parse Arguments
# -------------------------------------------------------- #
model_path = cfg.model

@@ -252,8 +243,9 @@ def run(
output_len = [output_len]

cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)

# -------------------------------------------------------- #
# Test
# Initialize Test Configuration
# -------------------------------------------------------- #
if enable_paged_attn:
paged_kv_block_size = _PAGED_KV_BLOCK_SIZE
@@ -290,7 +282,7 @@ def run(
if cfg.warmup:
warmup_steps = 1

# warmup cache capacity
# Warmup cache capacity
warmup_cache_len = 128
warmup_batch = len(test.input_ids_list)

@@ -316,7 +308,7 @@ def run(
_ = test.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=5, # decode kernel warmup
max_new_tokens=5, # Decode kernel warmup
temperature=cfg.temperature,
top_k=cfg.top_k,
top_p=cfg.top_p,
@@ -327,12 +319,12 @@

print("=================== warmup done ====================")

# reset cache back to benchmark config
# Reset cache back to benchmark config
if cache_config is not None:
test.model.reset_cache(cache_config)

# ---------------------------------------------------------------------------- #
# Warmup done
# Run Benchmarks
# ---------------------------------------------------------------------------- #

for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
@@ -343,15 +335,15 @@
output_len = case["output_len"]

if not enable_paged_attn:
# reset cache if static kvcache is used
# Reset cache if static KV cache is used
initial_capacity = input_len + output_len
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
)
)

# run test one case
# Run test for one case
test.run(
batch_size=batch_size,
input_len=input_len,
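The change above replaces the per-script AutoTokenizer setup (pad-token fallback, truncation arguments, version-dependent encoding calls) with a single InfiniLMTokenizer from infinilm.tokenizer_utils; test_infer.py below does the same. That wrapper's implementation is not part of this diff, so the sketch that follows is only a hypothetical illustration of the interface the updated scripts rely on (a constructor taking the model path, plus apply_chat_template, encode, and decode); the class name and internals are invented for illustration.

# SketchInfiniLMTokenizer: illustrative only; the real InfiniLMTokenizer in
# infinilm.tokenizer_utils may be implemented quite differently.
from transformers import AutoTokenizer


class SketchInfiniLMTokenizer:
    """One place for pad-token handling, chat templates, and batch encode/decode."""

    def __init__(self, model_path: str):
        self._tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        if self._tok.pad_token is None and self._tok.eos_token is not None:
            # Reuse EOS as the padding token so batch encoding works out of the box.
            self._tok.pad_token = self._tok.eos_token

    def apply_chat_template(self, conversation, add_generation_prompt=True, tokenize=False):
        return self._tok.apply_chat_template(
            conversation, add_generation_prompt=add_generation_prompt, tokenize=tokenize
        )

    def encode(self, texts):
        # Batch encode: one list of token ids per input string.
        return self._tok(texts)["input_ids"]

    def decode(self, token_ids, skip_special_tokens=True):
        return self._tok.decode(token_ids, skip_special_tokens=skip_special_tokens)

With an interface of this shape, bench.py keeps three call sites: construct the tokenizer from model_path, encode the chat-templated prompt, and decode the generated ids.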
102 changes: 29 additions & 73 deletions examples/test_infer.py
@@ -1,18 +1,16 @@
import infinicore
import transformers
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
import argparse
import sys
import time
import os
import numpy as np
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from packaging import version

from infinilm.base_config import BaseConfig
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.tokenizer_utils import InfiniLMTokenizer

from PIL import Image
import torch
@@ -37,6 +35,7 @@ def test(
image_path=None,
):
model_path = os.path.expanduser(model_path)

# ---------------------------------------------------------------------------- #
# Create Model
# ---------------------------------------------------------------------------- #
@@ -51,16 +50,18 @@ def test(
attention_backend=attn_backend,
kv_cache_dtype=cfg.kv_cache_dtype,
)

# ---------------------------------------------------------------------------- #
# Load Weights
# ---------------------------------------------------------------------------- #
load_model_state_dict_by_file(model, model_path, dtype=model.dtype)

# ---------------------------------------------------------------------------- #
# create tokenizer
# Initialize Tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = InfiniLMTokenizer(model_path)

# Initialize processor for multimodal models
processor = None
if image_path is not None:
if model.model_type == "minicpmv":
@@ -69,33 +70,14 @@ def test(
processor = AutoProcessor.from_pretrained(
model_path, trust_remote_code=True
)
tokenizer = processor.tokenizer

if "llama" == model.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)

# ---------------------------------------------------------------------------- #
# tokenize
# Tokenize Inputs
# ---------------------------------------------------------------------------- #
# prompt = "山东最高的山是?"
if isinstance(prompts, str):
prompts = [prompts]

# Handle image prompts for multimodal models
if image_path is not None:
updated_prompts = []
for prompt in prompts:
@@ -104,18 +86,17 @@ def test(
updated_prompts.append(prompt)
prompts = updated_prompts

if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]
else:
input_contents = prompts
# Apply chat template or use raw prompts
input_contents = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
for prompt in prompts
]

# Process multimodal inputs or encode text
pixel_values = None
image_bound = None
tgt_sizes = None
@@ -139,39 +120,14 @@ def test(
else:
raise ValueError(f"Unsupported multimodal model_type: {model.model_type}")
else:
if hasattr(tokenizer, "batch_encode_plus"):
input_ids_list = tokenizer.batch_encode_plus(input_contents)["input_ids"]
elif hasattr(tokenizer, "_encode_plus"):
input_ids_list = tokenizer._encode_plus(input_contents)["input_ids"]
else:
input_ids_list = tokenizer(input_contents)[
"input_ids"
] # List: [[1, 1128, 526, 366, 29892]]

# input_ids_list = tokenizer.batch_encode_plus(input_contents)[
# "input_ids"
# ] # List: [[1, 1128, 526, 366, 29892]]
if version.parse(transformers.__version__) < version.parse("5.0.0"):
# Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used.
input_ids_list = [
tokenizer.encode_plus(
text, truncation=True, max_length=2048, add_special_tokens=True
)["input_ids"]
for text in input_contents
]
else:
input_ids_list = [
tokenizer._encode_plus(
text, truncation=True, max_length=2048, add_special_tokens=True
)["input_ids"]
for text in input_contents
]
# Use InfiniLMTokenizer for encoding
input_ids_list = tokenizer.encode(input_contents)

# ---------------------------------------------------------------------------- #
# Create KVCache
# Create KV Cache
# ---------------------------------------------------------------------------- #
if enable_paged_attn:
batch_size = 1 if prompts is str else len(prompts)
batch_size = 1 if isinstance(prompts, str) else len(prompts)
max_total_tokens = max_new_tokens + len(input_ids_list[0])
cache_config = PagedKVCacheConfig(
num_blocks=(
@@ -181,7 +137,7 @@ def test(
block_size=_PAGED_KV_BLOCK_SIZE,
)
else:
batch_size = 1 if prompts is str else len(prompts)
batch_size = 1 if isinstance(prompts, str) else len(prompts)
initial_capacity = max_new_tokens + len(input_ids_list[0])
cache_config = StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
@@ -223,7 +179,7 @@ def test(
)
pixel_values_infini = infinicore.from_torch(pixel_values_tensor)

# 2. tgt_sizes
# 2. Target sizes
all_tgt_sizes = [
tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)
]
@@ -232,7 +188,7 @@

tgt_sizes_infini = infinicore.from_torch(tgt_sizes_tensor)

# 3. image_bound
# 3. Image bounds
batch_size = len(image_bound)
max_ranges = max(len(b) for b in image_bound)

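One behavioral fix in the hunks above deserves a note: batch_size = 1 if prompts is str else len(prompts) compares prompts against the str type object itself, which is never true for an actual string instance, so a bare string would fall through to len(prompts) and be counted character by character; isinstance(prompts, str) is the correct test. The sketch below restates the corrected cache-config selection. The constructor keyword arguments mirror the calls visible in this diff; the paged num_blocks formula is an assumption, since the real expression is truncated in the diff view.

# Sketch of the KV-cache config selection as test_infer.py now performs it.
# The num_blocks sizing is assumed; the actual expression is cut off above.
from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig


def make_cache_config(prompts, input_ids_list, max_new_tokens, enable_paged_attn, block_size=16):
    # isinstance(), not "is str": a lone prompt string is one request,
    # not one request per character.
    batch_size = 1 if isinstance(prompts, str) else len(prompts)
    max_total_tokens = max_new_tokens + len(input_ids_list[0])

    if enable_paged_attn:
        # Assumed sizing: enough fixed-size blocks to hold every sequence in the batch.
        num_blocks = batch_size * ((max_total_tokens + block_size - 1) // block_size)
        return PagedKVCacheConfig(num_blocks=num_blocks, block_size=block_size)

    # Static cache: pre-allocate prompt plus generation length for each sequence.
    return StaticKVCacheConfig(max_batch_size=batch_size, max_cache_len=max_total_tokens)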