183 changes: 183 additions & 0 deletions benchmarks/llm/scripts/vLLM/test.py
@@ -0,0 +1,183 @@
import time
import argparse
import pandas as pd
import json
import os
import torch
from vllm import LLM, SamplingParams

# Try to import the plotting library; plotting is optional
try:
import matplotlib.pyplot as plt
MATPLOTLIB_AVAILABLE = True
except ImportError:
MATPLOTLIB_AVAILABLE = False


# --- Exact target token counts for the sweep ---
TARGET_PROMPT_TOKEN_COUNTS = [32, 64, 128, 256, 512, 1024, 2048]
MAX_TOKENS_LIST = [32, 64, 128, 256, 512, 1024, 2048]
# Use a generic, non-special token id to pad our prompts
FILLER_TOKEN_ID = 10
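# Padding with a fixed, ordinary token id (rather than tokenizing text)
# guarantees the exact prompt length with no tokenizer round-trip.
# Assumption: id 10 is a plain vocabulary token for the model under test.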

def plot_results(df, output_filename="vllm_benchmark_plot.png"):
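"""Plot throughput vs. generated tokens: one subplot per prompt length, one line per device group."""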

if not MATPLOTLIB_AVAILABLE:
print("\n警告: Matplotlib 未安装。运行 'pip install matplotlib' 以生成性能图表。")
return

device_groups = df['device_ids'].unique()
prompt_token_lengths = sorted(df['prompt_tokens'].unique())
num_prompts = len(prompt_token_lengths)

fig, axes = plt.subplots(nrows=num_prompts, ncols=1, figsize=(12, 6 * num_prompts), squeeze=False)
fig.suptitle('VLLM Throughput Benchmark', fontsize=16, y=1.0)

for i, prompt_len in enumerate(prompt_token_lengths):
ax = axes[i, 0]
ax.set_title(f'Performance for Input Prompt Tokens = {prompt_len}', fontsize=12)
ax.set_xlabel('Generated Output Tokens')
ax.set_ylabel('Throughput (Tokens/Sec)')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

for group in device_groups:
subset = df[(df['prompt_tokens'] == prompt_len) & (df['device_ids'] == group)]
subset = subset.sort_values('output_tokens')
if not subset.empty:
ax.plot(subset['output_tokens'], subset['tokens_per_sec'], marker='o', linestyle='-',
label=f'GPUs: {group} (tp={subset.iloc[0]["tp_size"]})')

ax.legend()

plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.savefig(output_filename, dpi=300)
plt.close(fig)
print(f"\n性能图表已成功保存到: {output_filename}")


def run_single_benchmark(llm_engine, token_ids_list, sampling_params):
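"""Time a single blocking generate() call and return throughput metrics."""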

start_time = time.time()
# Run inference directly on token ids via the prompt_token_ids argument
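# generate() blocks until all sequences finish, so the wall-clock delta
# below covers both the prefill and decode phases.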
outputs = llm_engine.generate(prompt_token_ids=token_ids_list, sampling_params=sampling_params, use_tqdm=False)
end_time = time.time()

total_time = end_time - start_time
result = outputs[0]

prompt_len = len(result.prompt_token_ids)
output_len = len(result.outputs[0].token_ids)

max_tokens_requested = sampling_params.max_tokens
if output_len < max_tokens_requested:
print(f"警告:请求生成 {max_tokens_requested} tokens,但实际只生成了 {output_len} tokens。")

tokens_per_sec = output_len / total_time
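# Caveat: only generated tokens are counted, but the timer also includes
# the prefill pass, so this slightly understates decode-only throughput,
# especially for long prompts with short outputs.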

metrics = {
'prompt_tokens': prompt_len,
'output_tokens': output_len,
'max_steps_requested': max_tokens_requested,
'total_time_s': total_time,
'tokens_per_sec': tokens_per_sec
}

return metrics

def main():
parser = argparse.ArgumentParser(
description="Run the vLLM throughput benchmark. Configures the model and sweeps over multiple GPU device groups, input lengths, and output lengths.",
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument("model_path", type=str, help="Path of the model to benchmark.")
parser.add_argument("--device-groups", nargs='+', type=str, required=True,
help="[Required] One or more GPU device groups to test, e.g. --device-groups \"0,1\" \"0,1,2,3\"")
parser.add_argument("--dtype", type=str, default="auto", help="Data type for loading the model.")
args = parser.parse_args()

print(f"将使用模型进行测试: {args.model_path}")
print(f"将依次测试以下设备组: {args.device_groups}\n")

all_results = []
total_gpu_available = torch.cuda.device_count()
print(f"检测到系统中共有 {total_gpu_available} 个可用的GPU。")

for device_group_str in args.device_groups:
print(f"\n{'='*25} 开始测试设备组: \"{device_group_str}\" {'='*25}")
os.environ["CUDA_VISIBLE_DEVICES"] = device_group_str
device_id_list = [int(i) for i in device_group_str.split(',')]
tp_size = len(device_id_list)
max_requested_id = max(device_id_list)

if max_requested_id >= total_gpu_available:
print(f"错误: 设备组 \"{device_group_str}\" 请求了不存在的GPU ID {max_requested_id}。跳过此设备组。")
continue

print(f"配置: Tensor Parallel Size = {tp_size}, 使用物理GPU ID = {device_id_list}")
print("正在初始化 vLLM 引擎...")

try:
llm = LLM(model=args.model_path, tensor_parallel_size=tp_size, dtype=args.dtype, trust_remote_code=True)
print("vLLM 引擎初始化完成。")
except Exception as e:
print(f"错误:为设备组 \"{device_group_str}\" 初始化 vLLM 引擎失败: {e}")
print("跳过此设备组的测试。")
continue

# --- Sweep over the exact target token counts ---
for prompt_len in TARGET_PROMPT_TOKEN_COUNTS:
# Build the token-id list directly
prompt_token_ids = [FILLER_TOKEN_ID] * prompt_len

for max_tokens in MAX_TOKENS_LIST:
# Note: vLLM enforces a total-length cap (input + output); skip combinations that exceed it
if prompt_len + max_tokens > llm.llm_engine.model_config.max_model_len:
print(f"\nSkipping: input ({prompt_len}) + output ({max_tokens}) > model max length ({llm.llm_engine.model_config.max_model_len})")
continue

print(f"\n{'-'*10} 测试中: Input_Tokens: {prompt_len}, Max_Output_Tokens: {max_tokens} {'-'*10}")
sampling_params = SamplingParams(n=1, temperature=0.0, max_tokens=max_tokens, ignore_eos=True)
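# temperature=0.0 selects greedy decoding, and ignore_eos=True keeps
# generating past EOS, so each run should produce exactly max_tokens tokens.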

try:
extracted_metrics = run_single_benchmark(llm, [prompt_token_ids], sampling_params)
if extracted_metrics:
extracted_metrics['device_ids'] = device_group_str
extracted_metrics['tp_size'] = tp_size
all_results.append(extracted_metrics)
print("成功提取到性能数据:")
formatted_metrics = {k: f"{v:.4f}" if isinstance(v, float) else v for k, v in extracted_metrics.items()}
print(json.dumps(formatted_metrics, indent=2))
except Exception as e:
print(f"错误:在运行基准测试时发生异常: {e}")
print("跳过此测试用例。")

print(f"\n设备组 \"{device_group_str}\" 测试完成,正在释放资源...")
del llm
torch.cuda.empty_cache()
print("资源已释放。")

if all_results:
print(f"\n\n{'='*30} 所有测试结果汇总 {'='*30}")
df = pd.DataFrame(all_results)
cols_ordered = ['device_ids', 'tp_size', 'prompt_tokens', 'output_tokens',
'tokens_per_sec', 'total_time_s', 'max_steps_requested']
# Keep only the columns that actually exist
cols_ordered = [col for col in cols_ordered if col in df.columns]
df = df[cols_ordered]
pd.options.display.float_format = '{:,.4f}'.format
print(df.to_string(index=False))

output_filename = "vllm_benchmark_results_multi_group.csv"
df.to_csv(output_filename, index=False)
print(f"\n测试结果已保存到: {output_filename}")

plot_results(df, "vllm_benchmark_plot.png")

else:
print("\n所有测试运行完毕,但未能收集到任何性能数据。")

if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions benchmarks/llm/scripts/vLLM/test.sh
@@ -0,0 +1 @@
python test.py Qwen/Qwen1.5-7B-Chat --device-groups "5" "5,6" "5,6,7,8"