163 changes: 148 additions & 15 deletions test/bench/test_benchmark.py
@@ -4,11 +4,6 @@
import re
import csv
import numpy as np
import infinicore
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from datasets import load_dataset, Dataset
from abc import ABC, abstractmethod

@@ -57,6 +52,11 @@ def __init__(
enable_paged_attn=False,
):
import transformers
import infinicore
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
from infinilm.infer_engine import InferEngine

self.benchmark = benchmark

@@ -103,7 +103,9 @@ def __init__(
)
elif model_type in ["qwen2", "qwen3"]:
# For qwen2/qwen3 models: pass trust_remote_code=True (same as the default branch below)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_dir_path, trust_remote_code=True
)
else:
# Default: use trust_remote_code=True for other models
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -179,6 +181,9 @@ def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
which properly handles KV cache through GenerationMixin.
"""
# Convert tokens to infinicore format
import infinicore
from infinilm.infer_engine import GenerationConfig

input_ids_list = [tokens]
input_ids = infinicore.from_list(input_ids_list)

@@ -370,6 +375,124 @@ def destroy_model_instance(self):
print("Torch model destroyed")


class VLLMBenchmark(BaseBenchmark):
"""vLLM backend using vllm.LLM"""

def __init__(
self,
model_dir_path,
device_type_str="nvidia",
tensor_parallel_size=1,
benchmark="ceval",
):
import transformers
from vllm import LLM

if device_type_str == "cpu":
raise ValueError("vLLM backend does not support CPU device type.")

self.benchmark = benchmark

# ---- tokenizer ----
import json

with open(os.path.join(model_dir_path, "config.json"), "r") as f:
self.config_dict = json.load(f)

# The tokenizer is loaded the same way for every model type;
# trust_remote_code=True covers models that ship custom tokenizer code.
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_dir_path, trust_remote_code=True
)

eos_token_id = self.config_dict.get("eos_token_id")
self.eos_token_id = (
[eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
)

# ---- vLLM engine ----
print("Loading model with vLLM backend...")
self.llm = LLM(
model=model_dir_path,
tensor_parallel_size=tensor_parallel_size,
trust_remote_code=True,
)
print("vLLM model loaded successfully")

def max_context_len(self):
return self.config_dict.get("max_position_embeddings", 2048)

def render_input_content(self, *args, **kwargs):
if self.benchmark == "ceval":
return render_ceval(self.tokenizer, *args, **kwargs)
elif self.benchmark == "mmlu":
return render_mmlu(self.tokenizer, *args, **kwargs)
else:
raise ValueError(f"Unknown benchmark: {self.benchmark}")

def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
input_content = self.render_input_content(*args)
print(input_content, end="", flush=True)

tokens = self.encode_text(input_content)
return self._generate_step(tokens, max_steps, topp_, topk_, temperature_)

def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
from vllm import SamplingParams

prompt = self.tokenizer.decode(tokens)

sampling_params = SamplingParams(
max_tokens=max_steps,
temperature=temperature_,
top_p=topp_,
top_k=topk_,
stop_token_ids=self.eos_token_id,
)

start_time = time.perf_counter()

outputs = self.llm.generate(
prompts=[prompt],
sampling_params=sampling_params,
)

end_time = time.perf_counter()

# ---- post process ----
output_text = outputs[0].outputs[0].text

# ---- stats ----
input_tokens = len(tokens)
new_tokens = len(self.encode_text(output_text))
total_tokens = input_tokens + new_tokens

total_time = end_time - start_time
throughput = total_tokens / total_time if total_time > 0 else 0.0

print(output_text)
print()
print(f"Total time: {total_time * 1000:.2f} ms")
print(f"Input tokens: {input_tokens}")
print(f"New tokens: {new_tokens}")
print(f"Total tokens processed: {total_tokens}")
print(f"Throughput: {throughput:.2f} tok/s")

global TOTAL_TOKENS, TOTAL_TIME
TOTAL_TOKENS += total_tokens
TOTAL_TIME += total_time

return output_text

def destroy_model_instance(self):
del self.llm
print("vLLM model destroyed")


def render_ceval(_tokenizer, conversation):
"""Render C-Eval conversation to input content"""
return (
@@ -397,13 +520,16 @@ def render_mmlu(_tokenizer, question, choices):
if hasattr(_tokenizer, "apply_chat_template"):
conversation = [
{"role": "system", "content": instruction},
{"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"},
{"role": "user", "content": f"{question}\n{choices_text}\n"},
]
try:
return _tokenizer.apply_chat_template(
conversation=conversation,
add_generation_prompt=True,
tokenize=False,
return (
_tokenizer.apply_chat_template(
conversation=conversation,
add_generation_prompt=True,
tokenize=False,
)
+ "The answer is: "
)
except Exception:
return prompt
@@ -663,7 +789,7 @@ def test():
# Parse arguments manually to handle device flags properly
if len(sys.argv) < 4:
print(
"Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
"Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
)
sys.exit(1)
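# Example invocation of the new vLLM backend (the model path and sample count
# below are assumptions, shown for illustration only):
#   python test_benchmark.py --nvidia /path/to/model_dir --bench mmlu \
#       --backend vllm --ndev 2 --num_samples 50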

@@ -750,7 +876,7 @@ def test():
device_type_str = "ali"
else:
print(
"Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
"Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
)
sys.exit(1)

@@ -773,7 +899,10 @@ def test():
# Create model based on backend (create once, reuse for all subjects)

if backend == "torch":
assert ndev == 1, "Torch backend only supports single-device evaluation"
model = TorchBenchmark(model_path, device_type_str, benchmark)
elif backend == "vllm":
model = VLLMBenchmark(model_path, device_type_str, ndev, benchmark)
else:
model = InfiniLMBenchmark(
model_path, device_type_str, ndev, backend, benchmark, enable_paged_attn
@@ -944,7 +1073,9 @@ def _load_mmlu_subject(subj):
splits_to_load = (
["test"]
if split == "test"
else ["validation"] if split == "val" else ["validation", "test"]
else ["validation"]
if split == "val"
else ["validation", "test"]
)
# Load each subject individually from hardcoded list, excluding "all"
for subject_name in mmlu_subjects:
@@ -966,7 +1097,9 @@ def _load_mmlu_subject(subj):
splits_to_load = (
["test"]
if split == "test"
else ["validation"] if split == "val" else ["validation", "test"]
else ["validation"]
if split == "val"
else ["validation", "test"]
)
records = []
for sp in splits_to_load: