Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ xmake && xmake install
- 运行模型推理测试

```bash
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device]
```

- 部署模型推理服务
Expand Down Expand Up @@ -63,6 +63,12 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
```


- 选择是否使用九齿计算路径,默认为false,即不依赖九齿算子
```bash
xmake f --ninetoothed=[true|false] -cv
```


- 安装 InfiniLM Python 包
```bash
pip install -e .
Expand All @@ -71,11 +77,11 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 单次推理测试
- llama示例
```bash
python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=<path/to/model_dir>
```
- 例如:
```bash
python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
```
- 分布式推理测试
- 9g示例
Expand Down Expand Up @@ -113,7 +119,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 运行推理基准测试(C-Eval/MMLU)

```bash
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
```

- 参数说明:
Expand Down
38 changes: 18 additions & 20 deletions csrc/cache/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,26 +93,24 @@ StaticKVCache::update(size_t layer_idx,

auto device = k_cache_layer->device();

if (device.getType() == infinicore::Device::Type::NVIDIA
|| device.getType() == infinicore::Device::Type::ILUVATAR
|| device.getType() == infinicore::Device::Type::METAX) {
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
} else {
size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
}
#ifdef ENABLE_NINETOOTHED
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
#else
size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
#endif

return {k_cache_layer, v_cache_layer};
}
Expand Down
35 changes: 25 additions & 10 deletions csrc/models/llama/llama_mlp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,34 @@ LlamaMLP::LlamaMLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
}

infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const {
// 1. Project to gate and up
auto hidden_states_mutable = hidden_states;
auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable);
infinicore::Device::Type dev_type = hidden_states->device().getType();
if(dev_type == infinicore::Device::Type::MOORE){
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我感觉Device相关的判断理应放到InfiniCore中,不应该放在推理框架层

// 1. Project to a single combined gate_up tensor
auto hidden_states_mutable = hidden_states;
auto gate_up = gate_up_proj_->forward(hidden_states_mutable);

// 2. Apply SwiGLU: silu(gate) * up
// Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
// So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
auto intermediate = infinicore::op::swiglu(up, gate);
// 2. Apply the fused silu_and_mul operator
// applies SiLU to the first half, and multiplies it by the second half.
// Mathematically equivalent to: result = SiLU(gate_up[..., :d]) * gate_up[..., d:]
auto intermediate = infinicore::op::silu_and_mul(gate_up);

// 3. Project down
auto output = down_proj_->forward(intermediate);
// 3. Project down
auto output = down_proj_->forward(intermediate);
return output;
} else{
// 1. Project to gate and up
auto hidden_states_mutable = hidden_states;
auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable);

return output;
// 2. Apply SwiGLU: silu(gate) * up
// Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
// So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
auto intermediate = infinicore::op::swiglu(up, gate);

// 3. Project down
auto output = down_proj_->forward(intermediate);
return output;
}
}

} // namespace infinilm::models::llama
31 changes: 18 additions & 13 deletions examples/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,11 @@ def get_args():
action="store_true",
help="Run alippu test",
)
parser.add_argument(
"--hygon",
action="store_true",
help="Run hygon test",
)
parser.add_argument(
"--model",
type=str,
Expand Down Expand Up @@ -237,12 +242,13 @@ def get_args():
parser.add_argument(
"--warmup",
action="store_true",
help="Perform a warmup run before benchmarking/inference."
help="Perform a warmup run before benchmarking/inference.",
)
return parser.parse_args()


prompt = "泰山,又名岱山、岱宗、岱岳、东岳、泰岳,为五岳之一,有“五岳之首”、“五岳独尊”、“天下第一山”、“华夏神山”之称 ,被中外学者称为“中国的奥林匹斯山” 位于山东省中部,隶属于泰安市,绵亘于泰安、济南、淄博三市之间,总面积25000公顷,主峰玉皇顶海拔约1545米。泰山相伴上下五千年的华夏文明传承历史,集国家兴盛、民族存亡的象征于一身,是中华民族的精神家园 [31],东方文化的缩影,“天人合一”思想的寄托之地 [24],承载着丰厚的地理历史文化内涵 [15],被古人视为“直通帝座”的天堂,成为百姓崇拜,帝王告祭的神山,有“泰山安,四海皆安”的说法 [1]。自秦始皇起至清代,先后有13代帝王亲登泰山封禅或祭祀,另有24代帝王遣官祭祀72次。山体上既有寺庙、宫、观等古建筑群29处,古遗址128处,有大小碑碣、摩崖石刻2000余处 [15]。其景巍峨雄奇、幽奥俊秀,有石坞松涛、云海玉盘等美丽壮阔的自然景观。其历史文化、自然风光、地质奇观和谐融为一体,具有特殊的历史、文化、美学和科学价值。 [19]1982年,泰山被列入第一批国家级风景名胜区。1987年,泰山被联合国教科文组织批准列为全球首例世界文化与自然双重遗产 [14] [41-42]。2002年,泰山被评为“中华十大文化名山”之首 [15]。2005年,泰山成为国家地质公园。2006年,泰山因其独特的地质价值成为世界地质公园 [14]。2007年3月,泰山被评为国家AAAAA级旅游景区;12月,泰山被命名为中国首座“中国书法名山”。2025年3月20日,泰山迎来2025年第100万名游客。"
with open("examples/bench_prompt.md", "r") as f:
prompt = f.read()


def repeat_prompt(input_ids: list[int], target_length: int):
Expand Down Expand Up @@ -287,13 +293,13 @@ def __init__(
# 创建 tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if tokenizer.pad_token is None:
if tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# ---------------------------------------------------------------------------- #
# token编码
Expand All @@ -312,9 +318,8 @@ def __init__(
input_content,
padding=True,
truncation=True,
max_length=2048,
return_tensors="pt"
)
max_length=8192,
)

input_ids_list = encoding["input_ids"]

Expand Down Expand Up @@ -349,6 +354,7 @@ def run(
top_k=top_k,
top_p=top_p,
temperature=temperature,
stop_on_eos=False,
),
_measure_and_log_time=True,
)
Expand Down Expand Up @@ -386,6 +392,8 @@ def run(
device_str = "mlu"
elif args.ali:
device_str = "cuda"
elif args.hygon:
device_str = "cuda"
else:
print(
"python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
Expand Down Expand Up @@ -459,10 +467,7 @@ def run(
)
)

avg_prompt_len = min(
64,
max(len(ids) for ids in test.input_ids_list)
)
avg_prompt_len = min(64, max(len(ids) for ids in test.input_ids_list))

warmup_ids = [
ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
Expand All @@ -477,10 +482,11 @@ def run(
_ = test.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=5, # decode kernel warmup
max_new_tokens=5, # decode kernel warmup
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
stop_on_eos=False,
),
_measure_and_log_time=False,
)
Expand All @@ -495,7 +501,6 @@ def run(
# Warmup done
# ---------------------------------------------------------------------------- #


for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
tqdm.write(f"\033[92mProcessing : {case}\033[0m")

Expand Down
Loading