Performance Verification
A full performance test needs to cover three dimensions: batch_size, input_seq_len, and out_seq_len. By default, batch_size ranges from 1 to 32, and input_seq_len and out_seq_len range from 2^5 to 2^10.
The test code is shown below; for details, see "pytorch/examples/xx/zhipu_test.py":
import time

import torch
import torch_npu  # Ascend NPU adapter for PyTorch

# Wrapper name and signature are illustrative; model, batch, seq_len and
# test_cycle are expected to be supplied by the caller.
def test_performance(model, batch, seq_len, test_cycle):
    # Randomly construct input data in the specified range for the given batch/seq_len
    past_key_values = None
    input_ids = torch.randint(65504, (batch, seq_len)).npu()
    position_ids = torch.arange(seq_len).reshape([1, seq_len]).npu()
    attention_mask = torch.ones([1, seq_len]).to(torch.int64).npu()
    model_inputs = {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "position_ids": position_ids,
        "attention_mask": attention_mask
    }
    torch.npu.synchronize()
    start = time.time()
    # First (full) inference pass
    outputs = model(
        **model_inputs,
        return_dict=True,
        output_attentions=False,
        output_hidden_states=False,
    )
    torch.npu.synchronize()
    end = time.time()
    first_time = (end - start) * 1000
    print(f"first token: {first_time}ms")

    # Remaining cycles measure single-token (incremental) inference
    sum_time = 0
    test_cycle -= 1
    input_ids = torch.randint(65504, (batch, 1)).npu()
    position_ids = torch.randint(seq_len, (1, 1)).npu()
    attention_mask = torch.ones([1, 1]).to(torch.int64).npu()
    for i in range(test_cycle):
        past_key_values = outputs.past_key_values
        model_inputs = {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask
        }
        torch.npu.synchronize()
        start = time.time()
        # Incremental inference
        outputs = model(
            **model_inputs,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
        )
        torch.npu.synchronize()
        end = time.time()
        cur_time = (end - start) * 1000
        sum_time += cur_time
        # print(f"token_{i + 1}: {cur_time}ms")
    avg_time = sum_time / test_cycle
    return first_time, avg_time
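To cover the full test matrix described above, the measurement can be repeated over the batch_size and sequence-length grid. The following is a minimal sketch, assuming the snippet above is wrapped in a function named test_performance(model, batch, seq_len, test_cycle), that the model is already loaded on the NPU, and that test_cycle plays the role of out_seq_len (the number of generated tokens); the sample grid values are illustrative, not prescribed by the guide:

# Hypothetical sweep over the default ranges using the wrapper above
results = []
for batch in (1, 8, 16, 32):                       # batch_size: 1 ~ 32
    for input_seq_len in (32, 128, 512, 1024):     # input_seq_len: 2^5 ~ 2^10
        for out_seq_len in (32, 128, 512, 1024):   # out_seq_len: 2^5 ~ 2^10
            first_ms, avg_ms = test_performance(model, batch, input_seq_len, out_seq_len)
            results.append((batch, input_seq_len, out_seq_len, first_ms, avg_ms))
            print(f"batch={batch}, in={input_seq_len}, out={out_seq_len}: "
                  f"first token {first_ms:.2f} ms, avg incremental {avg_ms:.2f} ms")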