Sample: Asynchronous Multi-Stream Inference with Torch_NPU
Prerequisites
This sample uses Resnet18. Running it requires Torchvision, which you must install yourself. If it is not installed, obtain the Torchvision installation package and install it as follows:
Install Torchvision with the following command:
pip install {package_name}.whl
{package_name}.whl stands for the Torchvision package torchvision-{version}-cp310-cp310-{os}_{arch}.whl; replace it with the actual file name.
The Torchvision version must strictly match the Torch version; with Torch 2.1.0, the matching Torchvision 0.16.0 is recommended.
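To confirm that the installed versions actually match, you can print the versions reported by the two packages. The following is a minimal sketch, not part of the original sample:
import torch
import torchvision

# For example, Torch 2.1.0 should pair with Torchvision 0.16.0.
print("torch version:", torch.__version__)
print("torchvision version:", torchvision.__version__)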
Python Development Environment
The Python sample is shown below:
# Torch must be imported before MindIE Torch
import torch
import torch_npu
import mindietorch
import torchvision

COSINE_THRESHOLD = 0.99

# Compute the cosine similarity between two tensors
def cosine_similarity(gt_tensor, pred_tensor):
    gt_tensor = gt_tensor.flatten().to(torch.float32)
    pred_tensor = pred_tensor.flatten().to(torch.float32)
    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
            return 1.0
    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
    res = res.cpu().detach().item()
    return res

if __name__ == "__main__":
    # 1. Load the resnet18 model
    batch = 32
    tensor_cpu = torch.randn([batch, 3, 224, 224], pin_memory=True)
    model_torchvision = torchvision.models.resnet18().eval()
    model = torch.jit.trace(model_torchvision, tensor_cpu)

    # 2. Compile the model; see mindietorch.compile in the Python API function reference.
    torch_npu.npu.set_device("npu:0")
    input_info = [mindietorch.Input((batch, 3, 224, 224))]
    compile_model = mindietorch.compile(model, inputs=input_info)

    # 3. Run inference
    # Reference result from the traced Torch model (CPU forward)
    torch_result = model(tensor_cpu)

    # Create one stream per pipeline stage
    stream_h2d = torch_npu.npu.Stream()      # host-to-device copies
    stream_forward = torch_npu.npu.Stream()  # model execution
    stream_d2h = torch_npu.npu.Stream()      # device-to-host copies

    # Warm up and allocate the three buffers reused by the pipeline
    input_npus = []
    output_npus = []
    output_cpu = torch.randn([batch, 1000], pin_memory=True)
    for i in range(3):
        input_npu = tensor_cpu.to("npu:0")
        output_npu = compile_model(input_npu)
        output_cpu.copy_(output_npu)
        input_npus.append(input_npu)
        output_npus.append(output_npu)

    loop_time = 100
    result = True
    # Software pipeline: in iteration i, H2D loads buffer i % 3, forward
    # consumes buffer (i - 1) % 3, and D2H drains buffer (i - 2) % 3, so
    # copies and computation from different iterations overlap.
    for i in range(loop_time + 2):
        # dispatch
        if i >= 1 and i < loop_time + 1:
            with torch_npu.npu.stream(stream_forward):
                output_npus[(i - 1) % 3] = compile_model(input_npus[(i - 1) % 3])
        if i > 0 and i < loop_time:
            with torch_npu.npu.stream(stream_h2d):
                input_npus[i % 3] = tensor_cpu.to("npu:0", non_blocking=True)
        if i >= 2:
            with torch_npu.npu.stream(stream_d2h):
                output_cpu.copy_(output_npus[(i - 2) % 3], non_blocking=True)
        # Stream synchronization; see the synchronize() method of
        # mindietorch.npu.Stream in the Python API class reference.
        if i >= 2:
            stream_d2h.synchronize()
            cos_sim = cosine_similarity(output_cpu, torch_result)
            if cos_sim < COSINE_THRESHOLD:
                result = False
        if i > 0 and i < loop_time:
            stream_h2d.synchronize()
        if i >= 1 and i < loop_time + 1:
            stream_forward.synchronize()

    if result:
        print("test success")
    else:
        print("test error")
C++ Development Environment
The C++ sample is shown below:
#include <torch/torch.h>
#include <torch/script.h>
#include "torch_npu/library_npu.h"
#include "torch_npu/csrc/core/npu/NPUStream.h"
#include "cpp/include/torch_aie.h"
int main()
{
// 1. 加载resnet50模型
// init npu device
auto device = at::Device(at_npu::key::NativeDeviceType, 0);
torch_npu::init_npu(device);
const std::string modelPath = "Resnet50.pth"; // resnet50 torchsctipt module
torch::jit::script::Module module = torch::jit::load(modelPath);
module.to(device);
int batch = 64;
// 2. 模型编译
std::vector<int64_t> shape = { batch, 3, 224, 224 };
std::vector<torch_aie::Input> inputs;
inputs.emplace_back(torch_aie::Input(shape, torch_aie::DataType::FLOAT, torch_aie::TensorFormat::NCHW));
torch_aie::torchscript::CompileSpec compileSpec(inputs);
auto compiledModule = torch_aie::torchscript::compile(module, compileSpec);
torch::jit::setGraphExecutorOptimize(false);
// 3. 创建stream
c10_npu::NPUStream streamH2d = c10_npu::getNPUStreamFromPool();
c10_npu::NPUStream streamForward = c10_npu::getNPUStreamFromPool();
c10_npu::NPUStream streamD2h = c10_npu::getNPUStreamFromPool();
// 4. 准备数据和内存
std::vector<at::Tensor> intputNpus;
std::vector<at::Tensor> outputNpus;
auto optionCpu = torch::TensorOptions().device(at::Device("cpu")).layout(torch::kStrided)
.pinned_memory(true);
auto inputCpu = torch::ones({ batch, 3, 224, 224 }, optionCpu) * 0.5;
auto outputCpu = at::empty({ batch, 1000 }, optionCpu);
// 5. 推理预热
for (int i = 0; i < 3; i++) {
auto inputNpu = inputCpu.to(device);
auto outputNpu = compiledModule({ inputNpu }).toTensor();
outputCpu.copy_(outputNpu);
intputNpus.push_back(inputNpu);
outputNpus.push_back(outputNpu);
}
// 6. 执行torch_aie forward
int loop_time = 50;
for (int i = 0; i < loop_time + 2; i++) {
// dispatch
if (i >= 1 and i < loop_time + 1) {
c10_npu::setCurrentNPUStream(streamForward);
outputNpus[(i - 1) % 3] = compiledModule( { intputNpus[(i - 1) % 3] }).toTensor();
}
if (i > 0 and i < loop_time) {
c10_npu::setCurrentNPUStream(streamH2d);
intputNpus[i % 3] = inputCpu.to(device, true);
}
if (i >= 2) {
c10_npu::setCurrentNPUStream(streamD2h);
outputCpu.copy_(outputNpus[(i - 2) % 3], true);
}
// 流同步,具体参考python接口的类参考中的mindietorch.npu.Stream的方法synchronize()
if (i >= 2) {
streamD2h.synchronize();
// process data after synchronize
// ...
}
if (i > 0 and i < loop_time) {
streamH2d.synchronize();
}
if (i >= 1 and i < loop_time + 1) {
streamForward.synchronize();
}
}
torch_aie::finalize();
return 0;
}