以Resnet18为例进行样例展示,运行该样例需要用户自行安装Torchvision,如未安装,请单击链接获取Torchvision安装包并参照如下方法进行安装:
使用以下命令安装Torchvision。
pip install 软件包名.whl
软件包名.whl表示Torchvision安装包torchvision-{version}-cp310-cp310-{os}_{arch}.whl,请根据实际包名进行替换。
Torchvision版本与Torch版本严格配套使用,推荐使用与Torch2.1.0配套的0.16.0版本。
# 请务必先导入Torch,再导入MindIE Torch
import torch
import torch_npu
import mindietorch
import torchvision
# Similarity threshold above which two outputs are considered equivalent.
COSINE_THRESHOLD = 0.99


def cosine_similarity(gt_tensor, pred_tensor):
    """Return the cosine similarity between two tensors.

    Both tensors are flattened to 1-D and cast to float32 before the
    comparison, so inputs of any shape/dtype are accepted.

    Args:
        gt_tensor: reference ("ground truth") tensor.
        pred_tensor: tensor to compare against the reference.

    Returns:
        float: cosine similarity in [-1, 1]; 1.0 when both tensors sum to
        zero and are elementwise close (cosine is ill-defined there).
    """
    gt_flat = gt_tensor.flatten().to(torch.float32)
    pred_flat = pred_tensor.flatten().to(torch.float32)
    # Guard the degenerate case: a zero-sum tensor may be (near-)zero
    # everywhere, where cosine similarity is numerically meaningless.
    if torch.sum(gt_flat) == 0.0 or torch.sum(pred_flat) == 0.0:
        if torch.allclose(gt_flat, pred_flat, atol=1e-4, rtol=1e-4, equal_nan=True):
            return 1.0
    similarity = torch.nn.functional.cosine_similarity(gt_flat, pred_flat, dim=0, eps=1e-6)
    return similarity.cpu().detach().item()
if __name__ == "__main__":
    # Sample: triple-buffered, three-stream pipelined inference of ResNet18
    # on an Ascend NPU via MindIE Torch. H2D copy, forward, and D2H copy of
    # consecutive iterations overlap on separate streams.

    # 1. Load the resnet18 model and trace it to TorchScript.
    batch = 32
    # Pinned (page-locked) host memory enables asynchronous H2D/D2H copies.
    tensor_cpu = torch.randn([batch, 3, 224, 224], pin_memory = True)
    model_torchvision = torchvision.models.resnet18().eval()
    model = torch.jit.trace(model_torchvision, tensor_cpu)

    # 2. Compile the model for the NPU; see mindietorch.compile in the
    #    Python API reference for details.
    torch_npu.npu.set_device("npu:0")
    input = [ mindietorch.Input((batch, 3, 224, 224)), ]
    compile_model = mindietorch.compile(model, inputs = input)

    # 3. Inference.
    # CPU-side torch forward: reference output for the accuracy check below.
    torch_result = model(tensor_cpu)

    # Create one stream per pipeline stage (host-to-device copy, forward,
    # device-to-host copy) so the stages can overlap across iterations.
    stream_h2d = torch_npu.npu.Stream()
    stream_forward = torch_npu.npu.Stream()
    stream_d2h = torch_npu.npu.Stream()

    # Warm-up: run 3 synchronous iterations and keep the 3 device-side
    # input/output tensors alive — they are reused as a rotating triple
    # buffer in the pipelined loop below.
    input_npus = []
    output_npus = []
    # Pinned host buffer that receives each iteration's device output.
    output_cpu = torch.randn([batch, 1000], pin_memory = True)
    for i in range(3):
        input_npu = tensor_cpu.to("npu:0")
        output_npu = compile_model(input_npu)
        output_cpu.copy_(output_npu)
        input_npus.append(input_npu)
        output_npus.append(output_npu)

    loop_time = 100
    result = True
    # loop_time + 2 iterations: the pipeline has 2 extra steps to drain
    # (forward of the last input at i == loop_time, D2H at i == loop_time + 1).
    for i in range(loop_time + 2):
        # Dispatch phase: enqueue async work on the three streams.
        if i >= 1 and i < loop_time + 1:
            # Forward of the input produced at iteration i - 1.
            with torch_npu.npu.stream(stream_forward):
                output_npus[(i - 1) % 3] = compile_model(input_npus[(i - 1) % 3])
        if i > 0 and i < loop_time:
            # Async H2D copy of the next input into the free buffer slot.
            with torch_npu.npu.stream(stream_h2d):
                input_npus[i % 3] = tensor_cpu.to("npu:0", non_blocking = True)
        if i >= 2:
            # Async D2H copy of the output computed at iteration i - 1.
            with torch_npu.npu.stream(stream_d2h):
                output_cpu.copy_(output_npus[(i - 2) % 3], non_blocking = True)

        # Synchronization phase; see mindietorch.npu.Stream.synchronize()
        # in the Python API class reference.
        if i >= 2:
            stream_d2h.synchronize()
            # Output is now valid on the host: compare with the CPU result.
            cos_sim = cosine_similarity(output_cpu, torch_result)
            if cos_sim < COSINE_THRESHOLD:
                result = False
        if i > 0 and i < loop_time:
            stream_h2d.synchronize()
        if i >= 1 and i < loop_time + 1:
            stream_forward.synchronize()

    if result:
        print("test success")
    else:
        print("test error")
C++开发环境样例如下所示:
#include <torch/torch.h>
#include <torch/script.h>
#include "torch_npu/library_npu.h"
#include "torch_npu/csrc/core/npu/NPUStream.h"
#include "cpp/include/torch_aie.h"

// Sample: triple-buffered, three-stream pipelined inference of ResNet50 on an
// Ascend NPU via torch_aie (C++ equivalent of the Python sample above).
int main()
{
    // 1. Load the resnet50 model.
    // init npu device
    auto device = at::Device(at_npu::key::NativeDeviceType, 0);
    torch_npu::init_npu(device);
    const std::string modelPath = "Resnet50.pth"; // resnet50 torchscript module
    torch::jit::script::Module module = torch::jit::load(modelPath);
    module.to(device);
    int batch = 64;

    // 2. Compile the model for the NPU.
    std::vector<int64_t> shape = { batch, 3, 224, 224 };
    std::vector<torch_aie::Input> inputs;
    inputs.emplace_back(torch_aie::Input(shape, torch_aie::DataType::FLOAT,
        torch_aie::TensorFormat::NCHW));
    torch_aie::torchscript::CompileSpec compileSpec(inputs);
    auto compiledModule = torch_aie::torchscript::compile(module, compileSpec);
    torch::jit::setGraphExecutorOptimize(false);

    // 3. Create one stream per pipeline stage (H2D copy, forward, D2H copy).
    c10_npu::NPUStream streamH2d = c10_npu::getNPUStreamFromPool();
    c10_npu::NPUStream streamForward = c10_npu::getNPUStreamFromPool();
    c10_npu::NPUStream streamD2h = c10_npu::getNPUStreamFromPool();

    // 4. Prepare data and buffers.
    // NOTE(review): "intputNpus" is a typo for "inputNpus" kept from the
    // original sample; renaming would be a code change.
    std::vector<at::Tensor> intputNpus;
    std::vector<at::Tensor> outputNpus;
    // Pinned host memory enables asynchronous H2D/D2H copies.
    auto optionCpu = torch::TensorOptions().device(at::Device("cpu")).layout(torch::kStrided)
        .pinned_memory(true);
    auto inputCpu = torch::ones({ batch, 3, 224, 224 }, optionCpu) * 0.5;
    auto outputCpu = at::empty({ batch, 1000 }, optionCpu);

    // 5. Warm-up: three synchronous iterations whose device tensors are kept
    //    as a rotating triple buffer for the pipelined loop below.
    for (int i = 0; i < 3; i++) {
        auto inputNpu = inputCpu.to(device);
        auto outputNpu = compiledModule({ inputNpu }).toTensor();
        outputCpu.copy_(outputNpu);
        intputNpus.push_back(inputNpu);
        outputNpus.push_back(outputNpu);
    }

    // 6. Run torch_aie forward.
    // loop_time + 2 iterations: two extra steps drain the pipeline
    // (final forward at i == loop_time, final D2H at i == loop_time + 1).
    int loop_time = 50;
    for (int i = 0; i < loop_time + 2; i++) {
        // dispatch
        if (i >= 1 and i < loop_time + 1) {
            // Forward of the input produced at iteration i - 1.
            c10_npu::setCurrentNPUStream(streamForward);
            outputNpus[(i - 1) % 3] = compiledModule(
                { intputNpus[(i - 1) % 3] }).toTensor();
        }
        if (i > 0 and i < loop_time) {
            // Async H2D copy (second arg `true` = non_blocking) into the free slot.
            c10_npu::setCurrentNPUStream(streamH2d);
            intputNpus[i % 3] = inputCpu.to(device, true);
        }
        if (i >= 2) {
            // Async D2H copy of the output computed at iteration i - 1.
            c10_npu::setCurrentNPUStream(streamD2h);
            outputCpu.copy_(outputNpus[(i - 2) % 3], true);
        }
        // Stream synchronization; see the synchronize() method of the
        // stream class in the API reference.
        if (i >= 2) {
            streamD2h.synchronize();
            // process data after synchronize
            // ...
        }
        if (i > 0 and i < loop_time) {
            streamH2d.synchronize();
        }
        if (i >= 1 and i < loop_time + 1) {
            streamForward.synchronize();
        }
    }
    torch_aie::finalize();
    return 0;
}