以PyTorch框架调用为例，只需在Python代码侧替换对应的原始Module即可，具体样例可参考“pytorch/examples/xx/”目录：
# 1. Environment setup
# jit_compile=False enables binary optimization, which eliminates the
# compilation problems that come with dynamic shapes.
torch.npu.set_compile_mode(jit_compile=False)
# Initialize parallel inference.
local_rank, world_size = setup_model_parallel()

# 2. Initialize the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("./", trust_remote_code=True)
model = AutoModel.from_pretrained("./", trust_remote_code=True).half().npu()

# 3. Optimize the ND / NZ weight layout to eliminate transdata.
# Format support differs between chips, so branch on the SoC version.
soc_version = torch_npu._C._npu_get_soc_version()
if soc_version in [104, 220, 221, 222, 223]:
    # These SoC versions support ND: cast every Linear weight to format 2.
    for name, module in model.named_modules():
        if not isinstance(module, torch.nn.Linear):
            continue
        module.weight.data = module.weight.data.npu_format_cast(2)
    print("soc_version:", soc_version, " is ***B, support ND")
else:
    # All other SoC versions use NZ: cast every Linear weight to format 29.
    for name, module in model.named_modules():
        if not isinstance(module, torch.nn.Linear):
            continue
        if name == "lm_head":
            # Re-wrap the lm_head weight in a fresh Parameter before casting.
            # NOTE(review): presumably this separates it from a weight shared
            # with another module -- confirm against the model definition.
            module.weight = torch.nn.parameter.Parameter(module.weight.data)
        module.weight.data = module.weight.data.npu_format_cast(29)
    print("soc_version:", soc_version, " is not ***B, support NZ")

# Embedding weights are cast to format 2 regardless of the SoC version.
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Embedding):
        module.weight.data = module.weight.data.npu_format_cast(2)
...
# Subsequent inference can be carried out on top of the initialization above.
# 4. Load the acceleration library LIB file
LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH, "lib/libacltransformer_torch.so")
torch.classes.load_library(LIB_PATH)
...
# 5. Initialize the acceleration library Model, using ChatGlm6B as the example
# The parameters are passed to the library as a JSON string.
decoder_config = {
    "transKey": True,
    "dk": self.hidden_size_per_attention_head,
    "headNum": self.num_attention_heads,
    "layerNum": self.num_layers,
    "layerNormEps": self.layernorm_epsilon,
    "residualAddScale": math.sqrt(2 * self.num_layers),
    "beginNormAxis": 2,
}
acl_param = json.dumps(decoder_config)
self.acl_decoder_operation = torch.classes.ModelTorch.ModelTorch(
    "ChatGlm6BDecoderWithoutFusionModel"
)
# 6. Initialize the parameters and load the weights
self.acl_decoder_operation.set_param(acl_param)
self.acl_decoder_operation.set_weight(weights)
...
# 7. Run inference
# NOTE(review): the original annotation reads
# "self.acl_decoder_operation->LIST[Tensor]"; presumably execute() returns a
# list of Tensors -- confirm against the library.
acl_model_out = self.acl_decoder_operation.execute(
    self.acl_decoder_operation_inputs, acl_param
)