以PyTorch框架调用为例，只需要在对应的Python代码侧替换对应的原始Module即可，具体样例可参考“pytorch/examples/xx/”目录：

调用脚本中对应NPU的初始化设置。

# 1. Environment setup
torch.npu.set_compile_mode(jit_compile=False)  # Enable binary optimization to avoid compilation problems with dynamic shapes
local_rank, world_size = setup_model_parallel()  # Initialize parallel inference

# 2. Initialize the tokenizer and model
# Both are loaded from the current directory; the model is converted to half
# precision and moved to the NPU.
tokenizer = AutoTokenizer.from_pretrained("./", trust_remote_code=True)
model = AutoModel.from_pretrained("./", trust_remote_code=True).half().npu()

# 3. Optimize the ND/NZ weight layout to eliminate transdata; format support differs between chips.
soc_version = torch_npu._C._npu_get_soc_version()
if soc_version in [104, 220, 221, 222, 223]:
    # These SoC versions support ND: cast every Linear weight to format 2.
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            module.weight.data = module.weight.data.npu_format_cast(2)
    # Report once, after all Linear weights have been cast.
    print("soc_version:", soc_version, " is ***B, support ND")
else:
    # All other SoC versions use NZ: cast every Linear weight to format 29.
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if name == "lm_head":
                # Give lm_head its own Parameter before the cast.
                # NOTE(review): presumably because its weight is shared with the
                # embedding, which stays in format 2 below — confirm.
                module.weight = torch.nn.parameter.Parameter(module.weight.data)
            module.weight.data = module.weight.data.npu_format_cast(29)
    print("soc_version:", soc_version, " is not ***B, support NZ")

# Embedding weights are cast to format 2 (ND) regardless of the SoC version.
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Embedding):
        module.weight.data = module.weight.data.npu_format_cast(2)
...  # Subsequent inference can be run on top of the initialization above.

具体模型调用文件修改。

# 4. Load the acceleration library LIB file
# The shared object is located under ACLTRANSFORMER_HOME_PATH and registered
# with torch.classes so its classes can be used from Python.
LIB_PATH = os.path.join(ACLTRANSFORMER_HOME_PATH, "lib/libacltransformer_torch.so")
torch.classes.load_library(LIB_PATH)
...

# 5. Initialize the acceleration library Model, using ChatGlm6B as the example
# The operation parameters are read from attributes of the enclosing object
# and serialized to a JSON string.
acl_param = json.dumps({"transKey": True, "dk": self.hidden_size_per_attention_head, "headNum": self.num_attention_heads, "layerNum": self.num_layers,
                        "layerNormEps": self.layernorm_epsilon, "residualAddScale": math.sqrt(2 * self.num_layers), "beginNormAxis": 2})
# The model implementation is selected by name from the loaded library.
self.acl_decoder_operation = torch.classes.ModelTorch.ModelTorch("ChatGlm6BDecoderWithoutFusionModel")

# 6. Initialize the parameters and load the weights
self.acl_decoder_operation.set_param(acl_param)
self.acl_decoder_operation.set_weight(weights)
...

# 7. Run inference
# NOTE(review): the original trailing note read "self.acl_decoder_operation->LIST[Tensor]";
# presumably the inputs passed to execute() are a list of Tensors — confirm.
acl_model_out = self.acl_decoder_operation.execute(self.acl_decoder_operation_inputs, acl_param)

框架接口调用