通过MindIE Torch对文本嵌入模型和重排序模型进行编译优化、权重保存的样例如下。MindIE Torch后端当前支持大部分能基于PyTorch+torch_npu运行的模型,常见模型列表可参考表1。
编译优化步骤依赖于模型运行时所需的Python库,若报错提示缺少某些第三方库,请自行下载安装(例如:nomic系列模型依赖einops)。
# Trace a text-embedding model with TorchScript, compile it with MindIE Torch
# for dynamic (batch_size, sequence_length) inputs, and save the compiled model
# next to the original weights.
import os

import torch
import mindietorch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

# Number of sample sentences fed to the tokenizer to build the tracing inputs.
BATCH_SIZE = 128
sentences = ["This is a sentence." for _ in range(BATCH_SIZE)]

with torch.no_grad():
    # load model
    model_id = '/home/data/models/bge-large-zh-v1.5'  # NOTE: change model_id to the actual model path
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id, torchscript=True)
    model.eval()

    # Build int32 sample inputs and trace the model with them.
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
    inputs['input_ids'] = inputs['input_ids'].to(torch.int32)
    inputs['attention_mask'] = inputs['attention_mask'].to(torch.int32)
    model = torch.jit.trace(model, [inputs['input_ids'], inputs['attention_mask']], strict=False)

    # compile
    # (min batch_size, min sequence_length) accepted by the compiled model; adjust as needed.
    MIN_SHAPE = (1, 1)
    # (max batch_size, max sequence_length) accepted by the compiled model; adjust as needed.
    MAX_SHAPE = (128, 512)
    # One dynamic-shape input description per traced input, in the traced order.
    dynamic_inputs = [
        mindietorch.Input(min_shape=MIN_SHAPE, max_shape=MAX_SHAPE, dtype=inputs[key].dtype)
        for key in ('input_ids', 'attention_mask')
    ]
    compiled_model = mindietorch.compile(
        model,
        inputs=dynamic_inputs,
        precision_policy=mindietorch.PrecisionPolicy.PREF_FP16,
        truncate_long_and_double=True,
        require_full_compilation=False,
        allow_tensor_replace_int=False,
        min_block_size=3,
        torch_executed_ops=[],
        # Fill in soc_version for your hardware: "xxxxx" must match the first
        # five characters of the 'Name' field printed by `npu-smi info`.
        soc_version="Ascendxxxxx",
        optimization_level=0,
    )

    # save model
    compiled_model.save(os.path.join(model_id, "compiled_model.pt"))
    print('compiled model saved!')
由于PyTorch 2.1.0 Arm版本不支持对reranker模型进行trace操作(原因是该版本下执行out_features=1的Linear操作存在待修复问题,报错为:RuntimeError: could not create a primitive descriptor for a matmul primitive),请使用torch>=2.2.0版本的Python环境执行trace操作(执行完trace.py后可继续使用2.1.0版本的torch及torch_npu)。
# Create a separate Python environment with torch 2.2.0 for the trace step.
# Use EITHER the Anaconda commands OR the venv commands below.

# For Anaconda
conda create -n myenv python=3.10
conda activate myenv
conda install sympy==1.13.3
conda install pytorch==2.2.0

# For venv
python3.10 -m venv myenv
source myenv/bin/activate  # Linux/macOS
# or
myenv\Scripts\activate  # Windows
pip install --upgrade pip
pip install torch==2.2.0
# Show the installed transformers package details, including its install location.
pip show transformers
回显示例如下所示:
/usr/local/python3.10/site-packages/transformers
# Open modeling_utils.py inside the transformers install directory for editing.
vim /usr/local/python3.10/site-packages/transformers/modeling_utils.py
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
将其修改为如下所示:
extended_attention_mask = (1.0 - extended_attention_mask) * (-1000)
# Trace a reranker (sequence-classification) model with TorchScript and save the
# traced module next to the original weights.
import os

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

# Sample (query, passage) pairs used only to produce tracing inputs.
pairs = [
    ['what is panda?', 'hi'],
    ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'],
]

with torch.no_grad():
    model_path = '/home/data/models/bge-reranker-large'  # NOTE: change model_path to the actual model path
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # NOTE(review): trust_remote_code=True allows Python code shipped with the
    # model directory to be executed; only use it with model files you trust.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, trust_remote_code=True, torchscript=True
    )
    model.eval()

    # Build int32 sample inputs and trace the model with them.
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    inputs['input_ids'] = inputs['input_ids'].to(torch.int32)
    inputs['attention_mask'] = inputs['attention_mask'].to(torch.int32)
    model = torch.jit.trace(model, [inputs['input_ids'], inputs['attention_mask']], strict=False)

    model.save(os.path.join(model_path, "traced_model.ts"))
    print('traced model saved!')
# Leave the temporary trace environment (use the command matching how it was created).

# For Anaconda
conda deactivate

# For venv
deactivate
整体编译耗时约为30分钟,请耐心等待,当打印'compiled model saved!'回显时,则表示模型编译成功。
# Load a previously traced TorchScript reranker model, compile it with
# MindIE Torch for dynamic (batch_size, sequence_length) inputs, and save the
# compiled model in the same directory.
import os

import torch
import mindietorch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import sys

with torch.no_grad():
    model_id = '/home/data/models/bge-reranker-large'  # NOTE: change model_id to the actual model path
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = torch.jit.load(os.path.join(model_id, 'traced_model.ts'))
    model.eval()

    # (min batch_size, min sequence_length) accepted by the compiled model; adjust as needed.
    MIN_SHAPE = (1, 1)
    # (max batch_size, max sequence_length) accepted by the compiled model; adjust as needed.
    MAX_SHAPE = (128, 512)
    # Two int32 dynamic-shape inputs (presumably input_ids and attention_mask,
    # in the order used when the model was traced -- confirm against the trace script).
    dynamic_inputs = [
        mindietorch.Input(min_shape=MIN_SHAPE, max_shape=MAX_SHAPE, dtype=torch.int32),
        mindietorch.Input(min_shape=MIN_SHAPE, max_shape=MAX_SHAPE, dtype=torch.int32),
    ]
    compiled_model = mindietorch.compile(
        model,
        inputs=dynamic_inputs,
        precision_policy=mindietorch.PrecisionPolicy.PREF_FP16,
        truncate_long_and_double=True,
        require_full_compilation=False,
        allow_tensor_replace_int=False,
        min_block_size=3,
        torch_executed_ops=[],
        # Fill in soc_version for your hardware: "xxxxx" must match the first
        # five characters of the 'Name' field printed by `npu-smi info`.
        soc_version="Ascendxxxxx",
        optimization_level=0,
    )

    compiled_model.save(os.path.join(model_id, "compiled_model.pt"))
    print('compiled model saved!')