微调功能
微调功能主要基于 sentence-transformers 框架提供的 SentenceTransformerTrainer 实现：使用前述“微调合成数据自动生成方法”生成的数据集，对 embedding 模型进行微调。相关训练参数及超参数请根据实际情况进行调整。
调用示例
import torch
import torch_npu
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer
# Fine-tune an embedding model with SentenceTransformerTrainer on (query, corpus) pairs.

# Bind to the first NPU only when one is actually available; otherwise fall back to CPU
# so that device selection and model placement stay consistent with each other.
if torch.npu.is_available():
    torch.npu.set_device(torch.device("npu:0"))
    device = "npu"
else:
    device = "cpu"

# "model_path" is a placeholder for the embedding model to fine-tune.
model = SentenceTransformer("model_path", device=device)
train_loss = MultipleNegativesRankingLoss(model)

# "train_data.jsonl" is a placeholder for the generated fine-tuning dataset.
train_dataset = load_dataset("json", data_files="train_data.jsonl", split="train")

# NOTE(review): optim="adamw_torch_fused" may not be supported on every
# device / torch version -- confirm for the target environment.
args = SentenceTransformerTrainingArguments(
    output_dir="output_dir",  # output directory for the training run
    num_train_epochs=4,  # number of epochs
    per_device_train_batch_size=8,  # train batch size per device
    gradient_accumulation_steps=16,  # effective batch size of 8 * 16 = 128 per device
    warmup_ratio=0.1,  # warmup ratio
    learning_rate=2e-5,  # learning rate; tune for the actual dataset
    lr_scheduler_type="cosine",  # use a cosine learning rate scheduler
    optim="adamw_torch_fused",  # use fused AdamW optimizer
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    logging_steps=10,  # log every 10 steps
)

trainer = SentenceTransformerTrainer(
    model=model,  # the embedding model loaded above
    args=args,  # training arguments
    train_dataset=train_dataset.select_columns(["query", "corpus"]),  # keep only the two columns used for training
    loss=train_loss,
)
trainer.train()
trainer.save_model()
父主题: 模型评估和微调方法