```python
import torch
import torch_npu

torch_npu.npu.set_compile_mode(jit_compile=False)
......
```
```python
if is_distributed:
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
    main_worker(args.gpu, ngpus_per_node, args)

def main_worker(gpu, ngpus_per_node, args):
    # The compile mode is per-process state, so each spawned worker must set
    # it again in the main function that the process runs.
    torch_npu.npu.set_compile_mode(jit_compile=False)
    ......
```
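As a quick sanity check, a minimal single-process sketch: the call takes effect for all subsequent NPU operators in that process, so it should come before the first one. Device index 0 here is an arbitrary choice for illustration.

```python
import torch
import torch_npu

torch_npu.npu.set_compile_mode(jit_compile=False)  # disable online operator compilation
torch_npu.npu.set_device(0)                        # arbitrary device for this sketch

x = torch.randn(2, 3).npu()  # first NPU op now runs without online JIT compilation
print((x * 2).cpu())
```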
```shell
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29688
export HCCL_WHITELIST_DISABLE=1
NPUS=($(seq 0 7))
export RANK_SIZE=${#NPUS[@]}
rank=0
for i in ${NPUS[@]}
do
    export DEVICE_ID=${i}
    export RANK_ID=${rank}
    echo run process ${rank}
    please input your shell script here > output_npu_${i}.log 2>&1 &
    let rank++
done
```
| Parameter | Description |
|---|---|
| MASTER_ADDR | IP address of the training server |
| MASTER_PORT | Port of the training server |
| HCCL_WHITELIST_DISABLE | HCCL backend environment variable; setting it to 1 disables the HCCL communication whitelist |
| NPUS | Specifies which NPUs to run on |
| RANK_SIZE | Number of devices (cards) to use |
| DEVICE_ID | Device ID of the card to use |
| RANK_ID | Logical rank ID of the card |
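For reference, a minimal sketch of how each launched worker might consume these variables. Reading `RANK_ID`, `RANK_SIZE`, and `DEVICE_ID` from the environment this way is an assumption about the launch script above; the `init_process_group` arguments follow the standard `torch.distributed` API with the HCCL backend that `torch_npu` registers.

```python
import os

import torch
import torch.distributed as dist
import torch_npu

rank = int(os.environ['RANK_ID'])
world_size = int(os.environ['RANK_SIZE'])
device_id = int(os.environ['DEVICE_ID'])

# Bind this process to its assigned NPU before initializing communication.
torch_npu.npu.set_device(device_id)

dist.init_process_group(
    backend='hccl',  # HCCL backend provided by torch_npu
    init_method='tcp://{}:{}'.format(os.environ['MASTER_ADDR'],
                                     os.environ['MASTER_PORT']),
    world_size=world_size,
    rank=rank,
)
```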
```python
PATH_MAPPING_CONFIG = {
    'input': {
        # Add path mapping here for downloading data before training
        # format: <local path>: <obs/s3 path>
        # For example: '/data/dataset/imagenet': 'obs://dataset/imagenet',
    },
    'output': {
        # Add path mapping here for uploading output after training
        # format: <local path>: <obs/s3 path>
        # For example: './checkpoints': 'obs://outputs/',
    }
}
```
`'input'` maps the data that must be downloaded before training; the format is `local data path at runtime: corresponding path on cloud object storage`.

`'output'` maps the data that must be uploaded after training completes; the format is `local path of the generated files: corresponding path on object storage`.
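A sketch of how `PATH_MAPPING_CONFIG` could be consumed on ModelArts using the `moxing` library; `mox.file.copy_parallel(src, dst)` is the usual OBS transfer call, but the helper names here and the availability of `moxing` in your image are assumptions.

```python
import moxing as mox  # assumes a ModelArts image that ships moxing

def download_inputs(config):
    # Before training: pull each OBS path down to its local path.
    for local_path, obs_path in config['input'].items():
        mox.file.copy_parallel(obs_path, local_path)

def upload_outputs(config):
    # After training: push local artifacts back up to OBS.
    for local_path, obs_path in config['output'].items():
        mox.file.copy_parallel(local_path, obs_path)

download_inputs(PATH_MAPPING_CONFIG)
# ... run training ...
upload_outputs(PATH_MAPPING_CONFIG)
```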