```python
import torch
import torch_npu

torch_npu.npu.set_compile_mode(jit_compile=False)
......
```
```python
if is_distributed:
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
    main_worker(args.gpu, ngpus_per_node, args)

def main_worker(gpu, ngpus_per_node, args):
    # Add this in the main function launched for each process
    torch_npu.npu.set_compile_mode(jit_compile=False)
    ......
```
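For context, a minimal self-contained sketch (not part of the original snippet) of how `main_worker` might continue after `set_compile_mode` when launched with `mp.spawn`: each spawned process binds to its NPU and joins the HCCL process group. The default `env://` rendezvous and the use of the spawn index as the rank are assumptions for illustration.

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch_npu


def main_worker(gpu, ngpus_per_node, args):
    torch_npu.npu.set_compile_mode(jit_compile=False)
    # mp.spawn passes the process index as `gpu`; bind this process to its NPU.
    torch_npu.npu.set_device(f"npu:{gpu}")
    # The default env:// rendezvous reads MASTER_ADDR/MASTER_PORT from the environment.
    dist.init_process_group(backend="hccl", world_size=ngpus_per_node, rank=gpu)
    print(f"rank {dist.get_rank()}/{dist.get_world_size()} ready on npu:{gpu}")
    dist.destroy_process_group()


if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29688")
    ngpus_per_node = torch_npu.npu.device_count()
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, None))
```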
```shell
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29688
export HCCL_WHITELIST_DISABLE=1
NPUS=($(seq 0 7))
export RANK_SIZE=${#NPUS[@]}
rank=0
for i in ${NPUS[@]}
do
    export DEVICE_ID=${i}
    export RANK_ID=${rank}
    echo run process ${rank}
    # Replace the placeholder below with your actual training command.
    please input your shell script here > output_npu_${i}.log 2>&1 &
    let rank++
done
```
| Parameter | Description |
| --- | --- |
| MASTER_ADDR | Specifies the IP address of the training server. |
| MASTER_PORT | Specifies the port of the training server. |
| HCCL_WHITELIST_DISABLE | HCCL backend setting; setting it to 1 disables the HCCL communication whitelist. |
| NPUS | Specifies the NPUs on which to run. |
| RANK_SIZE | Specifies the number of devices (cards) used. |
| DEVICE_ID | Specifies the device_id of the device to use. |
| RANK_ID | Specifies the logical ID (rank) of the device. |
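As a complement to the table above, here is a minimal sketch, an assumption rather than part of the original guide, of how the per-process training script started by the launch script might consume these environment variables to bind its NPU and join the HCCL process group.

```python
import os

import torch
import torch.distributed as dist
import torch_npu

torch_npu.npu.set_compile_mode(jit_compile=False)

device_id = int(os.environ["DEVICE_ID"])   # exported per process by the launch script
rank = int(os.environ["RANK_ID"])          # logical rank of this process
world_size = int(os.environ["RANK_SIZE"])  # total number of devices in the job

# Bind this process to its NPU, then join the HCCL process group.
torch_npu.npu.set_device(f"npu:{device_id}")
dist.init_process_group(
    backend="hccl",
    init_method=f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}",
    world_size=world_size,
    rank=rank,
)
print(f"rank {rank}/{world_size} initialized on npu:{device_id}")
dist.destroy_process_group()
```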