HCCL Controller或Ascend Operator将在训练启动时，为训练任务生成集合通信所需的RankTable文件。集合通信根据RankTable文件中的设备ID以及设备IP构建集合通信域，完成集合通信的信息交换。
hccl.json文件示例如下:
hccl.json:
----
{
"status": "completed", // HCCL Controller是否写入完成
"server_list": [{ // 节点列表
"device": [{ // NPU列表
"device_id": "1", // NPU的设备ID
"device_ip": "192.168.101.xx", // NPU的设备IP
"rank_id": "0" // NPU对应的训练rank id
}, {
"device_id": "2",
"device_ip": "192.168.102.xx",
"rank_id": "1"
}, {
"device_id": "3",
"device_ip": "192.168.103.xx",
"rank_id": "2"
}, {
...
}],
"server_id": "192.168.101.xx", // 节点IP
"container_ip": "192.168.149.xx" // Pod IP
}],
"server_count": "1", // 任务总服务器数量
"version": "1.0"
}
{
    "version": "1.2",
    "server_count": "1",
    "server_list": [ // 节点列表
        {
            "server_id": "xx.xx.xx.xx", // 节点的IP
            "device": [
                {
                    "device_id": "0", // NPU的设备ID
                    "device_ip": "xx.xx.xx.xx", // NPU的设备IP
                    "rank_id": "0", // NPU对应的训练rank id
                    "super_device_id": "37748736" // 超节点内NPU的唯一标识
                },
                ...
                {
                    "device_id": "7",
                    "device_ip": "xx.xx.xx.xx",
                    "rank_id": "7",
                    "super_device_id": "38600711"
                }
            ]
        }
    ],
    "super_pod_list": [ // 超节点列表
        {
            "super_pod_id": "0", // 逻辑超节点ID
            "server_list": [
                {
                    "server_id": "xx.xx.xx.xx" // 节点的IP
                }
            ]
        }
    ],
    "status": "completed" // 状态
}