昇腾社区首页
中文
注册

样例执行

本节以单机8卡组网、通过rank table文件配置资源信息的方式为例,介绍如何运行代码示例中的样例代码。

  1. 准备rank table文件。
    以Atlas A2 训练系列产品、单机8卡组网为例,rank_table.json配置示例如下。不同产品形态下rank table文件的配置示例及详细参数说明,可参见“rank table文件配置资源信息”。

    以下JSON文件仅为配置示例,使用时请根据实际组网信息进行配置,并删除注释。

    {
        "status":"completed",   // rank table可用标识,completed为可用
        "version": "1.0",
        "server_count": "1",   // 参与训练的AI Server数目
        "server_list": [
            {
                "server_id": "SERVER_ID_SV1",   // AI Server标识,String类型,请确保全局唯一
                "device": [                     // AI Server中的Device列表
                    {
                        "device_id": "0",
                        "device_ip": "192.168.1.8",
                        "rank_id": "0"
                    },
                    {
                        "device_id": "1",
                        "device_ip": "192.168.1.9",
                        "rank_id": "1"
                    },
                    {
                        "device_id": "2",
                        "device_ip": "192.168.1.10",
                        "rank_id": "2"
                    },
                    {
                        "device_id": "3",
                        "device_ip": "192.168.1.11",
                        "rank_id": "3"
                    },
                    {
                        "device_id": "4",
                        "device_ip": "192.168.1.12",
                        "rank_id": "4"
                    },
                    {
                        "device_id": "5",
                        "device_ip": "192.168.1.13",
                        "rank_id": "5"
                    },
                    {
                        "device_id": "6",
                        "device_ip": "192.168.1.14",
                        "rank_id": "6"
                    },
                    {
                        "device_id": "7",
                        "device_ip": "192.168.1.15",
                        "rank_id": "7"
                    }
                ]
            }
        ]
    }
  2. 构造启动脚本。

    假设脚本命名为hccl_start_8p.sh,示例如下:

    # Load the CANN software environment variables (root-user install shown).
    source /usr/local/Ascend/ascend-toolkit/set_env.sh
    # Prepend the TF Adapter Python library to the module search path;
    # ${TFPLUGIN_INSTALL_PATH} is the TF Adapter package installation path.
    export PYTHONPATH="${TFPLUGIN_INSTALL_PATH}:${PYTHONPATH}"

    export RANK_SIZE=8
    # Path of the rank table resource configuration file; replace with your own.
    export RANK_TABLE_FILE=/home/test/rank_table.json
    # User-defined job ID; may contain upper/lower-case letters, digits,
    # hyphens or underscores.
    export JOB_ID=10087

    # Start one background process per rank, RANK_ID = 0 .. RANK_SIZE-1.
    RANK_ID=0
    while ((RANK_ID < RANK_SIZE)); do
        # Each process sees its own rank, and the device ID is set to that rank.
        export RANK_ID
        export ASCEND_DEVICE_ID="${RANK_ID}"
        # Run the script; replace the path and name with your own.
        nohup python3 /home/test/hccl_test.py &
        RANK_ID=$((RANK_ID + 1))
    done
    
  3. 执行启动脚本。
    bash hccl_start_8p.sh 
    

    结果示例如下:

    ... ...
    'reduce_sum': array([[ 0,  0,  0, ...,  0,  0,  0],
           [ 0,  0,  0, ...,  0,  0,  0],
           [ 0,  0,  0, ...,  0,  0,  0],
           ...,
           [ 0,  0,  0, ...,  0,  0,  0],
           [ 0,  0,  0, ...,  0,  0,  0],
           [ 0,  0,  0, ..., 44, 44, 44]]), 'reduce_max': array([[4097, 4098, 4099, ..., 4222, 4223, 4224],
           [4225, 4226, 4227, ..., 4350, 4351, 4352],
           [4353, 4354, 4355, ..., 4478, 4479, 4480],
           ...,
           [4737, 4738, 4739, ..., 4862, 4863, 4864],
           [4865, 4866, 4867, ..., 4990, 4991, 4992],
           [4993, 4994, 4995, ...,    9,    9,    9]]), 'reduce_min': array([[0, 0, 0, ..., 0, 0, 0],
           [0, 0, 0, ..., 0, 0, 0],
           [0, 0, 0, ..., 0, 0, 0],
           ...,
           [0, 0, 0, ..., 0, 0, 0],
           [0, 0, 0, ..., 0, 0, 0],
           [0, 0, 0, ..., 2, 2, 2]]), 'reduce_prod': array([[     0,      0,      0, ...,      0,      0,      0],
           [     0,      0,      0, ...,      0,      0,      0],
           [     0,      0,      0, ...,      0,      0,      0],
           ...,
           [     0,      0,      0, ...,      0,      0,      0],
           [     0,      0,      0, ...,      0,      0,      0],
           [     0,      0,      0, ..., 362880, 362880, 362880]]), 'alltoallv_tensor': array([   1,    2,    3, ..., 8246, 8247, 8248]), 'check_tensors': array([   1,    2,    3, ..., 8246, 8247, 8248])
    train success