This document guides you through quickly installing the MindCluster NodeD, MindCluster Ascend Device Plugin, MindCluster Ascend Docker Runtime, MindCluster Volcano, MindCluster ClusterD, and MindCluster Ascend Operator components on an Atlas 800T A2 training server, and through quickly submitting a training job with the whole-card scheduling feature.
Before installing the components, make sure the cluster environment has been set up.
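As a quick sanity check (a minimal sketch; adjust to your environment), confirm that the Kubernetes control plane is reachable and that Docker is running before proceeding:

kubectl cluster-info
docker info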
# On the compute node:
mkdir /home/noded
mkdir /home/devicePlugin
mkdir /home/Ascend-docker-runtime

# On the management node:
mkdir /home/ascend-volcano
mkdir /home/ascend-operator
mkdir /home/clusterd
mkdir /home/noded
mkdir /home/devicePlugin
# On the compute node:
cd /home/noded
wget https://gitee.com/ascend/mind-cluster/releases/download/v6.0.0-RC2/Ascend-mindxdl-noded_6.0.RC2_linux-aarch64.zip
unzip Ascend-mindxdl-noded_6.0.RC2_linux-aarch64.zip
cd /home/devicePlugin
wget https://gitee.com/ascend/mind-cluster/releases/download/v6.0.0-RC2/Ascend-mindxdl-device-plugin_6.0.RC2_linux-aarch64.zip
unzip Ascend-mindxdl-device-plugin_6.0.RC2_linux-aarch64.zip
cd /home/Ascend-docker-runtime
wget https://gitee.com/ascend/mind-cluster/releases/download/v6.0.0-RC2/Ascend-docker-runtime_6.0.RC2_linux-aarch64.run

# On the management node:
cd /home/ascend-volcano
wget https://gitee.com/ascend/mind-cluster/releases/download/v6.0.0-RC2/Ascend-mindxdl-volcano_6.0.RC2_linux-aarch64.zip
unzip Ascend-mindxdl-volcano_6.0.RC2_linux-aarch64.zip
cd /home/ascend-operator
wget https://mindx.obs.cn-south-1.myhuaweicloud.com/OpenSource/MindX/MindX%206.0.RC2/MindX%20DL%206.0.RC2/Ascend-mindxdl-ascend-operator_6.0.RC2_linux-aarch64.zip
unzip Ascend-mindxdl-ascend-operator_6.0.RC2_linux-aarch64.zip
cd /home/clusterd
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindX/MindX%206.0.RC2/Ascend-mindxdl-clusterd_6.0.RC2_linux-aarch64.zip
unzip Ascend-mindxdl-clusterd_6.0.RC2_linux-aarch64.zip
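Optionally, confirm that each package downloaded and unzipped as expected, for example:

ls -l /home/noded /home/devicePlugin /home/Ascend-docker-runtime
ls -l /home/ascend-volcano /home/ascend-operator /home/clusterd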
# On the compute node:
docker pull ubuntu:18.04

# On the management node:
docker pull arm64v8/alpine:latest
docker tag arm64v8/alpine:latest alpine:latest
docker pull ubuntu:18.04
# On the compute node:
cd /home/noded
docker build --no-cache -t noded:v6.0.RC2 ./
cd /home/devicePlugin
docker build --no-cache -t ascend-k8sdeviceplugin:v6.0.RC2 ./

# On the management node:
cd /home/ascend-volcano/volcano-v1.7.0
docker build --no-cache -t volcanosh/vc-scheduler:v1.7.0 ./ -f ./Dockerfile-scheduler
docker build --no-cache -t volcanosh/vc-controller-manager:v1.7.0 ./ -f ./Dockerfile-controller
cd /home/ascend-operator
docker build --no-cache -t ascend-operator:v6.0.RC2 ./
cd /home/clusterd
docker build --no-cache -t clusterd:v6.0.RC2 ./
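After the builds finish, you can verify that all component images exist locally, for example:

docker images | grep -E 'noded|ascend-k8sdeviceplugin|vc-scheduler|vc-controller-manager|ascend-operator|clusterd'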
kubectl get node
NAME       STATUS   ROLES    AGE   VERSION
worker01   Ready    worker   23h   v1.17.3
kubectl label nodes worker01 node-role.kubernetes.io/worker=worker
kubectl label nodes worker01 workerselector=dls-worker-node
kubectl label nodes worker01 host-arch=huawei-arm
kubectl label nodes worker01 accelerator=huawei-Ascend910
kubectl label nodes worker01 accelerator-type=module-{xxx}b-8    # {xxx} indicates the chip model number
kubectl label nodes worker01 nodeDEnable=on
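To confirm the labels took effect, you can inspect the node, for example:

kubectl get node worker01 --show-labels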
kubectl label nodes master01 masterselector=dls-master-node
# On the compute node:
useradd -d /home/hwMindX -u 9000 -m -s /usr/sbin/nologin hwMindX
usermod -a -G HwHiAiUser hwMindX

# On the management node:
useradd -d /home/hwMindX -u 9000 -m -s /usr/sbin/nologin hwMindX
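You can verify the user and its group membership with:

id hwMindX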
# On the compute node:
mkdir -m 755 /var/log/mindx-dl
chown root:root /var/log/mindx-dl
mkdir -m 750 /var/log/mindx-dl/devicePlugin
chown root:root /var/log/mindx-dl/devicePlugin
mkdir -m 750 /var/log/mindx-dl/noded
chown hwMindX:hwMindX /var/log/mindx-dl/noded
# On the management node:
mkdir -m 755 /var/log/mindx-dl
chown root:root /var/log/mindx-dl
mkdir -m 750 /var/log/mindx-dl/volcano-controller
chown hwMindX:hwMindX /var/log/mindx-dl/volcano-controller
mkdir -m 750 /var/log/mindx-dl/volcano-scheduler
chown hwMindX:hwMindX /var/log/mindx-dl/volcano-scheduler
mkdir -m 750 /var/log/mindx-dl/ascend-operator
chown hwMindX:hwMindX /var/log/mindx-dl/ascend-operator
mkdir -m 750 /var/log/mindx-dl/clusterd
chown hwMindX:hwMindX /var/log/mindx-dl/clusterd
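To double-check the directory modes and owners created above, for example:

ls -ld /var/log/mindx-dl /var/log/mindx-dl/*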
kubectl create ns mindx-dl
cd /home/Ascend-docker-runtime
chmod u+x Ascend-docker-runtime_6.0.RC2_linux-aarch64.run
./Ascend-docker-runtime_6.0.RC2_linux-aarch64.run --install
systemctl daemon-reload && systemctl restart docker
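After Docker restarts, you can check that the Ascend runtime is in effect (the installer typically registers it in /etc/docker/daemon.json; details may vary by version):

docker info | grep -i runtime
cat /etc/docker/daemon.json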
cd /home/noded
scp noded-v6.0.RC2.yaml root@{management node IP}:/home/noded
cd /home/devicePlugin
scp device-plugin-volcano-v6.0.RC2.yaml root@{management node IP}:/home/devicePlugin
cd /home/ascend-operator
kubectl apply -f ascend-operator-v6.0.RC2.yaml
cd /home/ascend-volcano/volcano-v1.7.0    # change to v1.4.0 if using MindCluster Volcano 1.4.0
kubectl apply -f volcano-v1.7.0.yaml
cd /home/noded
kubectl apply -f noded-v6.0.RC2.yaml
cd /home/clusterd
kubectl apply -f clusterd-v6.0.RC2.yaml
cd /home/devicePlugin
kubectl apply -f device-plugin-volcano-v6.0.RC2.yaml
serviceaccount/noded created
clusterrole.rbac.authorization.k8s.io/pods-noded-role created
clusterrolebinding.rbac.authorization.k8s.io/pods-noded-rolebinding created
daemonset.apps/noded created
kubectl get pod -n mindx-dl
Taking the MindCluster NodeD component as an example, the sample output is shown below. A STATUS of Running means the component started successfully.
NAME          READY   STATUS    RESTARTS   AGE
...
noded-fd6t8   1/1     Running   0          74s
...
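To wait until every component pod reaches Running, you can watch the namespace:

kubectl get pod -n mindx-dl -w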
From the Ascend image repository, download the 24.0.RC2 ascend-pytorch training base image that matches your system architecture (Arm/x86_64); this version pairs with CANN 8.0.RC2 and Ascend HDK 24.1.RC2. Modify the base image so that the container's default user is root. The base image does not include training scripts or code; at training time these files are typically mapped into the container via mounts.
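One way to switch the default user to root is a small overlay image. This is a minimal sketch: the FROM line is a placeholder for the actual base image name and tag you downloaded, and the resulting tag pytorch-test:latest matches the job YAML used later.

# Dockerfile (sketch; replace the FROM line with the actual ascend-pytorch base image)
FROM {ascend-pytorch base image}:24.0.RC2
USER root

docker build --no-cache -t pytorch-test:latest ./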
root@ubuntu:/data/atlas_dls/public/dataset/resnet50/imagenet# pwd
/data/atlas_dls/public/dataset/resnet50/imagenet
def main():
    args = parser.parse_args()
    os.environ['MASTER_ADDR'] = args.addr
    # os.environ['MASTER_PORT'] = '29501'  # comment out this line
    if os.getenv('ALLOW_FP32', False) and os.getenv('ALLOW_HF32', False):
        raise RuntimeError('ALLOW_FP32 and ALLOW_HF32 cannot be set at the same time!')
    elif os.getenv('ALLOW_HF32', False):
        torch.npu.conv.allow_hf32 = True
    elif os.getenv('ALLOW_FP32', False):
        torch.npu.conv.allow_hf32 = False
        torch.npu.matmul.allow_hf32 = False
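Given the logic above, HF32 can be toggled through the environment when launching training; note that ALLOW_FP32 and ALLOW_HF32 must not be set at the same time, and os.getenv treats any non-empty value as set. For example:

export ALLOW_HF32=1    # enable HF32 for convolution; unset both variables to keep defaults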
root@ubuntu:/data/atlas_dls/public/code/ResNet50_ID4149_for_PyTorch/scripts#
scripts/
├── train_start.sh
apiVersion: mindxdl.gitee.com/v1
kind: AscendJob
...
spec:
  ...
  replicaSpecs:
    Master:
      ...
      spec:
        nodeSelector:
          host-arch: huawei-arm
          accelerator-type: module-{xxx}b-8    # changed from the original card-{xxx}b-2; {xxx} indicates the chip model number
        containers:
          - name: ascend
            image: pytorch-test:latest    # change to the image name obtained in step 1
            ...
            resources:
              limits:
                huawei.com/Ascend910: 1
              requests:
                huawei.com/Ascend910: 1
  ...
kubectl apply -f pytorch_standalone_acjob_{xxx}b.yaml
kubectl get pod --all-namespaces -o wide
NAMESPACE   NAME                             READY   STATUS    RESTARTS   AGE   IP                NODE       NOMINATED NODE   READINESS GATES
default     default-test-pytorch-master-0    1/1     Running   0          6s    192.168.244.xxx   worker01   <none>           <none>
kubectl logs -n {namespace} {pod name}
For example:
kubectl logs -n default default-test-pytorch-master-0
[20240517-11:33:27] [MindXDL Service Log]Training start at 2024-05-17-11:33:27
/usr/local/python3.9.2/lib/python3.9/site-packages/torch_npu/utils/path_manager.py:79: UserWarning: Warning: The /usr/local/Ascend/ascend-toolkit/latest owner does not match the current user.
  warnings.warn(f"Warning: The {path} owner does not match the current user.")
/usr/local/python3.9.2/lib/python3.9/site-packages/torch_npu/utils/path_manager.py:79: UserWarning: Warning: The /usr/local/Ascend/ascend-toolkit/8.0.RC2.alpha003/aarch64-linux/ascend_toolkit_install.info owner does not match the current user.
  warnings.warn(f"Warning: The {path} owner does not match the current user.")
/usr/local/python3.9.2/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension:
  warn(f"Failed to load image Python extension: {e}")
/job/code/main.py:201: UserWarning: You have chosen to seed training. This will turn on the CUDNN deterministic setting, which can slow down your training considerably! You may see unexpected behavior when restarting from checkpoints.
  warnings.warn('You have chosen to seed training. '
/job/code/main.py:208: UserWarning: You have chosen a specific GPU. This will completely disable data parallelism.
  warnings.warn('You have chosen a specific GPU. This will completely '
Use GPU: 0 for training
=> creating model 'resnet50'
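To stream the training log continuously instead of taking a one-off snapshot, add the -f flag:

kubectl logs -f -n default default-test-pytorch-master-0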