# Excerpt of an elastic-training job manifest: a ConfigMap followed by a
# Volcano Job. Every "# ..." line marks content omitted from this excerpt
# (written as a comment because a bare "..." is YAML's document-end marker).
apiVersion: v1
kind: ConfigMap
metadata:
  # The part after "rings-config-" must match the job name.
  name: rings-config-mindx-dls-test
  # ...
  labels:
    # Product type of the chip used by the job.
    ring-controller.atlas: ascend-910
# ...
---
# Must not be changed: the Volcano API has to be used.
apiVersion: batch.volcano.sh/v1alpha1
# Only the Job kind is supported at present.
kind: Job
metadata:
  # Job name; may be customized.
  name: mindx-dls-test
  labels:
    # Product type of the chip used by the job.
    ring-controller.atlas: ascend-910
    # Enables fault rescheduling.
    fault-scheduling: "grace"
    # Enables elastic training. The value must stay in double quotes
    # (an unquoted on is read as a boolean by YAML 1.1 parsers).
    elastic-scheduling: "on"
  annotations:
    # Minimum number of replicas.
    minReplicas: "1"
# ...
spec:
  # Set to 1.
  minAvailable: 1
  # ...
  # Set to 0.
  maxRetry: 0
  # ... (omitted; presumably includes the parent "tasks:" key of the list
  # entry below -- confirm against the full manifest)
    - name: "default-test"
      template:
        metadata:
          # ...
        spec:
          # ... (omitted; presumably includes the "containers:" list entry
          # that owns env and resources -- confirm against the full manifest)
              env:
                # ...
                # Ascend Docker Runtime uses this variable.
                - name: ASCEND_VISIBLE_DEVICES
                  valueFrom:
                    fieldRef:
                      # Must be consistent with resources.requests below.
                      fieldPath: metadata.annotations['huawei.com/Ascend910']
              # ...
              resources:
                requests:
                  # Number of NPU chips required: 8.
                  huawei.com/Ascend910: 8
                limits:
                  # Currently must be the same as requests above.
                  huawei.com/Ascend910: 8
          # ...
          nodeSelector:
            # Set according to the actual environment.
            host-arch: huawei-arm
# ...
# Excerpt: memory-backed volume mounted at /dev/shm, added for
# elastic-training scale-up. Every "# ..." line marks content omitted from
# this excerpt; indentation shows relative nesting only (volumeMounts is a
# container field, volumes is a pod-spec field).
# ...
    # Elastic-training scale-up.
    volumeMounts:
      - name: shm
        mountPath: /dev/shm
volumes:
  - name: shm
    emptyDir:
      medium: Memory
      sizeLimit: 16Gi
# ...
# Excerpt: resource requests and limits for the training container.
# Requests and limits carry the same values. Every "# ..." line marks
# content omitted from this excerpt.
# ...
resources:
  requests:
    huawei.com/Ascend910: 8
    cpu: 100m
    memory: 100Gi
  limits:
    huawei.com/Ascend910: 8
    cpu: 100m
    memory: 100Gi
# ...
从昇腾镜像仓库拉取的基础镜像中不包含训练脚本、代码等文件,训练时通常使用挂载的方式将训练脚本、代码等文件映射到容器内。
# Mount points that map the training scripts, dataset and output
# directory into the container.
volumeMounts:
  - name: ascend-910-config
    mountPath: /user/serverid/devindex/config
  - name: code
    # Training-script path inside the container.
    mountPath: /job/code
  - name: data
    # Training-dataset path inside the container.
    mountPath: /job/data
  - name: output
    # Training-output path inside the container.
    mountPath: /job/output
# Container start command: runs train_start.sh from /job/code/scripts with
# tensorflow/resnet_ctl_imagenet_main.py and its training flags.
# The command string is kept on one line so its bytes stay unchanged.
# The trailing "# ..." marks content omitted from this excerpt.
command:
  - "/bin/bash"
  - "-c"
  - "cd /job/code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ /job/output/ tensorflow/resnet_ctl_imagenet_main.py --data_dir=/job/data/imagenet_TF --distribution_strategy=one_device --use_tf_while_loop=true --epochs_between_evals=1 --skip_eval --enable_checkpoint_and_export;"
# ...
# Container start command: runs train_start.sh from /job/code/scripts with
# main.py and its training flags (including --dist-backend='hccl').
# The command string is kept on one line so its bytes stay unchanged.
# The trailing "# ..." marks content omitted from this excerpt.
command:
  - "/bin/bash"
  - "-c"
  - "cd /job/code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ /job/output/ main.py --data=/job/data/resnet50/imagenet --amp --arch=resnet50 --seed=49 -j=128 --lr=1.6 --dist-backend='hccl' --multiprocessing-distributed --epochs=90 --batch-size=1024 --resume=true;"
# ...
# Excerpt: container mount points and the NFS volumes that back them.
# Every "# ..." line marks content omitted from this excerpt; indentation
# shows relative nesting only (volumeMounts is a container field, volumes
# is a pod-spec field). The "xxxxxx" paths are placeholders to fill in.
# ...
    volumeMounts:
      - name: ascend-910-config
        mountPath: /user/serverid/devindex/config
      - name: code
        # Training-script path inside the container.
        mountPath: /job/code
      - name: data
        # Training-dataset path inside the container.
        mountPath: /job/data
      - name: output
        # Training-output path inside the container.
        mountPath: /job/output
# ...
volumes:
  # ...
  - name: code
    nfs:
      # IP address of the NFS server.
      server: 127.0.0.1
      # Path of the training scripts.
      path: "xxxxxx"
  - name: data
    nfs:
      server: 127.0.0.1
      # Path of the training dataset.
      path: "xxxxxx"
  - name: output
    nfs:
      server: 127.0.0.1
      # Path where the model is saved, as configured for the script.
      path: "xxxxxx"
# ...