本章节旨在帮助用户快速熟悉如何下发一个使用NPU的任务。基于这些基本流程,用户可以继续尝试后续章节的内容,例如使用命令行或通过编程的方式下发NPU任务,最后还可以融入高级特性。本章节基于典型场景中的内容进行说明。
典型场景章节中的NPU训练任务分为以下两种情况:
---
# ConfigMap that the training pod mounts through its "ascend-910-config"
# volume; the workload manifests in this file reference it by name.
apiVersion: v1
kind: ConfigMap
metadata:
  # Must stay identical to the configMap.name used in the workload's volumes.
  name: rings-config-mindx-dls-test
  namespace: vcjob
  labels:
    ring-controller.atlas: ascend-910
data:
  # Initial content only. NOTE(review): presumably rewritten by a controller
  # once devices are allocated -- confirm against the ring-controller docs.
  hccl.json: |
    {
        "status":"initializing"
    }
---
# Volcano Job that runs one training pod requesting 8 Ascend910 devices.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: mindx-dls-test
  namespace: vcjob
  labels:
    ring-controller.atlas: ascend-910
spec:
  minAvailable: 1
  schedulerName: volcano
  maxRetry: 3
  queue: default
  tasks:
    - name: "default-test"
      replicas: 1
      template:
        metadata:
          labels:
            app: tf
            ring-controller.atlas: ascend-910
        spec:
          containers:
            - image: tf_arm64:b030
              imagePullPolicy: IfNotPresent
              name: tf
              env:
                # Set to the pod's own name via the downward API.
                - name: mindx-dls-test
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                # Set to the IP of the node the pod is scheduled on.
                - name: XDL_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.hostIP
              # Placeholder: replace with the real training command.
              # Kubernetes expects a list of strings here.
              command: xxxxxxx
              resources:
                # Extended resources must have requests equal to limits.
                requests:
                  huawei.com/Ascend910: 8
                limits:
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: ascend-910-config
                  mountPath: /user/serverid/devindex/config
          # Must match the architecture of the container image (arm64 here).
          nodeSelector:
            host-arch: huawei-arm
          volumes:
            - name: ascend-910-config
              configMap:
                # Name of the ConfigMap defined alongside this Job.
                name: rings-config-mindx-dls-test
          restartPolicy: OnFailure
---
# Deployment that runs one training pod requesting 8 Ascend910 devices,
# scheduled by volcano.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mindx-dls-test
  labels:
    app: tf
    ring-controller.atlas: ascend-910
  namespace: vcjob
spec:
  replicas: 1
  selector:
    matchLabels:
      app: tf
  template:
    metadata:
      labels:
        app: tf
        ring-controller.atlas: ascend-910
        deploy-name: mindx-dls-test
    spec:
      schedulerName: volcano
      # Must match the architecture of the container image. The image below is
      # an arm64 build, so the pod has to land on ARM nodes; switch to
      # "huawei-x86" only together with an x86_64 image.
      nodeSelector:
        host-arch: huawei-arm
      containers:
        - image: tf_arm64:b030
          imagePullPolicy: IfNotPresent
          name: tf
          env:
            # Set to the pod's own name via the downward API.
            - name: mindx-dls-test
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            # Set to the IP of the node the pod is scheduled on.
            - name: XDL_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
          # Placeholder: replace with the real training command.
          # Kubernetes expects a list of strings here.
          command: xxx
          resources:
            # Extended resources must have requests equal to limits.
            requests:
              huawei.com/Ascend910: 8
            limits:
              huawei.com/Ascend910: 8
          volumeMounts:
            - name: ascend-910-config
              mountPath: /user/serverid/devindex/config
      volumes:
        - name: ascend-910-config
          configMap:
            # Name of the ConfigMap defined alongside this Deployment.
            name: rings-config-mindx-dls-test