检查运行情况

操作步骤

  1. 在管理节点执行以下命令,查看Pod运行状况。

    kubectl get pod --all-namespaces -o wide
    • 单机单芯片训练任务示例。
      root@ubuntu:~# kubectl get pod --all-namespaces -o wide
      NAMESPACE        NAME                                       READY   STATUS              RESTARTS   AGE     IP                NODE           NOMINATED NODE   READINESS GATES
      default          hccl-controller-688c7cb8c6-4b88n           1/1     Running             0          8d      192.168.243.199   ubuntu         <none>           <none>
      kube-system      ascend-device-plugin-daemonset-8f2dx       1/1     Running             2          8d      192.168.243.218   ubuntu         <none>           <none>
      kube-system      ascend-device-plugin-daemonset-f2jk9       1/1     Running             1          8d      192.168.207.49    ubuntu-96      <none>           <none>
      kube-system      ascend310-device-plugin-daemonset-fls4v    1/1     Running             0          4m15s   192.168.240.66    ubuntu-infer   <none>           <none>
      kube-system      calico-kube-controllers-8464785d6b-bj4pk   1/1     Running             1          8d      192.168.243.198   ubuntu         <none>           <none>
      kube-system      calico-node-bkbvl                          1/1     Running             0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
      kube-system      calico-node-bzd7q                          1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      calico-node-fh58s                          1/1     Running             1          8d      10.174.217.96     ubuntu-96      <none>           <none>
      kube-system      coredns-6955765f44-4pdhg                   1/1     Running             0          8d      192.168.243.249   ubuntu         <none>           <none>
      kube-system      coredns-6955765f44-n9pg4                   1/1     Running             2          8d      192.168.243.237   ubuntu         <none>           <none>
      kube-system      etcd-ubuntu                                1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-controller-manager-ubuntu             1/1     Running             4          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-proxy-b5flw                           1/1     Running             1          8d      10.174.217.96     ubuntu-96      <none>           <none>
      kube-system      kube-proxy-ttsjp                           1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-proxy-zp9xw                           1/1     Running             0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
      kube-system      kube-scheduler-ubuntu                      1/1     Running             4          8d      10.174.217.94     ubuntu         <none>           <none>
      npu-exporter     npu-exporter-jwq5l                         1/1     Running             0          9h      192.168.70.80     ubuntu         <none>           <none>
      vcjob            mindx-dls-test-default-test-0              1/1     Running             0          4m      192.168.243.198   ubuntu         <none>           <none>
      volcano-system   volcano-controllers-7d6d465877-nnf7l       1/1     Running             1          8d      192.168.243.238   ubuntu         <none>           <none>
      volcano-system   volcano-scheduler-67f89949b4-ncs8q         1/1     Running             2          8d      192.168.243.211   ubuntu         <none>           <none>
    • 两个训练节点,执行2*8芯片分布式训练任务示例。
      root@ubuntu:~# kubectl get pod --all-namespaces -o wide
      NAMESPACE        NAME                                       READY   STATUS              RESTARTS   AGE     IP                NODE           NOMINATED NODE   READINESS GATES
      default          hccl-controller-688c7cb8c6-4b88n           1/1     Running             0          8d      192.168.243.199   ubuntu         <none>           <none>
      kube-system      ascend-device-plugin-daemonset-8f2dx       1/1     Running             2          8d      192.168.243.218   ubuntu         <none>           <none>
      kube-system      ascend-device-plugin-daemonset-f2jk9       1/1     Running             1          8d      192.168.207.49    ubuntu-96      <none>           <none>
      kube-system      ascend310-device-plugin-daemonset-fls4v    1/1     Running             0          4m15s   192.168.240.66    ubuntu-infer   <none>           <none>
      kube-system      calico-kube-controllers-8464785d6b-bj4pk   1/1     Running             1          8d      192.168.243.198   ubuntu         <none>           <none>
      kube-system      calico-node-bkbvl                          1/1     Running             0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
      kube-system      calico-node-bzd7q                          1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      calico-node-fh58s                          1/1     Running             1          8d      10.174.217.96     ubuntu-96      <none>           <none>
      kube-system      coredns-6955765f44-4pdhg                   1/1     Running             0          8d      192.168.243.249   ubuntu         <none>           <none>
      kube-system      coredns-6955765f44-n9pg4                   1/1     Running             2          8d      192.168.243.237   ubuntu         <none>           <none>
      kube-system      etcd-ubuntu                                1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-controller-manager-ubuntu             1/1     Running             4          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-proxy-b5flw                           1/1     Running             1          8d      10.174.217.96     ubuntu-96      <none>           <none>
      kube-system      kube-proxy-ttsjp                           1/1     Running             3          8d      10.174.217.94     ubuntu         <none>           <none>
      kube-system      kube-proxy-zp9xw                           1/1     Running             0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
      kube-system      kube-scheduler-ubuntu                      1/1     Running             4          8d      10.174.217.94     ubuntu         <none>           <none>
      npu-exporter     npu-exporter-jwq5l                         1/1     Running             0          9h      192.168.70.80     ubuntu         <none>           <none>
      vcjob            mindx-dls-test-default-test-0              1/1     Running             0          3m      192.168.243.198   ubuntu         <none>           <none>
      vcjob            mindx-dls-test-default-test-1              1/1     Running             0          3m      192.168.243.199   ubuntu         <none>           <none>
      volcano-system   volcano-controllers-7d6d465877-nnf7l       1/1     Running             1          8d      192.168.243.238   ubuntu         <none>           <none>
      volcano-system   volcano-scheduler-67f89949b4-ncs8q         1/1     Running             2          8d      192.168.243.211   ubuntu         <none>           <none>

  2. (可选)执行以下命令,查看日志。

    kubectl logs -n [Pod运行namespace] [Pod 名称]

    例如:

    kubectl logs -n vcjob mindx-dls-test-default-test-0

  3. 在管理节点执行以下命令,查看计算节点的NPU分配情况。

    kubectl describe nodes

    Allocated resources下的huawei.com/Ascend910字段表示已使用的NPU芯片数量。

    • 单机单芯片训练任务示例。
      root@ubuntu:/home/test/yaml# kubectl describe nodes
      Name:               ubuntu
      Roles:              master,worker
      Labels:             accelerator=huawei-Ascend910
                          beta.kubernetes.io/arch=arm64
                          beta.kubernetes.io/os=linux
                          host-arch=huawei-arm
                          kubernetes.io/arch=arm64
                          kubernetes.io/hostname=ubuntu
                          kubernetes.io/os=linux
                          masterselector=dls-master-node
                          node-role.kubernetes.io/master=
                          node-role.kubernetes.io/worker=worker
                          workerselector=dls-worker-node
      Annotations:        kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock
                          node.alpha.kubernetes.io/ttl: 0
                          projectcalico.org/IPv4Address: XXX.XXX.XXX.XXX/23
                          projectcalico.org/IPv4IPIPTunnelAddr: 192.168.243.192
                          volumes.kubernetes.io/controller-managed-attach-detach: true
      CreationTimestamp:  Mon, 28 Sep 2020 14:36:54 +0800
      ...
      Capacity:
        cpu:                   192
        ephemeral-storage:     1537233808Ki
        huawei.com/Ascend910:  8
        hugepages-2Mi:         0
        memory:                792307468Ki
        pods:                  110
      Allocatable:
        cpu:                   192
        ephemeral-storage:     1416714675108
        huawei.com/Ascend910:  8
        hugepages-2Mi:         0
        memory:                792205068Ki
        pods:                  110
      ...
      Allocated resources:
        (Total limits may be over 100 percent, i.e., overcommitted.)
        Resource              Requests        Limits
        --------              --------        ------
        cpu                   37250m (19%)    37500m (19%)
        memory                117536Mi (15%)  119236Mi (15%)
        ephemeral-storage     0 (0%)          0 (0%)
        huawei.com/Ascend910  1               1
      Events:                 <none>

      Allocated resources下的huawei.com/Ascend910字段的值为1,表明训练任务使用了1个NPU芯片。

    • 两个训练节点,执行2*8芯片分布式训练任务,查看其中一个节点示例。
      root@ubuntu:/home/test/yaml# kubectl describe nodes
      Name:               ubuntu
      Roles:              master,worker
      Labels:             accelerator=huawei-Ascend910
                          beta.kubernetes.io/arch=arm64
                          beta.kubernetes.io/os=linux
                          host-arch=huawei-arm
                          kubernetes.io/arch=arm64
                          kubernetes.io/hostname=ubuntu
                          kubernetes.io/os=linux
                          masterselector=dls-master-node
                          node-role.kubernetes.io/master=
                          node-role.kubernetes.io/worker=worker
                          workerselector=dls-worker-node
      Annotations:        kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock
                          node.alpha.kubernetes.io/ttl: 0
                          projectcalico.org/IPv4Address: XXX.XXX.XXX.XXX/23
                          projectcalico.org/IPv4IPIPTunnelAddr: 192.168.243.192
                          volumes.kubernetes.io/controller-managed-attach-detach: true
      CreationTimestamp:  Mon, 28 Sep 2020 14:36:54 +0800
      ...
      Capacity:
        cpu:                   192
        ephemeral-storage:     1537233808Ki
        huawei.com/Ascend910:  8
        hugepages-2Mi:         0
        memory:                792307468Ki
        pods:                  110
      Allocatable:
        cpu:                   192
        ephemeral-storage:     1416714675108
        huawei.com/Ascend910:  8
        hugepages-2Mi:         0
        memory:                792205068Ki
        pods:                  110
      ...
      Allocated resources:
        (Total limits may be over 100 percent, i.e., overcommitted.)
        Resource              Requests        Limits
        --------              --------        ------
        cpu                   37250m (19%)    37500m (19%)
        memory                117536Mi (15%)  119236Mi (15%)
        ephemeral-storage     0 (0%)          0 (0%)
        huawei.com/Ascend910  8               8
      Events:                 <none>

      Allocated resources下的huawei.com/Ascend910字段的值为8,表明分布式训练任务使用了该节点的全部8个NPU芯片。

  4. 查看Pod的NPU使用情况。

    本例中使用kubectl describe pod mindx-dls-test-default-test-0 -n vcjob命令查看运行Pod的情况。

    Annotations字段表示使用的NPU芯片信息。

    • 单机单芯片训练任务示例。
      root@ubuntu:/home/test/yaml# kubectl describe pod mindx-dls-test-default-test-0 -n vcjob
      Name:         mindx-dls-test-default-test-0
      Namespace:    vcjob
      Priority:     0
      Node:         ubuntu/XXX.XXX.XXX.XXX
      Start Time:   Wed, 30 Sep 2020 15:38:22 +0800
      Labels:       app=tf
                    ring-controller.atlas=ascend-910
                    volcano.sh/job-name=mindx-dls-test
                    volcano.sh/job-namespace=vcjob
      Annotations:  atlas.kubectl.kubernetes.io/ascend-910-configuration:
                      {"pod_name":"0","server_id":"XXX.XXX.XXX.XXX","devices":[{"device_id":"3","device_ip":"192.168.20.102"}...
                    cni.projectcalico.org/podIP: 192.168.243.195/32
                    cni.projectcalico.org/podIPs: 192.168.243.195/32
                    huawei.com/Ascend910: Ascend910-3
                    huawei.com/AscendReal: Ascend910-3
                    huawei.com/kltDev: Ascend910-3
                    predicate-time: 18446744073709551615
                    scheduling.k8s.io/group-name: mindx-dls-test
                    volcano.sh/job-name: mindx-dls-test
                    volcano.sh/job-version: 0
                    volcano.sh/task-spec: default-test
      Status:       Running
    • 两个训练节点,执行2*8芯片分布式训练任务示例。
      root@ubuntu:/home/test/yaml# kubectl describe pod mindx-dls-test-default-test-0 -n vcjob
      Name:         mindx-dls-test-default-test-0
      Namespace:    vcjob
      Priority:     0
      Node:         ubuntu/XXX.XXX.XXX.XXX
      Start Time:   Wed, 30 Sep 2020 15:38:22 +0800
      Labels:       app=tf
                    ring-controller.atlas=ascend-910
                    volcano.sh/job-name=mindx-dls-test
                    volcano.sh/job-namespace=vcjob
      Annotations:  atlas.kubectl.kubernetes.io/ascend-910-configuration:
                      {"pod_name":"0","server_id":"XXX.XXX.XXX.XXX","devices":[{"device_id":"0","device_ip":"192.168.20.100"}...
                    cni.projectcalico.org/podIP: 192.168.243.195/32
                    cni.projectcalico.org/podIPs: 192.168.243.195/32
                    huawei.com/Ascend910: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7
                    huawei.com/AscendReal: Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7,Ascend910-0
                    huawei.com/kltDev: Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7,Ascend910-0,Ascend910-1,Ascend910-2
                    predicate-time: 18446744073709551615
                    scheduling.k8s.io/group-name: mindx-dls-test
                    volcano.sh/job-name: mindx-dls-test
                    volcano.sh/job-version: 0
                    volcano.sh/task-spec: default-test
      Status:       Running