kubectl get pod --all-namespaces -o wide
root@ubuntu:~# kubectl get pod --all-namespaces -o wide
NAMESPACE        NAME                                       READY   STATUS    RESTARTS   AGE     IP                NODE           NOMINATED NODE   READINESS GATES
default          hccl-controller-688c7cb8c6-4b88n           1/1     Running   0          8d      192.168.243.199   ubuntu         <none>           <none>
kube-system      ascend-device-plugin-daemonset-8f2dx       1/1     Running   2          8d      192.168.243.218   ubuntu         <none>           <none>
kube-system      ascend-device-plugin-daemonset-f2jk9       1/1     Running   1          8d      192.168.207.49    ubuntu-96      <none>           <none>
kube-system      ascend310-device-plugin-daemonset-fls4v    1/1     Running   0          4m15s   192.168.240.66    ubuntu-infer   <none>           <none>
kube-system      calico-kube-controllers-8464785d6b-bj4pk   1/1     Running   1          8d      192.168.243.198   ubuntu         <none>           <none>
kube-system      calico-node-bkbvl                          1/1     Running   0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
kube-system      calico-node-bzd7q                          1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      calico-node-fh58s                          1/1     Running   1          8d      10.174.217.96     ubuntu-96      <none>           <none>
kube-system      coredns-6955765f44-4pdhg                   1/1     Running   0          8d      192.168.243.249   ubuntu         <none>           <none>
kube-system      coredns-6955765f44-n9pg4                   1/1     Running   2          8d      192.168.243.237   ubuntu         <none>           <none>
kube-system      etcd-ubuntu                                1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-controller-manager-ubuntu             1/1     Running   4          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-proxy-b5flw                           1/1     Running   1          8d      10.174.217.96     ubuntu-96      <none>           <none>
kube-system      kube-proxy-ttsjp                           1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-proxy-zp9xw                           1/1     Running   0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
kube-system      kube-scheduler-ubuntu                      1/1     Running   4          8d      10.174.217.94     ubuntu         <none>           <none>
npu-exporter     npu-exporter-jwq5l                         1/1     Running   0          9h      192.168.70.80     ubuntu         <none>           <none>
vcjob            mindx-dls-test-default-test-0              1/1     Running   0          4m      192.168.243.198   ubuntu         <none>           <none>
volcano-system   volcano-controllers-7d6d465877-nnf7l       1/1     Running   1          8d      192.168.243.238   ubuntu         <none>           <none>
volcano-system   volcano-scheduler-67f89949b4-ncs8q         1/1     Running   2          8d      192.168.243.211   ubuntu         <none>           <none>
root@ubuntu:~# kubectl get pod --all-namespaces -o wide
NAMESPACE        NAME                                       READY   STATUS    RESTARTS   AGE     IP                NODE           NOMINATED NODE   READINESS GATES
default          hccl-controller-688c7cb8c6-4b88n           1/1     Running   0          8d      192.168.243.199   ubuntu         <none>           <none>
kube-system      ascend-device-plugin-daemonset-8f2dx       1/1     Running   2          8d      192.168.243.218   ubuntu         <none>           <none>
kube-system      ascend-device-plugin-daemonset-f2jk9       1/1     Running   1          8d      192.168.207.49    ubuntu-96      <none>           <none>
kube-system      ascend310-device-plugin-daemonset-fls4v    1/1     Running   0          4m15s   192.168.240.66    ubuntu-infer   <none>           <none>
kube-system      calico-kube-controllers-8464785d6b-bj4pk   1/1     Running   1          8d      192.168.243.198   ubuntu         <none>           <none>
kube-system      calico-node-bkbvl                          1/1     Running   0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
kube-system      calico-node-bzd7q                          1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      calico-node-fh58s                          1/1     Running   1          8d      10.174.217.96     ubuntu-96      <none>           <none>
kube-system      coredns-6955765f44-4pdhg                   1/1     Running   0          8d      192.168.243.249   ubuntu         <none>           <none>
kube-system      coredns-6955765f44-n9pg4                   1/1     Running   2          8d      192.168.243.237   ubuntu         <none>           <none>
kube-system      etcd-ubuntu                                1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-controller-manager-ubuntu             1/1     Running   4          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-proxy-b5flw                           1/1     Running   1          8d      10.174.217.96     ubuntu-96      <none>           <none>
kube-system      kube-proxy-ttsjp                           1/1     Running   3          8d      10.174.217.94     ubuntu         <none>           <none>
kube-system      kube-proxy-zp9xw                           1/1     Running   0          8m16s   10.174.216.214    ubuntu-infer   <none>           <none>
kube-system      kube-scheduler-ubuntu                      1/1     Running   4          8d      10.174.217.94     ubuntu         <none>           <none>
npu-exporter     npu-exporter-jwq5l                         1/1     Running   0          9h      192.168.70.80     ubuntu         <none>           <none>
vcjob            mindx-dls-test-default-test-0              1/1     Running   0          3m      192.168.243.198   ubuntu         <none>           <none>
vcjob            mindx-dls-test-default-test-1              1/1     Running   0          3m      192.168.243.199   ubuntu         <none>           <none>
volcano-system   volcano-controllers-7d6d465877-nnf7l       1/1     Running   1          8d      192.168.243.238   ubuntu         <none>           <none>
volcano-system   volcano-scheduler-67f89949b4-ncs8q         1/1     Running   2          8d      192.168.243.211   ubuntu         <none>           <none>
kubectl logs -n [Pod运行namespace] [Pod 名称]
例如:
kubectl logs -n vcjob mindx-dls-test-default-test-0
kubectl describe nodes
Allocated resources的huawei.com/Ascend910字段表示已使用的NPU芯片数量。
root@ubuntu:/home/test/yaml# kubectl describe nodes
Name:               ubuntu
Roles:              master,worker
Labels:             accelerator=huawei-Ascend910
                    beta.kubernetes.io/arch=arm64
                    beta.kubernetes.io/os=linux
                    host-arch=huawei-arm
                    kubernetes.io/arch=arm64
                    kubernetes.io/hostname=ubuntu
                    kubernetes.io/os=linux
                    masterselector=dls-master-node
                    node-role.kubernetes.io/master=
                    node-role.kubernetes.io/worker=worker
                    workerselector=dls-worker-node
Annotations:        kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock
                    node.alpha.kubernetes.io/ttl: 0
                    projectcalico.org/IPv4Address: XXX.XXX.XXX.XXX/23
                    projectcalico.org/IPv4IPIPTunnelAddr: 192.168.243.192
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Mon, 28 Sep 2020 14:36:54 +0800
...
Capacity:
  cpu:                   192
  ephemeral-storage:     1537233808Ki
  huawei.com/Ascend910:  8
  hugepages-2Mi:         0
  memory:                792307468Ki
  pods:                  110
Allocatable:
  cpu:                   192
  ephemeral-storage:     1416714675108
  huawei.com/Ascend910:  8
  hugepages-2Mi:         0
  memory:                792205068Ki
  pods:                  110
...
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource              Requests         Limits
  --------              --------         ------
  cpu                   37250m (19%)     37500m (19%)
  memory                117536Mi (15%)   119236Mi (15%)
  ephemeral-storage     0 (0%)           0 (0%)
  huawei.com/Ascend910  1                1
Events:                 <none>
Allocated resources中huawei.com/Ascend910字段的值为1,表明本次训练使用了1个NPU芯片。
root@ubuntu:/home/test/yaml# kubectl describe nodes
Name:               ubuntu
Roles:              master,worker
Labels:             accelerator=huawei-Ascend910
                    beta.kubernetes.io/arch=arm64
                    beta.kubernetes.io/os=linux
                    host-arch=huawei-arm
                    kubernetes.io/arch=arm64
                    kubernetes.io/hostname=ubuntu
                    kubernetes.io/os=linux
                    masterselector=dls-master-node
                    node-role.kubernetes.io/master=
                    node-role.kubernetes.io/worker=worker
                    workerselector=dls-worker-node
Annotations:        kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock
                    node.alpha.kubernetes.io/ttl: 0
                    projectcalico.org/IPv4Address: XXX.XXX.XXX.XXX/23
                    projectcalico.org/IPv4IPIPTunnelAddr: 192.168.243.192
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Mon, 28 Sep 2020 14:36:54 +0800
...
Capacity:
  cpu:                   192
  ephemeral-storage:     1537233808Ki
  huawei.com/Ascend910:  8
  hugepages-2Mi:         0
  memory:                792307468Ki
  pods:                  110
Allocatable:
  cpu:                   192
  ephemeral-storage:     1416714675108
  huawei.com/Ascend910:  8
  hugepages-2Mi:         0
  memory:                792205068Ki
  pods:                  110
...
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource              Requests         Limits
  --------              --------         ------
  cpu                   37250m (19%)     37500m (19%)
  memory                117536Mi (15%)   119236Mi (15%)
  ephemeral-storage     0 (0%)           0 (0%)
  huawei.com/Ascend910  8                8
Events:                 <none>
Allocated resources中huawei.com/Ascend910字段的值为8,表明分布式训练使用了该节点上的全部8个NPU芯片。
本例中使用kubectl describe pod mindx-dls-test-default-test-0 -n vcjob命令查看运行Pod的情况。
Annotations字段表示使用的NPU芯片信息。
root@ubuntu:/home/test/yaml# kubectl describe pod mindx-dls-test-default-test-0 -n vcjob
Name:         mindx-dls-test-default-test-0
Namespace:    vcjob
Priority:     0
Node:         ubuntu/XXX.XXX.XXX.XXX
Start Time:   Wed, 30 Sep 2020 15:38:22 +0800
Labels:       app=tf
              ring-controller.atlas=ascend-910
              volcano.sh/job-name=mindx-dls-test
              volcano.sh/job-namespace=vcjob
Annotations:  atlas.kubectl.kubernetes.io/ascend-910-configuration:
                {"pod_name":"0","server_id":"XXX.XXX.XXX.XXX","devices":[{"device_id":"3","device_ip":"192.168.20.102"}...
              cni.projectcalico.org/podIP: 192.168.243.195/32
              cni.projectcalico.org/podIPs: 192.168.243.195/32
              huawei.com/Ascend910: Ascend910-3
              huawei.com/AscendReal: Ascend910-3
              huawei.com/kltDev: Ascend910-3
              predicate-time: 18446744073709551615
              scheduling.k8s.io/group-name: mindx-dls-test
              volcano.sh/job-name: mindx-dls-test
              volcano.sh/job-version: 0
              volcano.sh/task-spec: default-test
Status:       Running
root@ubuntu:/home/test/yaml# kubectl describe pod mindx-dls-test-default-test-0 -n vcjob
Name:         mindx-dls-test-default-test-0
Namespace:    vcjob
Priority:     0
Node:         ubuntu/XXX.XXX.XXX.XXX
Start Time:   Wed, 30 Sep 2020 15:38:22 +0800
Labels:       app=tf
              ring-controller.atlas=ascend-910
              volcano.sh/job-name=mindx-dls-test
              volcano.sh/job-namespace=vcjob
Annotations:  atlas.kubectl.kubernetes.io/ascend-910-configuration:
                {"pod_name":"0","server_id":"XXX.XXX.XXX.XXX","devices":[{"device_id":"0","device_ip":"192.168.20.100"}...
              cni.projectcalico.org/podIP: 192.168.243.195/32
              cni.projectcalico.org/podIPs: 192.168.243.195/32
              huawei.com/Ascend910: Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7
              huawei.com/AscendReal: Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7,Ascend910-0
              huawei.com/kltDev: Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6,Ascend910-7,Ascend910-0,Ascend910-1,Ascend910-2
              predicate-time: 18446744073709551615
              scheduling.k8s.io/group-name: mindx-dls-test
              volcano.sh/job-name: mindx-dls-test
              volcano.sh/job-version: 0
              volcano.sh/task-spec: default-test
Status:       Running