kubectl get pod --all-namespaces -o wide
root@ubuntu:~# kubectl get pod --all-namespaces -o wide NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES ... default default-test-pytorch-master-0 1/1 Running 0 4m 192.168.243.198 ubuntu <none> <none> ...
root@ubuntu:~# kubectl get pods -A -o wide NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES ... default default-test-pytorch-master-0 1/1 Running 0 11h 192.168.70.69 ubuntu-155 <none> <none> default default-test-pytorch-worker-0 1/1 Running 0 11h 192.168.41.41 ubuntu-173 <none> <none> ...
kubectl describe nodes {任务运行节点的节点名}
root@ubuntu:/home/test/yaml# kubectl describe nodes Name: ubuntu Roles: master,worker Labels: accelerator=huawei-Ascend910 ... Allocated resources: (Total limits may be over 100 percent, i.e., overcommitted.) Resource Requests Limits -------- -------- ------ cpu 37250m (19%) 37500m (19%) memory 117536Mi (15%) 119236Mi (15%) ephemeral-storage 0 (0%) 0 (0%) huawei.com/Ascend910 1 1 Events: <none>
Allocated resources中huawei.com/Ascend910字段的值为1,表明训练任务使用了1个处理器。
root@ubuntu:/home/test/yaml# kubectl describe nodes Name: ubuntu Roles: master,worker Labels: accelerator=huawei-Ascend910 beta.kubernetes.io/arch=arm64 ... Allocated resources: (Total limits may be over 100 percent, i.e., overcommitted.) Resource Requests Limits -------- -------- ------ cpu 37250m (19%) 37500m (19%) memory 117536Mi (15%) 119236Mi (15%) ephemeral-storage 0 (0%) 0 (0%) huawei.com/Ascend910 8 8 Events: <none>
Allocated resources中huawei.com/Ascend910字段的值为8,表明分布式训练任务使用了该节点上的全部8个处理器。
本例中使用kubectl describe pod default-test-pytorch-master-0命令查看运行Pod的情况(分布式训练场景下,将Pod名称替换为对应的worker Pod,例如default-test-pytorch-worker-0)。
root@ubuntu:/home/test/yaml# kubectl describe pod default-test-pytorch-master-0 Name: default-test-pytorch-master-0 Namespace: default Priority: 0 Node: ubuntu-XXX/XXX.XXX.XXX.XXX Start Time: Mon, 27 Feb 2023 22:40:07 +0800 Labels: group-name=mindxdl.gitee.com job-name=default-test-pytorch replica-index=0 replica-type=master training.kubeflow.org/job-name=default-test-pytorch training.kubeflow.org/job-role=master training.kubeflow.org/operator-name=ascendjob-controller training.kubeflow.org/replica-index=0 training.kubeflow.org/replica-type=master Annotations: ascend.kubectl.kubernetes.io/ascend-910-configuration: {"pod_name":"default-test-pytorch-master-0","server_id":"xx.xx.xx.xx","devices":[{"device_id":"1","device_ip":"192.168.101.106"}]} cni.projectcalico.org/podIP: 192.168.70.104/32 cni.projectcalico.org/podIPs: 192.168.70.104/32 huawei.com/AscendReal: Ascend910-1 huawei.com/kltDev: Ascend910-1 Status: Running
root@ubuntu:/home/test/yaml# kubectl describe pod default-test-pytorch-worker-0 Name: default-test-pytorch-worker-0 Namespace: default Priority: 0 Node: ubuntu-XXX/XXX.XXX.XXX.XXX Start Time: Mon, 27 Feb 2023 22:09:57 +0800 Labels: group-name=mindxdl.gitee.com job-name=default-test-pytorch replica-index=0 replica-type=worker training.kubeflow.org/job-name=default-test-pytorch training.kubeflow.org/operator-name=ascendjob-controller training.kubeflow.org/replica-index=0 training.kubeflow.org/replica-type=worker Annotations: ascend.kubectl.kubernetes.io/ascend-910-configuration: {"pod_name":"default-test-pytorch-worker-0","server_id":"xx.xx.xx.xx","devices":[{"device_id":"0","device_ip":"192.168.100.108"},{"device... cni.projectcalico.org/podIP: 192.168.185.221/32 cni.projectcalico.org/podIPs: 192.168.185.221/32 huawei.com/AscendReal: Ascend910-7,Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6 huawei.com/kltDev: Ascend910-7,Ascend910-0,Ascend910-1,Ascend910-2,Ascend910-3,Ascend910-4,Ascend910-5,Ascend910-6 Status: Running