在 Atlas 推理系列产品上部署 Ascend Device Plugin 和 Volcano 组件并开启动态虚拟化开关,下发任务后虚拟设备创建成功,但推理任务运行失败。
推理任务容器使用普通用户运行时,会出现以下问题,最终导致普通用户访问 root 属组的 vNPU 设备失败,从而导致推理业务容器运行失败。
# Excerpt of the device-plugin workload manifest, restored from a collapsed single line.
# It shows where /dev is mounted: once in the container's volumeMounts, once in the pod's volumes.
# NOTE(review): nesting follows the Kubernetes schema (container fields vs. pod-level `volumes`);
# the absolute indentation assumes the usual spec.template.spec layout - confirm against the
# full manifest before pasting.
        command: ["/bin/bash", "-c", "--"]
        args: ["device-plugin -useAscendDocker=true -volcanoType=true -logFile=/var/log/mindx-dl/devicePlugin/devicePlugin.log -logLevel=0"]
        securityContext:
          privileged: true
          readOnlyRootFilesystem: true
        imagePullPolicy: Never
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
          # ... (several fields omitted here)
          - name: tmp
            mountPath: /tmp
          # Mount /dev into the container here.
          - name: dev
            mountPath: /dev
        env:
          # NODE_NAME is filled from the pod's spec.nodeName field.
          - name: NODE_NAME
            valueFrom:
              fieldRef:
                fieldPath: spec.nodeName
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        # ... (several fields omitted here)
        - name: tmp
          hostPath:
            path: /tmp
        # Declare the host /dev directory here; it backs the `dev` volumeMount above.
        - name: dev
          hostPath:
            path: /dev