平台适配
当MindCluster Volcano上报故障后,会在ConfigMap中记录节点故障信息。MindCluster Volcano获取节点故障信息的代码示例如下,若用户想要获取节点故障信息以进行其他业务逻辑处理,可参考该示例实现对应的代码。
import "volcano.sh/volcano/pkg/scheduler/plugins/ascend-volcano-plugin/plugin" func (fNode *FaultNode) setNodeHealthyByNodeD(node *plugin.NPUNode) { if !fNode.isNodeDEnabled(node) { klog.V(util.LogInfoLev).Infof("node %s nodeD not enabled", node.Name) fNode.setNodeDValue(false) return } fNode.setNodeDValue(true) // 1. last node heartbeat update time until now being greater than maxInterval indicates unhealthy if !fNode.isNodeHealthyByHeartbeat() { fNode.setIsFaultNodeValue(true) fNode.setNodeHealthStateValue(NodeUnhealthy) klog.V(util.LogInfoLev).Infof("Node %s health state set %s for wrong heartbeat", node.Name, NodeUnhealthy) } }
父主题: 节点故障