平台适配
当MindCluster Volcano上报故障后，会在ConfigMap中记录节点故障信息。MindCluster Volcano获取节点故障信息的代码示例如下，若用户需要获取节点故障信息以进行其他业务逻辑处理，可参考该示例实现对应的代码。
import "volcano.sh/volcano/pkg/scheduler/plugins/ascend-volcano-plugin/plugin"
// setNodeHealthyByNodeD records whether nodeD is enabled for node and, when it
// is, updates the fault node's health state from the heartbeat check.
//
// If nodeD is not enabled, the nodeD flag is set to false and the heartbeat is
// not consulted. Otherwise the flag is set to true, and a failed heartbeat
// check marks this node as a fault node with state NodeUnhealthy.
func (fNode *FaultNode) setNodeHealthyByNodeD(node *plugin.NPUNode) {
	nodeDEnabled := fNode.isNodeDEnabled(node)
	if !nodeDEnabled {
		klog.V(util.LogInfoLev).Infof("node %s nodeD not enabled", node.Name)
	}
	fNode.setNodeDValue(nodeDEnabled)

	// Stop here when nodeD is off (heartbeat is not evaluated in that case) or
	// when the heartbeat check passes; only a failed heartbeat changes state.
	if !nodeDEnabled || fNode.isNodeHealthyByHeartbeat() {
		return
	}

	// The heartbeat check failed: flag the node as faulty and unhealthy.
	fNode.setIsFaultNodeValue(true)
	fNode.setNodeHealthStateValue(NodeUnhealthy)
	klog.V(util.LogInfoLev).Infof("Node %s health state set %s for wrong heartbeat", node.Name, NodeUnhealthy)
}
父主题: 节点故障