健康探针配置样例

本章节提供了健康探针示例脚本（health_probe.sh），用于在服务高压的场景下检查服务是否正常，如果健康状态查询接口响应超时，可使用以下两种健康检查方案。

基于Kubernetes平台的健康检查方案，用户需提前了解Kubernetes探针的使用方法，详情请参见Kubernetes官网链接。此方案需要用户在创建Pod时配置健康探针，具体请参见脚本介绍~基于Kubernetes探针配置样例。
非Kubernetes平台的健康检查方案，该方案提供了可执行脚本，用户可基于执行脚本的返回结果，判断服务是否健康，详情请参见非Kubernetes场景探针配置样例。

原理介绍

通过调用health/timed-${TIMEOUT}的接口，并结合NPU占用率信息，判断服务是否正常。

限制与约束

Atlas 800I A2 推理服务器和Atlas 800I A3 超节点服务器支持该检查方式。
支持PD混部和单机部署场景。

脚本介绍

健康探针示例脚本health_probe.sh如下所示，关键参数配置参见表1。

#!/bin/bash
#
# health_probe.sh - sample health probe for the MindIE service.
#
# Queries the management-plane endpoint /health/timed-3 and, in parallel,
# samples the AICore usage of the NPU. The service is reported unhealthy only
# when the endpoint does not answer with a healthy status AND the AICore usage
# stays below lower_limit in two consecutive sampling rounds.
#
# Output: prints 200 and exits 0 when the service is considered healthy;
#         prints 501 and exits 1 otherwise.

#######################################################################################
# Check /health/timed-3
#######################################################################################

# MindIE service configuration file; the management port is read from it.
config_file="/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json"
management_port=$(grep '"managementPort"' "$config_file" | sed 's/[^0-9]*//g')
# 127.0.0.2 must match the management-plane IP configured for the service.
HEALTH_CHECK_URL="https://127.0.0.2:$management_port/health/timed-3"

# Every run gets its own response file. The liveness and readiness probes both
# execute this script, so a single fixed file name could be overwritten by a
# concurrent run before it is read back.
response_file=$(mktemp "${TMPDIR:-/tmp}/health_response.XXXXXX") || response_file=~/health_response.$$
trap 'rm -f -- "$response_file"' EXIT

# Issue the request in the background so that NPU sampling overlaps with it.
# --write-out appends "HTTPSTATUS:<code>" after the body; -m 3 caps the request at 3 seconds.
curl --silent --write-out "HTTPSTATUS:%{http_code}" -m 3 "$HEALTH_CHECK_URL" > "$response_file" &
curl_pid=$!


#######################################################################################
# Check npu-smi info
#######################################################################################

# NPU id is taken from the first field of the second line of ~/device_info.
# If that file does not exist, set the id directly, e.g. npu_id=0.
npu_id=$(awk 'NR==2 {print $1}' ~/device_info)
# AICore usage threshold (percent); readings below it count as "idle".
lower_limit=10

#######################################
# Sample the AICore usage several times and print the highest reading.
# Globals:   npu_id (read)
# Arguments: $1 - number of samples to take (0.1 s pause between samples)
# Outputs:   highest integer usage seen on stdout; 0 when nothing could be read
#######################################
sample_max_aicore_usage() {
    local samples=$1
    local peak=0
    local reading i

    for ((i = 0; i < samples; i++)); do
        reading=$(npu-smi info -t usages -i "$npu_id" \
            | grep 'Aicore Usage Rate(%)' | awk '{print $NF}' | tr -d '%' | sort -nr | head -1)

        # Ignore empty or non-integer readings instead of feeding them to an
        # arithmetic comparison; 10# prevents values such as "08" being read as octal.
        if [[ "$reading" =~ ^[0-9]+$ ]] && (( 10#$reading > peak )); then
            peak=$(( 10#$reading ))
        fi

        if (( i < samples - 1 )); then
            sleep 0.1
        fi
    done

    printf '%s\n' "$peak"
}

max_aicore_usage=$(sample_max_aicore_usage 4)

if (( max_aicore_usage < lower_limit )); then
    core_abnormal=true
else
    core_abnormal=false
fi


#######################################################################################
# Final conclusion
#######################################################################################

# Block until the background request has finished, then split body and status code.
wait "$curl_pid"
response=$(<"$response_file")
response_body=$(echo "$response" | sed -e 's/HTTPSTATUS\:.*//g')
response_code=$(echo "$response" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')

# The endpoint counts as answered only with status 200 AND the healthy body.
# A string comparison is used so that a malformed status code cannot raise an
# arithmetic error.
if [[ "$response_code" != "200" || "$response_body" != '{"status":"healthy"}' ]]; then
    timed_out=true
else
    timed_out=false
fi

if [[ "$timed_out" == true && "$core_abnormal" == true ]]; then
    # Safety net: before declaring the service unhealthy, sample the AICore
    # usage once more with a larger number of samples.
    max_aicore_usage=$(sample_max_aicore_usage 6)

    if (( max_aicore_usage < lower_limit )); then
        echo 501
        exit 1
    fi
fi

echo 200
exit 0

代码解析如下：

健康接口检查：调用health/timed-x的接口，并记录响应信息至response_file文件。

#######################################################################################
# Check /health/timed-3
#######################################################################################

# Read the management-plane port from the service configuration of the PD
# co-located deployment and store it in management_port.
config_file="/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json"
management_port=$(grep '"managementPort"' "$config_file" | sed 's/[^0-9]*//g')
HEALTH_CHECK_URL="https://127.0.0.2:$management_port/health/timed-3"

# Call the health probe endpoint /health/timed-${TIMEOUT} and write the result
# to response_file.
# 127.0.0.2 must be replaced with the management-plane IP from the service configuration.
# --silent: silent mode.
# --write-out "HTTPSTATUS:%{http_code}": append custom text after the response
#   output, here the HTTP status code; a 200 answer ends with HTTPSTATUS:200.
# -m 3: limit the whole request to 3 seconds so the probe cannot hang.
# A per-run temporary file is used so that concurrent runs of this script
# cannot overwrite each other's response.
response_file=$(mktemp "${TMPDIR:-/tmp}/health_response.XXXXXX") || response_file=~/health_response.$$
trap 'rm -f -- "$response_file"' EXIT
curl --silent --write-out "HTTPSTATUS:%{http_code}" -m 3 "$HEALTH_CHECK_URL" > "$response_file" &

AICore使用率检查：检查NPU的AICore使用率，并判断是否低于限定值。

#######################################################################################
# Check npu-smi info
#######################################################################################

npu_id=$(awk 'NR==2 {print $1}' ~/device_info)   # NPU id: first field of line 2 of ~/device_info
max_aicore_usage=0                               # highest AICore usage seen so far
num_samples=4                                    # number of samples to take
lower_limit=10                                   # AICore usage threshold; 10 in this sample

# Sampling loop: take num_samples readings and keep the largest one.
for ((i=0; i<num_samples; i++)); do
    # npu-smi info -t usages: query the usage information of the NPU
    # -i "$npu_id": the NPU id to query
    output=$(npu-smi info -t usages -i "$npu_id")

    # Extract the AICore usage:
    # grep 'Aicore Usage Rate(%)': keep only the "Aicore Usage Rate(%)" lines
    # awk '{print $NF}' | tr -d '%': take the last field and strip the percent sign
    # sort -nr | head -1: keep the largest value when several lines match
    aicore_usage=$(echo "$output" | grep 'Aicore Usage Rate(%)' | awk '{print $NF}' | tr -d '%' | sort -nr | head -1)

    # Update the maximum only for a plain integer reading; anything else
    # (empty, text, decimals) is skipped rather than passed to an arithmetic
    # comparison. 10# prevents values such as "08" being read as octal.
    if [[ "$aicore_usage" =~ ^[0-9]+$ ]] && (( 10#$aicore_usage > max_aicore_usage )); then
        max_aicore_usage=$(( 10#$aicore_usage ))
    fi
    # Pause 0.1 s between samples so that readings are not taken back to back.
    if (( i < num_samples - 1 )); then
        sleep 0.1
    fi
done

# Flag the core as abnormal when the highest reading is below the threshold.
if (( max_aicore_usage < lower_limit )); then
    core_abnormal=true
else
    core_abnormal=false
fi

信息汇总判断：如果健康接口返回异常，但AICore使用率高于限定值，则认为服务正常；如果健康接口返回异常且AICore使用率低于限定值，则认为服务异常。

#######################################################################################
# Final conclusion
#######################################################################################

# Block until the background HTTP request has finished before using its result.
wait $!
# Read the stored response.
response=$(<"$response_file")
# Response body: everything before the HTTPSTATUS marker.
response_body=$(echo "$response" | sed -e 's/HTTPSTATUS\:.*//g')
# Response code: everything after the HTTPSTATUS marker.
response_code=$(echo "$response" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')

# timed_out is false only when the status code is 200 AND the body is
# '{"status":"healthy"}'; any other result sets it to true.
# A string comparison is used so that a malformed status code cannot raise an
# arithmetic error.
if [[ "$response_code" != "200" || "$response_body" != '{"status":"healthy"}' ]]; then
    timed_out=true
else
    timed_out=false
fi

# Decide whether the service is healthy. It is healthy when either
#   1. the endpoint answered with status 200 and the healthy body, or
#   2. the AICore usage reached the threshold.
if [[ "$timed_out" == true && "$core_abnormal" == true ]]; then
    max_aicore_usage=0
    num_samples=6

    # Safety net: the service may be faulty, so sample the AICore usage again,
    # this time taking num_samples readings.
    for ((i=0; i<num_samples; i++)); do
        output=$(npu-smi info -t usages -i "$npu_id")

        aicore_usage=$(echo "$output" | grep 'Aicore Usage Rate(%)' | awk '{print $NF}' | tr -d '%' | sort -nr | head -1)

        # Only plain integer readings take part in the comparison.
        if [[ "$aicore_usage" =~ ^[0-9]+$ ]] && (( 10#$aicore_usage > max_aicore_usage )); then
            max_aicore_usage=$(( 10#$aicore_usage ))
        fi

        if (( i < num_samples - 1 )); then
            sleep 0.1
        fi
    done

    # The endpoint is already known to have failed at this point, so the second
    # sampling round alone decides: still below the threshold means unhealthy.
    if (( max_aicore_usage < lower_limit )); then
        echo 501
        exit 1
    else
        echo 200
        exit 0
    fi
else
    echo 200
    exit 0
fi

表1 health_probe.sh关键参数
参数	说明
config_file	容器中配置MindIE服务的配置文件（config.json）路径，默认值为：/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json。该配置文件详情请参见《MindIE LLM开发指南》中的“核心概念与配置 > 配置参数说明（服务化）”章节。
HEALTH_CHECK_URL	调用健康探针接口的URL（/health/timed-${TIMEOUT}），该接口的使用详情请参见健康探针接口，接口中的IP地址需设置为服务的管理面IP，默认值为127.0.0.2。
npu_id	被监测的NPU卡号，首先从~/device_info文件读取NPU信息，如果该文件不存在，可配置为0、1等。例如：npu_id=0。
lower_limit	AICore使用率的判断阈值，默认值为10。如果/health/timed-${TIMEOUT}接口未正常返回且AICore使用率小于该值，则认为服务健康状态异常；如果AICore使用率大于等于该值，即使接口未正常返回也认为服务正常。

基于Kubernetes探针配置样例

存活探针（livenessProbe）/就绪探针（readinessProbe）：使用/{健康探针示例脚本所在目录}/health_probe.sh作为存活探针和就绪探针，周期/超时时间大于等于5秒。
启动探针（startupProbe）：直接调用/health/timed-${TIMEOUT}接口，周期/超时时间大于等于5秒。
port：为MindIE的管理面端口，即MindIE的config.json配置文件中“managementPort”字段的值。

如果Kubernetes版本太低无法配置启动探针，建议为存活/就绪探针配置“initialDelaySeconds”字段，其值需要大于模型启动时间。
Kubernetes探针的具体使用方式，详情请参考Kubernetes官网链接。

以下代码为Kubernetes存活、就绪和启动探针的配置样例，其中，探针的周期和超时时间根据实际情况调整。

# Liveness probe: runs the sample script inside the container.
livenessProbe:
  exec:
    command:
      - bash
      - -c
      - "/{健康探针示例脚本所在目录}/health_probe.sh"
  periodSeconds: 5
  timeoutSeconds: 5
  # Enable when no startup probe can be configured; the value must be larger
  # than the model start-up time.
  # initialDelaySeconds: 180
# Readiness probe: same script and timing as the liveness probe.
readinessProbe:
  exec:
    command:
      - bash
      - -c
      - "/{健康探针示例脚本所在目录}/health_probe.sh"
  periodSeconds: 5
  timeoutSeconds: 5
  # initialDelaySeconds: 180
# Startup probe: calls the timed health endpoint directly.
startupProbe:
  httpGet:
    # Must be the /health/timed-${TIMEOUT} endpoint, written with a leading slash.
    path: /health/timed-3
    # MindIE management-plane port ("managementPort" in config.json).
    port: 1026
    # NOTE(review): the shell samples call this endpoint over https; if TLS is
    # enabled on the management plane, add `scheme: HTTPS` here - confirm.
  periodSeconds: 5
  # Without this, Kubernetes applies its 1-second default, which is shorter
  # than the 3-second budget of the timed-3 endpoint.
  timeoutSeconds: 5
  # Raise so that failureThreshold x periodSeconds exceeds the model start-up time.
  # failureThreshold: 36

非Kubernetes场景探针配置样例

没有安装Kubernetes的场景中，用户可以直接调用如下Shell脚本来检测MindIE服务的健康状态，配置完成后直接通过bash命令执行即可。

脚本会周期性地调用health/timed-${TIMEOUT}接口，结合NPU使用率信息，判断服务是否正常。按默认采样次数计算，脚本执行周期约为12秒（受采样次数影响），检测逻辑如下：

服务正常：返回退出码0，记录运行状态日志。
服务异常：返回退出码1，记录异常原因日志。
脚本执行周期计算公式：执行周期（秒）=健康接口采样次数 * 4 + NPU使用率采样次数 * 0.1

本样例仅提供了服务状态的查询能力，业务系统需根据查询结果判断是否进行下一步处理。

Shell脚本如下所示，请重点关注加粗内容：

#!/bin/bash

# Log file (absolute path). It is anchored to the directory that contains this
# script, so the log always ends up next to the script regardless of the
# directory the script is started from.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || script_dir=$PWD
LOG_FILE="$script_dir/monitor.log"

# URL of the service health check endpoint.
HEALTH_CHECK_URL="https://127.0.0.2:1026/health/timed-3"

# Sample counts; the script judges the service by the best result among the samples.
max_retries=3  # health endpoint samples; each sample takes at most 4 seconds
num_samples=4  # NPU usage samples; each sample takes 0.1 seconds

#######################################
# Append one time-stamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" appended to LOG_FILE
#######################################
log_message() {
    local now
    now=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$now" "$1" >> "$LOG_FILE"
}

#######################################
# Check the service through the health endpoint, falling back to NPU usage.
# Globals:   HEALTH_CHECK_URL, max_retries (read)
# Arguments: none
# Returns:   0 when the endpoint answers 200 with '{"status":"healthy"}' in
#            one of max_retries attempts, or when check_aicore_usage succeeds
#            after all attempts failed; 1 otherwise
#######################################
check_service_health() {
    local response
    local http_code
    local content
    local retry_count=0

    # Retry loop: up to max_retries requests, pausing 1 second after each failure.
    while (( retry_count < max_retries )); do
        # -m 3 bounds every request to 3 seconds; without it a hung service
        # would block the script indefinitely. Together with the 1-second
        # pause this keeps one sample within about 4 seconds.
        response=$(curl -s -w "\n%{http_code}" -m 3 "$HEALTH_CHECK_URL" 2>/dev/null)
        # The status code is the last line; the body is everything before it.
        http_code=$(tail -n 1 <<<"$response")
        content=$(sed '$d' <<<"$response")

        # Healthy requires both the 200 status code and the expected body.
        if [[ "$http_code" == "200" && "$content" == '{"status":"healthy"}' ]]; then
            return 0
        fi

        retry_count=$((retry_count + 1))
        sleep 1
    done

    # The endpoint never answered as healthy: a busy NPU still counts as a
    # running service.
    if check_aicore_usage; then
        log_message "健康探针未响应，但AICore使用率高于10%，判定服务为正常状态"
        return 0
    fi

    return 1
}

#######################################
# Check whether the NPU AICore is in use.
# Takes num_samples readings of the AICore usage and compares the highest one
# with a 10% threshold.
# Globals:   num_samples (read)
# Arguments: none
# Returns:   0 when the highest reading is at least 10; 1 when it is below 10
#            or no reading could be obtained
#######################################
check_aicore_usage() {
    # NPU id comes from ~/device_info (first field of line 2). If that file
    # does not exist, set the id directly, e.g. 0 or 1.
    local npu_id
    npu_id=$(awk 'NR==2 {print $1}' ~/device_info)
    local max_aicore_usage=0
    local lower_limit=10
    # Loop state is local so that calling this function does not overwrite
    # variables of the caller.
    local i output aicore_usage

    for ((i=0; i<num_samples; i++)); do
        # Extract the AICore usage from the npu-smi report.
        output=$(npu-smi info -t usages -i "$npu_id")
        aicore_usage=$(echo "$output" | grep 'Aicore Usage Rate(%)' | awk '{print $NF}' | tr -d '%' | sort -nr | head -1)
        # Keep the maximum; only plain integer readings take part, so empty or
        # malformed values cannot raise an arithmetic error. 10# prevents
        # values such as "08" being read as octal.
        if [[ "$aicore_usage" =~ ^[0-9]+$ ]] && (( 10#$aicore_usage > max_aicore_usage )); then
            max_aicore_usage=$(( 10#$aicore_usage ))
        fi
        # Pause 0.1 s between samples so that readings are not taken back to back.
        if (( i < num_samples - 1 )); then
            sleep 0.1
        fi
    done

    # Below the 10% threshold the core is treated as abnormal (idle).
    if (( max_aicore_usage < lower_limit )); then
        return 1
    else
        return 0
    fi
}


# Entry point: run the health check once, record the outcome in the log and
# report it through the exit code (0 = healthy, 1 = unhealthy).
if ! check_service_health; then
    # Service is not healthy.
    log_message "服务异常，请检查服务异常原因"
    exit 1
fi

# Service is healthy.
log_message "服务正常运行"
exit 0

父主题： 附录