昇腾社区首页
中文
注册
开发者
下载

故障配置说明

对芯片故障进行分级处理时,Container Manager组件会先获取当前故障的故障码,再根据该故障码对应的处理级别对故障进行相应处理。

默认故障码配置

Container Manager启动后,默认将如下配置作为当前故障处理的依据:
{
  "NotHandleFaultCodes":[
    "80E21007","80E38003","80F78006","80C98006","80CB8006","81318006","80A18006","80A18005","80FB8000","8C1F8609",
    "80CD8006","80CD8003","80A38006","80A38003","80A58006","80A58003","80DE1805","80F18006","80F18003","80DF8006",
    "80E01805","80E18400","80E01809","80E18401","80E00209","80F38006","80F38003","80E18006","80D38009","819B800D",
    "80DD8008","80DD8007","80B98006","80BD8006","819B8006","80DE1803","819D8000","81998006","81978006","81978004",
    "815F8006","815F8004","81338006","81338004","817F8006","817F8004","816F8006","816F8004","814F8006","814F8004",
    "81938006","81938004","81478006","81478004","813B8006","813B8004","81578006","81578004","81958006","81958004",
    "81078603","8C2FA009","A4025021","A60250C1","A4025081","A214000D","A414000D","A4028801","A4025101","A2140007",
    "A4140007","A2140008","A4140008","A40250E1","A214000A","A414000A","A4025061","A4025041","A214000B","A414000B",
    "A414000C","A2140009","A4140009","A4303002","80B78006","80B78005","80E1800F","80DE0200","814D8006","8C1F860B",
    "8C1F8608","4C1F8608","819B8003","80DF8401","80DF8400","80818200","80818201","80818202","80818203","80818204",
    "80818205","80F38009","81A3880C","81AD8605","80E20207","81078605","80DE0207","8C2FA001","819B8605","80818C06",
    "8C1F860A","80E18405"
  ],
  "RestartRequestCodes":[
    "80C98008","80C98002","80C98003","80C98009","80CB8002","80CB8008","80CB8009","80CF8003","81318008","80D58000",
    "80D58009","80D98008","80DB800A","80DB8000","80DD8000","80DD8003","80C98000","81AB800D","81AB8003","80BD8000",
    "80BB8009","80BD8003","80BD8009","80BB8000","80BB8003","80BB8008","80BB800A","81AB8008","80C9800A","80CB800A"
  ],
  "RestartBusinessCodes":[
    "8C204E00","A8028802","A4302003","A4302004","A4302005","A4302006","A4302009","A430200A","A6301002","B4060011",
    "B406009C","B4060008","B4060009","B406000E","A60250A1","A2301001","A2301002","A2303001","B4060006","B4060007",
    "B406000D","B4060014","B4060010","80E01801","81B38009","81B38004"
  ],
  "FreeRestartNPUCodes":[
    "8C0E4E00","8C104E00","8C0C4E00","8C044E00","8C064E00","8C17A005","8C1DA005","8C19A005","8C0A4E00","8C084E00",
    "A4193217","A4193218","A42A0000","A42F3917","A42F3918","8C464E00","8C124E00"
  ],
  "RestartNPUCodes":[
    "8C03A000","8C1FA006","40F84E00","80E24E00","80E21E01","80E38008","80E3A202","80E3A203","80E39200","819B800A",
    "80E2120D","80E78000","80E78008","80FA4E00","812E4E00","80C78008","80F78009","80F78008","80F78003","80E18404",
    "80FB8005","80A18008","80CD8008","80A38008","80A58008","80DE1801","80F18008","80F18000","80F1800A","80CF8000",
    "80DF8000","80DF8009","80DF8008","80DF800A","80F38008","80F2180D","80E18005","80E18008","80E1800A","812F8000",
    "80B98000","80B98008","80BD8008","80CB8001","81998009","81998008","81978008","815F8008","81338008","817F8008",
    "81478008","813B8008","81578008","81958008","A2141004","A2141006","A2142004","A2142006","A2145004","A4183200",
    "A6023001","A6023002","A6023003","A6023004","A6060000","A6060001","A6060002","A6060003","A6060004","A6060005",
    "A606000A","A606000B","A606000C","A606000F","A606009D","A6060FFF","A607FFFF","A6140001","A6140002","A6140003",
    "A6140004","A6140005","A6140006","A6141003","A6142003","A6143003","A6144003","A6145003","A6192D15","A6193206",
    "A6193215","A6193248","A62F3905","A62FFFFF","A6303003","A6303004","A6360000","A6361000","A6362000","A8021004",
    "A8060FFF","A807FFFF","A82A0000","80B78000","80B58000","81498004","80F78C02","80F78C03","80F78C04","81B38008",
    "80E18000","80E21008","80C98001","80E58005","80E58009","80E58E02","80E58E03","816F8008","814F8008","81938008",
    "80E44E00","80CF8009","80CF8008","813B8002","81338002","81578002","81958002","81938002","81478002","81978002",
    "815F8002","81C9800A","81C7800A","81C5800A","813F800A","8139800A","8145800A","8C4BA00C","80E3A207"
  ],
  "SeparateNPUCodes":[
    "80E3A201","80E18402","80E0020B","817F8002","816F8002","814F8002","9419321B","A2301000","A2301001","A2302001",
    "A4192C1A","A4193216","A419321B","A419321C","A42F390F","A42F3916","A42F391A","A6183207","A62F3934","A8028801",
    "A819320F","A8193234","A8193235","80818C05","80DF8402","80818C00","4C4BA00C"
  ]
}

故障码级别说明

Container Manager从驱动获取到芯片故障码后,根据故障码对设备及业务的影响将故障划分为以下六种级别,详细说明请参见表1。若用户需要修改故障码的故障级别,请参见“(可选)配置芯片故障级别”。

表1 故障级别及处理说明

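| 故障级别 | NPU复位策略 | 容器处理策略 |
| --- | --- | --- |
| NotHandleFault | 对业务无影响的故障,无需处理。 | 暂不处理。 |
| RestartRequest | Container Manager在故障持续60秒后,将故障芯片和关联芯片加入到待复位芯片缓存中。待复位芯片缓存的复位逻辑详情请参见“故障处理”。 | 当命令run的启动参数“-ctrStrategy”配置为“singleRecover”或者“ringRecover”时,开启容器启停功能。两个配置参数的差异请参见表1。 |
| RestartBusiness | 同RestartRequest:Container Manager在故障持续60秒后,将故障芯片和关联芯片加入到待复位芯片缓存中。待复位芯片缓存的复位逻辑详情请参见“故障处理”。 | 同RestartRequest:当命令run的启动参数“-ctrStrategy”配置为“singleRecover”或者“ringRecover”时,开启容器启停功能。两个配置参数的差异请参见表1。 |
| FreeRestartNPU | Container Manager收到故障后,立即将故障芯片和关联芯片加入到待复位芯片缓存中。待复位芯片缓存的复位逻辑详情请参见“故障处理”。 | 同RestartRequest:当命令run的启动参数“-ctrStrategy”配置为“singleRecover”或者“ringRecover”时,开启容器启停功能。两个配置参数的差异请参见表1。 |
| RestartNPU | 同FreeRestartNPU:Container Manager收到故障后,立即将故障芯片和关联芯片加入到待复位芯片缓存中。待复位芯片缓存的复位逻辑详情请参见“故障处理”。 | 同RestartRequest:当命令run的启动参数“-ctrStrategy”配置为“singleRecover”或者“ringRecover”时,开启容器启停功能。两个配置参数的差异请参见表1。 |
| SeparateNPU | 故障无法通过复位恢复,需要隔离芯片。 | 暂不处理。 |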