diff --git a/build/Dockerfile b/build/Dockerfile index bc6d84939633c1a4396ac50a63829070bd3baabb..4ec5916c8c78ca06e42f8b02f5c8ff91df591b7e 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -9,11 +9,13 @@ ENV LD_LIBRARY_PATH /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib6 COPY ./device-plugin /usr/local/bin/ COPY ./faultCode.json /usr/local/ COPY ./faultCustomization.json /usr/local/ +COPY ./SwitchFaultCode.json /usr/local/ RUN chmod 550 /usr/local/bin/device-plugin &&\ chmod 550 /usr/local/bin &&\ chmod 440 /usr/local/faultCode.json &&\ chmod 440 /usr/local/faultCustomization.json &&\ + chmod 440 /usr/local/SwitchFaultCode.json &&\ chmod 750 /home/HwHiAiUser &&\ chmod 750 /home/hwMindX &&\ echo 'umask 027' >> /etc/profile &&\ diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae6f9fee359c53dde6933b42993c45d94837381 --- /dev/null +++ b/build/SwitchFaultCode.json @@ -0,0 +1,19 @@ +{ + "NotHandleFaultCodes":["0000002d"], + "ReportFaultCodes": [ + "00000002", "00000003", "0000520a", "00000055", "00000056", "00000057", "00000058", "00000059", "0000005a", + "0000005b", "0000005c", "000001be"], + "SubHealthFaultCodes": [ + "00000008", "00002712", "00002afa", "0000000c", "0000000d", "0000000f", "00000010", "00000012", "000055f2", + "000059da", "000061aa", "000000d1", "000000d2", "000000d3", "000000d4", "000000d5", "000000d6", "000000d7", + "000000d8", "000000f3", "000000f4", "000000f5", "000000f6", "00000113", "00000114", "00000115", "00000140", + "000001c0", "000001c1"], + "ResetFaultCodes": [ + "0000001b", "0000001c", "00002710", "00002af8", "00002711", "00002af9", "0000002e", "0000003a", "00000025", + "0000003b", "0000003c", "0000003f", "00000040", "00005208", "00005209", "000055f0", "000059d8", "000055f1", + "000059d9", "000061a8", "000061a9", "000000cf", "000000d0", "000000f9", "000000fa", "0000014a", "00000151", + "00000148", "00000007", "00000004", "00000005", "00000006"], + "SeparateFaultCodes": [ + "000000de", "000000ff", "00000116", "00000117", "00000149", "0000014b", "0000014c", "0000014d", "0000014e", + "0000014f", "00001f40", "00001f41"] +} \ No newline at end of file diff --git a/build/build.sh b/build/build.sh index acd78e950f9eec01e07e7adc99fb663aa3c23bbd..592f04c56896fb3734aba5b02e9b07f46d6acb03 100644 --- a/build/build.sh +++ b/build/build.sh @@ -104,6 +104,7 @@ function modify_version() { cp "$CUR_DIR"/faultCode.json "$TOP_DIR"/output/faultCode.json cp "$CUR_DIR"/faultCustomization.json "$TOP_DIR"/output/faultCustomization.json + cp "$CUR_DIR"/SwitchFaultCode.json "$TOP_DIR"/output/SwitchFaultCode.json sed -i "s#output/device-plugin#device-plugin#" "$TOP_DIR"/output/Dockerfile } diff --git a/main.go b/main.go index 2f09202c24aad2597fa99bd48b40643df0d22787..b0cbc2c8b11c48cc1c485b5d934231c9b3d112ef 100644 --- a/main.go +++ b/main.go @@ -25,6 +25,7 @@ import ( "huawei.com/npu-exporter/v6/devmanager" "Ascend-device-plugin/pkg/common" + "Ascend-device-plugin/pkg/device/deviceSwitch" "Ascend-device-plugin/pkg/server" ) @@ -204,6 +205,18 @@ func InitFunction() (*server.HwDevManager, error) { return nil, fmt.Errorf("init device manager failed") } hwlog.RunLog.Info("init device manager success") + common.ParamOption.EnableSwitchFault = true + if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault { + switchDevMgr := deviceSwitch.NewSwitchDevManager() + if err := switchDevMgr.InitSwitchDev(); err != nil { + hwlog.RunLog.Warnf("failed to init switch switch device manager, will not deal with switch fault, "+ + "err: %s", err.Error()) + common.ParamOption.EnableSwitchFault = false + // will not return err, to ensure dp keep running while switch is not reachable + return hdm, nil + } + hdm.SwitchDevManager = switchDevMgr + } return hdm, nil } diff --git a/pkg/common/constants.go b/pkg/common/constants.go index d580943b1d376321c3f94ee224408b9e15da2728..cfc8da479bdc718a3861a1e68df7680cc60eebe2 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -64,6 +64,8 @@ const ( DeviceInfoCMNamePrefix = "mindx-dl-deviceinfo-" // DeviceInfoCMDataKey device info configmap data key DeviceInfoCMDataKey = "DeviceInfoCfg" + //SwitchInfoCMDataKey the key of switch info in device-info configmap + SwitchInfoCMDataKey = "SwitchInfoCfg" // DeviceInfoCMManuallySeparateNPUKey for deviceinfo configmap ManuallySeparateNPU key DeviceInfoCMManuallySeparateNPUKey = "ManuallySeparateNPU" // SlowNodeNoticeCMName the name for slow node notice configmap @@ -622,12 +624,16 @@ const ( PollFaultCodeCMMaxInterval = 3600 // PollFaultCodeCMMinInterval is the min interval(second) of polling fault code CM PollFaultCodeCMMinInterval = 30 + // GetSwitchFaultCodeInterval is the interval(second) of get all fault code by get interface + GetSwitchFaultCodeInterval = 300 // FaultCodeCMName is the name of the configmap that is used to save fault code FaultCodeCMName = "mindx-dl-fault-config" // FaultCodeCMNameSpace is the namespace of the fault code configmap FaultCodeCMNameSpace = "kube-system" // FaultCodeKey is the key to find fault code in cm FaultCodeKey = "faultCode.json" + // SwitchFaultCodeKey is the key of the switch fault code + SwitchFaultCodeKey = "SwitchFaultCode.json" // FaultCustomizationKey is the key to find fault customization in cm FaultCustomizationKey = "faultCustomization.json" // PollIntervalKey is the key to find poll interval in cm diff --git a/pkg/common/device.go b/pkg/common/device.go index 4f8e3e9bf6f73445de77f837bcbe7bb5d5382cb0..a80175372d0d5410beadb9a961ea9814e7dd79d8 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -19,12 +19,41 @@ import ( "fmt" "strconv" "strings" + "sync" + "time" "huawei.com/npu-exporter/v6/common-utils/hwlog" + "huawei.com/npu-exporter/v6/devmanager/common" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) +const ( + // NotHandleFaultLevel NotHandle Fault Level + NotHandleFaultLevel = 0 + // PreSeparateFaultLevel PreSeparate Fault Level + PreSeparateFaultLevel = 1 + // SeparateFaultLevel Separate Fault Level + SeparateFaultLevel = 2 + // NotHandleFaultLevelStr NotHandle Fault Level Str + NotHandleFaultLevelStr = "NotHandle" + // PreSeparateFaultLevelStr PreSeparate Fault Level Str + PreSeparateFaultLevelStr = "PreSeparate" + // SeparateFaultLevelStr Separate Fault Level Str + SeparateFaultLevelStr = "Separate" +) + +var ( + // SwitchFaultLevelMapLock Lock SwitchFaultLevelMap to avoid concurrence write and read + SwitchFaultLevelMapLock sync.Mutex + // SwitchFaultLevelMap record every fault code and it's level + SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize) + // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrence + SwitchFaultLock sync.Mutex + // CurrentSwitchFault store all switch fault which will be reported to device-info configmap + currentSwitchFault = make([]int64, 0, GeneralMapSize) +) + // GetDeviceID get device physical id and virtual by device name func GetDeviceID(deviceName string, ascendRuntimeOptions string) (int, int, error) { // share mode of ascend310 ascend310P:davinci-devID-index, like Ascend310P-0-99 @@ -59,6 +88,59 @@ func GetDeviceID(deviceName string, ascendRuntimeOptions string) (int, int, erro return phyID, virID, nil } +// GetSwitchFaultInfo GetSwitch Fault Info by CurrentSwitchFault and fault config of switch +func GetSwitchFaultInfo() SwitchFaultInfo { + if ParamOption.RealCardType != common.Ascend910A3 || !ParamOption.EnableSwitchFault { + return SwitchFaultInfo{} + } + maxFaultLevel := 0 + SwitchFaultLock.Lock() + defer SwitchFaultLock.Unlock() + SwitchFaultLevelMapLock.Lock() + for _, code := range currentSwitchFault { + level := SwitchFaultLevelMap[code] + if level > maxFaultLevel { + maxFaultLevel = level + } + } + SwitchFaultLevelMapLock.Unlock() + faultLevel, NodeStatus := NotHandleFaultLevelStr, "Healthy" + switch maxFaultLevel { + case NotHandleFaultLevel: + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" + case PreSeparateFaultLevel: + faultLevel, NodeStatus = PreSeparateFaultLevelStr, "SubHealthy" + case SeparateFaultLevel: + faultLevel, NodeStatus = SeparateFaultLevelStr, "UnHealthy" + } + // keep those none zero codes + reportFaultCodes := make([]string, 0) + for _, code := range currentSwitchFault { + if code != 0 { + reportFaultCodes = append(reportFaultCodes, fmt.Sprintf("%08x", code)) + } + } + + return SwitchFaultInfo{ + FaultCode: reportFaultCodes, + FaultLevel: faultLevel, + UpdateTime: time.Now().Unix(), + NodeStatus: NodeStatus, + } +} + +// SetSwitchFaultCode set switch fault code +func SetSwitchFaultCode(newFaults []int64) { + SwitchFaultLock.Lock() + defer SwitchFaultLock.Unlock() + currentSwitchFault = newFaults +} + +// GetSwitchFaultCode get switch fault code +func GetSwitchFaultCode() []int64 { + return currentSwitchFault +} + // GetDeviceListID get device id by input device name func GetDeviceListID(devices []string, ascendRuntimeOptions string) (map[int]int, []int, error) { if len(devices) > MaxDevicesNum { diff --git a/pkg/common/device_test.go b/pkg/common/device_test.go index dfab01c908432f2b4bee7b49948836c4ed8d31d1..5aac88b603dc43e9931e126edb392f9cf71f747c 100644 --- a/pkg/common/device_test.go +++ b/pkg/common/device_test.go @@ -22,10 +22,15 @@ import ( "testing" "github.com/smartystreets/goconvey/convey" + "huawei.com/npu-exporter/v6/devmanager/common" "k8s.io/apimachinery/pkg/util/sets" ) -const cardNum = 2 +const ( + cardNum = 2 + generalFaultCode = 100 + firstFaultIdx = 0 +) // TestToString for test ToString func TestToString(t *testing.T) { @@ -195,3 +200,32 @@ func TestCheckCardUsageMode(t *testing.T) { }) }) } + +// TestGetSwitchFaultInfo test for convert fault code into struct +func TestGetSwitchFaultInfo(t *testing.T) { + convey.Convey("test GetSwitchFaultInfo", t, func() { + ParamOption.RealCardType = common.Ascend910A3 + ParamOption.EnableSwitchFault = true + currentSwitchFault = []int64{} + SwitchFaultLevelMap = map[int64]int{} + convey.Convey("test empty SwitchFaultLevelMap", func() { + currentSwitchFault = append(currentSwitchFault, generalFaultCode) + fault := GetSwitchFaultInfo() + convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) + }) + convey.Convey("test actually level", func() { + currentSwitchFault = append(currentSwitchFault, generalFaultCode) + SwitchFaultLevelMap = map[int64]int{generalFaultCode: NotHandleFaultLevel} + fault := GetSwitchFaultInfo() + convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) + + SwitchFaultLevelMap = map[int64]int{generalFaultCode: PreSeparateFaultLevel} + fault = GetSwitchFaultInfo() + convey.So(fault.FaultLevel == PreSeparateFaultLevelStr, convey.ShouldBeTrue) + + SwitchFaultLevelMap = map[int64]int{generalFaultCode: SeparateFaultLevel} + fault = GetSwitchFaultInfo() + convey.So(fault.FaultLevel == SeparateFaultLevelStr, convey.ShouldBeTrue) + }) + }) +} diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 334cd5917946cfe2ec2a3c61742688038585f643..84a1f7f51c975d38e4ded3d04050330078c22fdb 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -65,6 +65,8 @@ const ( faultCodeFilePath = "/usr/local/faultCode.json" // faultCustomizationFilePath load the path for fault customization faultCustomizationFilePath = "/usr/local/faultCustomization.json" + // switchFaultCodeFilePath is the path for switch fault code file + switchFaultCodeFilePath = "/usr/local/SwitchFaultCode.json" // halfDivisor is the number of 2 halfDivisor = 2 // WaitNpuReadyTime is the time used in waiting for npu ready @@ -81,6 +83,12 @@ const ( var ( faultTypeCode FaultTypeCode + // NotHandleFaultCodes contains all fault code that believed to be not handled, in this case is L1 + NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) + // PreSeparateFaultCodes contains all fault code that believed to be PreSeparate, in this case is L2-L3 + PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) + // SeparateFaultCodes contains all fault code that believed to be Separate, in this case is L4-L5 + SeparateFaultCodes = make([]int64, 0, GeneralMapSize) // initLogicIDs need init fault code device. add by train or inference initLogicIDs []int32 // logicIDLock operate initLogicIDs lock @@ -97,6 +105,8 @@ var ( devFaultInfoMapLock sync.Mutex // SubscribeFailed subscribe failed flag SubscribeFailed bool + // SwitchSubscribeFailed indicate switch fault subscribe failed result, true is subscribe failed + SwitchSubscribeFailed bool // Synchronize used for synchronizing the fault cache between the main process and the grace tolerance coroutines Synchronize bool // manuallySeparateNpuMapLock operate manuallySeparateNpuMap lock @@ -170,6 +180,15 @@ type faultFileInfo struct { SeparateNPUNetworkCodes []string } +// SwitchFaultFileInfo contains all fault code loading from faultconfig configmap or switchfaultconfig.json +type SwitchFaultFileInfo struct { + NotHandleFaultCodes []string + ReportFaultCodes []string + SubHealthFaultCodes []string + ResetFaultCodes []string + SeparateFaultCodes []string +} + // FaultCustomization is the customization info of fault type FaultCustomization struct { GraceTolerance GraceToleranceCustomization @@ -275,6 +294,15 @@ func LoadFaultCodeFromFile() error { return LoadFaultCode(faultCodeBytes) } +// LoadSwitchFaultCodeFromFile load fault code from SwitchFaultCode.json +func LoadSwitchFaultCodeFromFile() error { + switchFaultsBytes, err := utils.LoadFile(switchFaultCodeFilePath) + if err != nil { + return fmt.Errorf("load switch fault code failed: %v", err) + } + return LoadSwitchFaultCode(switchFaultsBytes) +} + // LoadFaultCustomizationFromFile load fault customization from faultCustomization.json func LoadFaultCustomizationFromFile() error { faultCodeBytes, err := utils.LoadFile(faultCustomizationFilePath) @@ -388,6 +416,49 @@ func LoadFaultCustomization(faultCustomizationByte []byte) error { return nil } +// LoadSwitchFaultCode Load SwitchFault Code from bytes of config file or configmap +func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { + var switchFileInfo SwitchFaultFileInfo + if err := json.Unmarshal(switchFaultCodeByte, &switchFileInfo); err != nil { + return fmt.Errorf("failed to unmarsha switch fault code, err: %s", err.Error()) + } + + NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) + PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) + SeparateFaultCodes = make([]int64, 0, GeneralMapSize) + + for _, code := range switchFileInfo.NotHandleFaultCodes { + codeInt64, err := strconv.ParseInt(code, Hex, BitSize) + if err != nil { + hwlog.RunLog.Warnf("failed to parse NotHandleFaultCodes faultcode:%v", code) + continue + } + NotHandleFaultCodes = append(NotHandleFaultCodes, codeInt64) + } + + switchFileInfo.ReportFaultCodes = append(switchFileInfo.ReportFaultCodes, switchFileInfo.SubHealthFaultCodes...) + for _, code := range switchFileInfo.ReportFaultCodes { + codeInt64, err := strconv.ParseInt(code, Hex, BitSize) + if err != nil { + hwlog.RunLog.Warnf("failed to parse PreSeparateFaultCodes:%v", code) + continue + } + PreSeparateFaultCodes = append(PreSeparateFaultCodes, codeInt64) + } + + switchFileInfo.SeparateFaultCodes = append(switchFileInfo.SeparateFaultCodes, switchFileInfo.ResetFaultCodes...) + for _, code := range switchFileInfo.SeparateFaultCodes { + codeInt64, err := strconv.ParseInt(code, Hex, BitSize) + if err != nil { + hwlog.RunLog.Warnf("failed to parse SeparateFaultCodes:%v", code) + continue + } + SeparateFaultCodes = append(SeparateFaultCodes, codeInt64) + } + + return nil +} + func loadFaultDurationCustomization(customization []FaultDurationCustomization) { handledEventId := make(sets.String, common.MaxErrorCodeCount) for _, cus := range customization { diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index cf6d161f0ba66c16910c351cca221d71ed9e79e5..28e27b39471cd4441cb81c85b82d796208c4e065 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -18,6 +18,7 @@ package common import ( "encoding/json" "errors" + "fmt" "strconv" "strings" "testing" @@ -1557,3 +1558,18 @@ func TestGetTimeoutFaultCodes(t *testing.T) { convey.So(GetTimeoutFaultCodes(NetworkFaultMode), convey.ShouldResemble, expectedNetworkFaultCodes) }) } + +// TestLoadSwitchFaultCode Test LoadSwitchFaultCode +func TestLoadSwitchFaultCode(t *testing.T) { + convey.Convey("test LoadSwitchFaultCode", t, func() { + switchFileInfo := SwitchFaultFileInfo{ + NotHandleFaultCodes: []string{fmt.Sprintf("%08x", generalFaultCode)}, + } + bytes, err := json.Marshal(switchFileInfo) + convey.So(err, convey.ShouldBeNil) + err = LoadSwitchFaultCode(bytes) + convey.So(err, convey.ShouldBeNil) + convey.So(len(NotHandleFaultCodes) > 0, convey.ShouldBeTrue) + convey.So(NotHandleFaultCodes[firstFaultIdx] == generalFaultCode, convey.ShouldBeTrue) + }) +} diff --git a/pkg/common/proto.go b/pkg/common/proto.go index af85648725b6fe66a67cb53ccbd06c6a9aa403b4..023cc006db734319f34bb634dd05bc954d48c75b 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -37,6 +37,14 @@ type NodeDeviceInfoCache struct { CheckCode string } +// SwitchFaultInfo Switch Fault Info +type SwitchFaultInfo struct { + FaultCode []string + FaultLevel string + UpdateTime int64 + NodeStatus string +} + // NodeDeviceInfo record node NPU device information. Will be solidified into cm. type NodeDeviceInfo struct { DeviceList map[string]string @@ -113,6 +121,7 @@ type Option struct { RealCardType string // real card type LinkdownTimeout int64 // linkdown timeout duration DealWatchHandler bool // update pod cache when receiving pod informer watch errors + EnableSwitchFault bool // if enable switch faul } // GetAllDeviceInfoTypeList Get All Device Info Type List diff --git a/pkg/device/ascend310_test.go b/pkg/device/ascend310_test.go index 01ccca1f28b976cacf9127734dfc0f57e6ec9838..cf372604af5457078fcac7dd969dbdfc70071af0 100644 --- a/pkg/device/ascend310_test.go +++ b/pkg/device/ascend310_test.go @@ -98,7 +98,7 @@ func TestDoWithVolcanoListAndWatch310(t *testing.T) { }) mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, - deviceInfo map[string]string, manuallySeparateNPU string, superPodID, + deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { return &common.NodeDeviceInfoCache{}, nil }) diff --git a/pkg/device/ascend310p_test.go b/pkg/device/ascend310p_test.go index ab619ade38a648e3b3412f0b1b5985711ce5dfd5..bab53b2ec4ccebb435da82517d381a2e29acf3ce 100644 --- a/pkg/device/ascend310p_test.go +++ b/pkg/device/ascend310p_test.go @@ -73,7 +73,7 @@ func TestDoWithVolcanoListAndWatch310p(t *testing.T) { }) mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, - deviceInfo map[string]string, manuallySeparateNPU string, superPodID, + deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { return &common.NodeDeviceInfoCache{}, nil }) diff --git a/pkg/device/ascend910_test.go b/pkg/device/ascend910_test.go index d6e24b4374beba8cf8192ce43047d97fdba99d9e..9241d89ae0da36c6818ddabe56fd0ab2929258af 100644 --- a/pkg/device/ascend910_test.go +++ b/pkg/device/ascend910_test.go @@ -109,7 +109,7 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { }) mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, - deviceInfo map[string]string, manuallySeparateNPU string, superPodID, + deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { return &common.NodeDeviceInfoCache{}, nil }) diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 110ef22a559bd362947541debeaf24d81d9b2b82..08c7848f903acc31b6a1b0aa0f41f3ec5ae12eac 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -35,6 +35,7 @@ import ( "k8s.io/utils/strings/slices" "Ascend-device-plugin/pkg/common" + "Ascend-device-plugin/pkg/device/deviceSwitch" "Ascend-device-plugin/pkg/kubeclient" ) @@ -229,8 +230,18 @@ func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, tool.delVirDevInfo(newDeviceList) manuallySeparateNPU := tool.handleManuallySeparateNPUFaultInfo() + // if subscribe failed, will use get interface + if common.SwitchSubscribeFailed && common.ParamOption.EnableSwitchFault { + var err error + newFaults, err := deviceSwitch.GetSwitchFaults() + common.SetSwitchFaultCode(newFaults) + if err != nil { + hwlog.RunLog.Error("failed to query all fault codes of switch") + } + } + switchFaultInfo := common.GetSwitchFaultInfo() - if err := tool.client.WriteDeviceInfoDataIntoCMCache(newDeviceList, manuallySeparateNPU, + if err := tool.client.WriteDeviceInfoDataIntoCMCache(newDeviceList, manuallySeparateNPU, switchFaultInfo, tool.GetSuperPodID(), tool.GetServerIndex()); err != nil { hwlog.RunLog.Errorf("write device info failed: %v", err) return false, nil diff --git a/pkg/device/deviceSwitch/ascendSwitch.go b/pkg/device/deviceSwitch/ascendSwitch.go new file mode 100644 index 0000000000000000000000000000000000000000..4c93f061832beb48f0247535a0ad471226d0610a --- /dev/null +++ b/pkg/device/deviceSwitch/ascendSwitch.go @@ -0,0 +1,294 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package deviceSwitch functions of getting switch faults code +package deviceSwitch + +import ( + "context" + "fmt" + "sync" + "time" + "unsafe" + + "huawei.com/npu-exporter/v6/common-utils/hwlog" + "huawei.com/npu-exporter/v6/common-utils/utils" + devmanagercommon "huawei.com/npu-exporter/v6/devmanager/common" + + "Ascend-device-plugin/pkg/common" +) + +/* + #cgo LDFLAGS: -ldl + #cgo CFLAGS: -I/usr/local/Ascend/driver + + #include + #include + #include + #include + + #include "library.h" + + void *dcmiHandle; + #define SO_NOT_FOUND -99999 + #define FUNCTION_NOT_FOUND -99998 + #define SUCCESS 0 + #define ERROR_UNKNOWN -99997 + // dcmi + int (*lq_dcmi_init_func)(); + static int dcmi_init_lq(){ + lq_dcmi_init_func(); + } + + int (*lq_dcmi_get_fault_info_func)(unsigned int list_len, unsigned int *event_list_len, struct LqDcmiEvent *event_list); + static int lq_dcmi_get_fault_info(unsigned int list_len, unsigned int *event_list_len, struct LqDcmiEvent *event_list){ + lq_dcmi_get_fault_info_func(list_len,event_list_len,event_list); + } + + void goFaultEventHandler(struct LqDcmiEvent *fault_event); + static void event_handler(struct LqDcmiEvent *fault_event){ + goFaultEventHandler(fault_event); + } + + int(*lq_dcmi_subscribe_fault_event_func)(struct lq_dcmi_event_filter filter,lq_dcmi_fault_event_callback handler); + static int lq_dcmi_subscribe_fault_event(struct lq_dcmi_event_filter filter){ + lq_dcmi_subscribe_fault_event_func(filter,event_handler); + } + + // load .so files and functions + static int dcmiInit_lq(const char* dcmiLibPath){ + if (dcmiLibPath == NULL) { + fprintf (stderr,"lib path is null\n"); + return SO_NOT_FOUND; + } + dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL); + if (dcmiHandle == NULL){ + fprintf (stderr,"%s\n",dlerror()); + return SO_NOT_FOUND; + } + + + lq_dcmi_init_func = dlsym(dcmiHandle,"lq_dcmi_init"); + lq_dcmi_subscribe_fault_event_func = dlsym(dcmiHandle,"lq_dcmi_subscribe_fault_event"); + lq_dcmi_get_fault_info_func = dlsym(dcmiHandle,"lq_dcmi_get_fault_info"); + return SUCCESS; + } + + static int lqDcmiShutDown(void){ + if (dcmiHandle == NULL) { + return SUCCESS; + } + return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS); + } +*/ +import "C" + +const ( + maxFaultNum = 128 + subTypeBase = 1000 +) + +// SwitchFaultEvent is the struct for switch reported fault +type SwitchFaultEvent struct { + EventType uint + // SubType fault subtype used for id a fault + SubType uint + // PeerPortDevice used to tell what kind of device connected to + PeerPortDevice uint + PeerPortId uint + SwitchChipId uint + SwitchPortId uint + // Severity used to tell how serious is the fault + Severity uint + // Assertion tell what kind of fault, recover, happen or once + Assertion uint + EventSerialNum int + NotifySerialNum int + AlarmRaisedTime int64 + AdditionalParam string + AdditionalInfo string +} + +// SwitchDevManager is the manager for switch +type SwitchDevManager struct { +} + +var ( + switchInitOnce sync.Once + // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, + // this kind of fault code will be faultcode * 1000 + PeerPortDevice + duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} +) + +// UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed +func UpdateSwitchFaultLevel() { + common.SwitchFaultLevelMapLock.Lock() + defer common.SwitchFaultLevelMapLock.Unlock() + common.SwitchFaultLevelMap = make(map[int64]int, common.GeneralMapSize) + for _, code := range common.NotHandleFaultCodes { + common.SwitchFaultLevelMap[code] = common.NotHandleFaultLevel + } + for _, code := range common.PreSeparateFaultCodes { + common.SwitchFaultLevelMap[code] = common.PreSeparateFaultLevel + } + for _, code := range common.SeparateFaultCodes { + common.SwitchFaultLevelMap[code] = common.SeparateFaultLevel + } +} + +// NewSwitchDevManager create a new SwitchDevManager +func NewSwitchDevManager() *SwitchDevManager { + return &SwitchDevManager{} +} + +// InitSwitchDev try to call init func of driver, before call any other function +func (sdm *SwitchDevManager) InitSwitchDev() error { + // path is not determined yet + dcmiLibName := "liblingqu-dcmi.so" + dcmiLibPath, err := utils.GetDriverLibPath(dcmiLibName) + if err != nil { + return fmt.Errorf("failed to find switch library so, err:%s", err.Error()) + } + cDcmiTemplateName := C.CString(dcmiLibPath) + defer C.free(unsafe.Pointer(cDcmiTemplateName)) + if retCode := C.dcmiInit_lq(cDcmiTemplateName); retCode != C.SUCCESS { + return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode)) + } + if retCode := C.dcmi_init_lq(); retCode != C.SUCCESS { + return fmt.Errorf("dcmi init call failed, error code: %d", int32(retCode)) + } + hwlog.RunLog.Info("init switch library succeed") + + return nil +} + +// ShutDownSwitch need to be called before dp exit +func (sdm *SwitchDevManager) ShutDownSwitch() { + if retCode := C.lqDcmiShutDown(); retCode != C.SUCCESS { + hwlog.RunLog.Error("failed to shutdown switch library") + return + } + hwlog.RunLog.Info("switch library has been shutdown") +} + +//export goFaultEventHandler +func goFaultEventHandler(event *C.struct_LqDcmiEvent) { + // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens + faultEvent := convertFaultEvent(event) + hwlog.RunLog.Warnf("switch subscribe got fault:%#v, hex:%v", faultEvent, + fmt.Sprintf("%08x", faultEvent.SubType)) + // for recovered fault, delete them from current fault codes + if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { + newFaultCodes := make([]int64, 0) + for _, code := range common.GetSwitchFaultCode() { + if code != int64(faultEvent.SubType) { + newFaultCodes = append(newFaultCodes, code) + } + } + common.SetSwitchFaultCode(newFaultCodes) + return + } + currentFault := common.GetSwitchFaultCode() + common.SetSwitchFaultCode(append(currentFault, int64(faultEvent.SubType))) +} + +// GetSwitchFaultCodeByInterval start a none stop loop to query and update switch fault code +func (sdm *SwitchDevManager) GetSwitchFaultCodeByInterval(ctx context.Context, interval time.Duration) { + for { + select { + case _, ok := <-ctx.Done(): + if !ok { + hwlog.RunLog.Info("stop signal channel closed") + } + hwlog.RunLog.Info("query switch fault by interval stopped") + return + default: + hwlog.RunLog.Debug("will start to query all switch fault codes") + errCodes, err := GetSwitchFaults() + if err != nil { + hwlog.RunLog.Error(err) + time.Sleep(interval) + continue + } + + common.SetSwitchFaultCode(errCodes) + + time.Sleep(interval) + } + } +} + +// SubscribeSwitchFaults will start to subscribe fault from switch, +// and the callback function is faultEventHandler(event *C.struct_fault_event) +func (sdm *SwitchDevManager) SubscribeSwitchFaults() error { + var filter C.struct_lq_dcmi_event_filter + if retCode := C.lq_dcmi_subscribe_fault_event(filter); int32(retCode) != 0 { + hwlog.RunLog.Errorf("failed to subscribe switch fault, errCode: %v", retCode) + return fmt.Errorf("failed to subscribe switch fault, errCode: %v", retCode) + } + hwlog.RunLog.Info("succeed to subscribe switch fault") + return nil +} + +// GetSwitchFaults will try to get all fault +func GetSwitchFaults() ([]int64, error) { + var errCount C.uint + var errInfoArray [maxFaultNum]C.struct_LqDcmiEvent + if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { + return []int64{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) + } + if int32(errCount) < 0 || int32(errCount) > maxFaultNum { + return []int64{}, fmt.Errorf("failed to get switch device errcodes, cause errcodes nums %d is illegal", errCount) + } + + retErrores := make([]int64, 0) + for i := 0; i < len(errInfoArray); i++ { + faultEvent := convertFaultEvent(&errInfoArray[i]) + if faultEvent.SubType == 0 { + continue + } + if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { + continue + } + retErrores = append(retErrores, int64(faultEvent.SubType)) + } + hwlog.RunLog.Warnf("get fault:%#v", retErrores) + return retErrores, nil +} + +// convertFaultEvent convert event getting from driver to go struct +func convertFaultEvent(event *C.struct_LqDcmiEvent) SwitchFaultEvent { + fault := SwitchFaultEvent{ + EventType: uint(event.eventType), + SubType: uint(event.subType), + PeerPortDevice: uint(event.peerportDevice), + PeerPortId: uint(event.peerportId), + SwitchChipId: uint(event.switchChipid), + SwitchPortId: uint(event.switchPortid), + Severity: uint(event.severity), + Assertion: uint(event.assertion), + EventSerialNum: int(event.eventSerialNum), + NotifySerialNum: int(event.notifySerialNum), + AlarmRaisedTime: int64(event.alarmRaisedTime), + } + fault.SubType = getEventType(fault) + return fault +} + +func getEventType(event SwitchFaultEvent) uint { + if duplicate, ok := duplicateSubEventMap[int(event.SubType)]; ok && duplicate { + return event.SubType*subTypeBase + event.PeerPortDevice + } + return event.SubType +} diff --git a/pkg/device/deviceSwitch/library.h b/pkg/device/deviceSwitch/library.h new file mode 100644 index 0000000000000000000000000000000000000000..6c23b4a0c6003ed998cf586381b6310c5c4a20de --- /dev/null +++ b/pkg/device/deviceSwitch/library.h @@ -0,0 +1,78 @@ +#ifndef LINGQU_LIBRARY_H +#define LINGQU_LIBRARY_H + +#define MAX_EVENT_RESV_LENGTH 32 +#define DCMIDLLEXPORT static + +#define LQ_DCMI_EVENT_FILTER_FLAG_EVENT_TYPE_ID (1UL << 0) +#define LQ_DCMI_EVENT_FILTER_FLAG_EVENT_ID (1UL << 1) +#define LQ_DCMI_EVENT_FILTER_FLAG_SERVERITY (1UL << 2) +#define LQ_DCMI_EVENT_FILTER_FLAG_CHIP_ID (1UL << 3) + +typedef enum { + HAL_REPORT_FAULT_BLOCK = 0, + HAL_REPORT_FAULT_MEMORY, + HAL_REPORT_FAULT_DISCARD, + HAL_REPORT_FAULT_MEMORY_ALARM, + HAL_REPORT_FAULT_MODULE_RESET, + HAL_REPORT_HEART_LASTWORD, + HAL_REPORT_FAULT_HEART, + HAL_REPORT_PORT_FAULT_INVALID_PKG, + HAL_REPORT_PORT_FAULT_UNSTABLE, + HAL_REPORT_PORT_FAULT_FAIL, + HAL_REPORT_FAULT_BY_DEVICE, + HAL_REPORT_FAULT_CONFIG, + HAL_REPORT_FAULT_MEM_SINGLE, + HAL_REPORT_FAULT_M7, + HAL_REPORT_FAULT_BLOCK_C, + HAL_REPORT_FAULT_MEM_MULTI, + HAL_REPORT_FAULT_PCIE, + HAL_REPORT_FAULT_FATAL, + HAL_REPORT_PORT_FAULT_TIMEOUT_RP, + HAL_REPORT_PORT_FAULT_TIMEOUT_LP, + HAL_REPORT_FAULT_MAX +} HalReportFaultType; + +typedef struct LqDcmiEvent{ + HalReportFaultType eventType; + unsigned int subType; + unsigned short peerportDevice; + unsigned short peerportId; + unsigned short switchChipid; + unsigned short switchPortid; + + unsigned char severity; + unsigned char assertion; + char res[6]; + unsigned int eventSerialNum; + unsigned int notifySerialNum; + unsigned long alarmRaisedTime; + + unsigned char additionalParam + [40]; + char additionalInfo[32]; +}LqDcmiEvent; + +typedef enum { + EventTypeId = 1UL << 0, + EventId = 1UL << 1, + Severity = 1 << 2, + ChipId = 1 << 3 +} LqDcmiEventFilterFlag; + +typedef struct lq_dcmi_event_filter { + LqDcmiEventFilterFlag filterFlag; + HalReportFaultType eventTypeId; + unsigned int eventId; + unsigned char severity; + unsigned int chipId; +} LqDcmiEventFilter; + + +typedef void (*lq_dcmi_fault_event_callback)(struct LqDcmiEvent *event); + +DCMIDLLEXPORT int lq_dcmi_init(); +DCMIDLLEXPORT int lq_dcmi_subscribe_fault_event(struct lq_dcmi_event_filter filter); +DCMIDLLEXPORT int lq_dcmi_get_fault_info(unsigned int list_len, unsigned int *event_list_len, struct LqDcmiEvent *event_list); + +#endif// LINGQU_LIBRARY_H \ No newline at end of file diff --git a/pkg/kubeclient/client_server.go b/pkg/kubeclient/client_server.go index 13f7388ff00f5bc3855cec01bb7609c13fc6b385..85da291d20e3e2a58c24707e950cad7ab0309bb7 100644 --- a/pkg/kubeclient/client_server.go +++ b/pkg/kubeclient/client_server.go @@ -169,7 +169,7 @@ func (ki *ClientK8s) GetManuallySeparateNPUIDFromDeviceInfo(deviceInfoCMName, de // WriteDeviceInfoDataIntoCM write deviceinfo into config map func (ki *ClientK8s) WriteDeviceInfoDataIntoCM(deviceInfo map[string]string, - manuallySeparateNPU string, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { + manuallySeparateNPU string, switchInfo common.SwitchFaultInfo, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { var nodeDeviceData = common.NodeDeviceInfoCache{ DeviceInfo: common.NodeDeviceInfo{ @@ -181,18 +181,29 @@ func (ki *ClientK8s) WriteDeviceInfoDataIntoCM(deviceInfo map[string]string, } nodeDeviceData.CheckCode = common.MakeDataHash(nodeDeviceData.DeviceInfo) - var data []byte + var data, switchData []byte if data = common.MarshalData(nodeDeviceData); len(data) == 0 { return nil, fmt.Errorf("marshal nodeDeviceData failed") } + if switchData = common.MarshalData(switchInfo); len(switchData) == 0 { + return nil, fmt.Errorf("marshal switchDeviceData failed") + } deviceInfoCM := &v1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: ki.DeviceInfoName, Namespace: common.DeviceInfoCMNameSpace, Labels: map[string]string{common.CmConsumer: common.CmConsumerValue}, }, - Data: map[string]string{common.DeviceInfoCMDataKey: string(data), - common.DeviceInfoCMManuallySeparateNPUKey: manuallySeparateNPU}, + } + if common.ParamOption.RealCardType != common.Ascend910A3 { + deviceInfoCM.Data = map[string]string{ + common.DeviceInfoCMDataKey: string(data), + common.DeviceInfoCMManuallySeparateNPUKey: manuallySeparateNPU} + } else { + deviceInfoCM.Data = map[string]string{ + common.DeviceInfoCMDataKey: string(data), + common.SwitchInfoCMDataKey: string(switchData), + common.DeviceInfoCMManuallySeparateNPUKey: manuallySeparateNPU} } hwlog.RunLog.Debugf("write device info cache into cm: %s/%s.", deviceInfoCM.Namespace, deviceInfoCM.Name) diff --git a/pkg/kubeclient/client_server_test.go b/pkg/kubeclient/client_server_test.go index d23184417290e360acda6cccc70dff8496dd432a..8e2dc861d6a6278ce502dace9f432d6f23b5f6a6 100644 --- a/pkg/kubeclient/client_server_test.go +++ b/pkg/kubeclient/client_server_test.go @@ -157,7 +157,7 @@ func TestWriteDeviceInfoDataIntoCM(t *testing.T) { }) defer mockGetCM.Reset() _, err := utKubeClient.WriteDeviceInfoDataIntoCM(getDeviceInfo(common.HuaweiAscend310P, npuChip310PPhyID0), - "", -1, -1) + "", common.SwitchFaultInfo{}, -1, -1) convey.So(err, convey.ShouldEqual, nil) }) convey.Convey("get write device info (cm) when get cm success", t, func() { @@ -167,7 +167,7 @@ func TestWriteDeviceInfoDataIntoCM(t *testing.T) { }) defer mockGetCM.Reset() _, err := utKubeClient.WriteDeviceInfoDataIntoCM(getDeviceInfo(common.HuaweiAscend310P, npuChip310PPhyID0), - "", -1, -1) + "", common.SwitchFaultInfo{}, -1, -1) convey.So(err, convey.ShouldEqual, nil) }) } @@ -374,7 +374,7 @@ func resetMock(resetMockList ...*gomonkey.Patches) { func annotationResetMock(devErr, stateErr, nodeErr error) (*gomonkey.Patches, *gomonkey.Patches, *gomonkey.Patches) { node := getMockNode(common.HuaweiAscend910, npuChip910PhyID0) mockWrite := gomonkey.ApplyMethod(reflect.TypeOf(new(ClientK8s)), "WriteDeviceInfoDataIntoCM", - func(_ *ClientK8s, _ map[string]string, _ string, _, _ int32) (*common.NodeDeviceInfoCache, error) { + func(_ *ClientK8s, _ map[string]string, _ string, _ common.SwitchFaultInfo, _, _ int32) (*common.NodeDeviceInfoCache, error) { return nil, devErr }) mockPatchNode := gomonkey.ApplyMethod(reflect.TypeOf(new(ClientK8s)), "PatchNodeState", diff --git a/pkg/kubeclient/kube_cache.go b/pkg/kubeclient/kube_cache.go index 3e9dd846082a35d5900376510ff46cb1acfffe72..09b80e3ab5edb36c731682da3fd39edfb9e5fc8f 100644 --- a/pkg/kubeclient/kube_cache.go +++ b/pkg/kubeclient/kube_cache.go @@ -189,11 +189,12 @@ func (ki *ClientK8s) GetDeviceInfoCMCache() *common.NodeDeviceInfoCache { // WriteDeviceInfoDataIntoCMCache write deviceinfo into config map with cache func (ki *ClientK8s) WriteDeviceInfoDataIntoCMCache(deviceInfo map[string]string, manuallySeparateNPU string, - superPodID, serverIndex int32) error { - newNodeDeviceInfoCache, err := ki.WriteDeviceInfoDataIntoCM(deviceInfo, manuallySeparateNPU, superPodID, serverIndex) + switchInfo common.SwitchFaultInfo, superPodID, serverIndex int32) error { + newNodeDeviceInfoCache, err := ki.WriteDeviceInfoDataIntoCM(deviceInfo, manuallySeparateNPU, switchInfo, superPodID, serverIndex) if err != nil { return err } + nodeDeviceInfoCache = newNodeDeviceInfoCache return nil } diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index a15eaac23e72ba606517bf72f975a1d24706f9b7..53ee9d64d5cdaee47bc9265f45b15460d7d67f2d 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -242,7 +242,7 @@ func (ki *ClientK8s) resetNodeAnnotations(node *v1.Node) { // ResetDeviceInfo reset device info func (ki *ClientK8s) ResetDeviceInfo() { deviceList := make(map[string]string, 1) - if err := ki.WriteDeviceInfoDataIntoCMCache(deviceList, "", -1, -1); err != nil { + if err := ki.WriteDeviceInfoDataIntoCMCache(deviceList, "", common.GetSwitchFaultInfo(), -1, -1); err != nil { hwlog.RunLog.Errorf("write device info failed, error is %v", err) } } diff --git a/pkg/server/manager.go b/pkg/server/manager.go index 338dafa3f86b4bd60383fb3a7fcd154a63b94ebf..5647471db33e76b663dba86bdc0cc7641a1a8e5f 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -36,6 +36,7 @@ import ( "Ascend-device-plugin/pkg/common" "Ascend-device-plugin/pkg/device" + "Ascend-device-plugin/pkg/device/deviceSwitch" "Ascend-device-plugin/pkg/kubeclient" ) @@ -43,12 +44,13 @@ var lastStatus = common.NewAtomicBool(false) // HwDevManager manages huawei device devices. type HwDevManager struct { - groupDevice map[string][]*common.NpuDevice - ServerMap map[string]InterfaceServer - allInfo common.NpuAllInfo - manager device.DevManager - RunMode string - WorkMode string + SwitchDevManager *deviceSwitch.SwitchDevManager + groupDevice map[string][]*common.NpuDevice + ServerMap map[string]InterfaceServer + allInfo common.NpuAllInfo + manager device.DevManager + RunMode string + WorkMode string } // NewHwDevManager function is used to new a dev manager. @@ -344,6 +346,10 @@ func (hdm *HwDevManager) separateNPUIDFromDeviceInfoIntoCache() { func (hdm *HwDevManager) ListenDevice(ctx context.Context) { hwlog.RunLog.Info("starting the listen device") hdm.subscribeFaultEvent() + if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault { + // will set a goroutine to query all switch faults every 5 min + go hdm.SwitchDevManager.GetSwitchFaultCodeByInterval(ctx, time.Second*common.GetSwitchFaultCodeInterval) + } // when device-plugin is started, the value of ManuallySeparateNPU in device info configmap needs to be written into // cache to prevent manually separate npu IDs in cache from been lost hdm.separateNPUIDFromDeviceInfoIntoCache() @@ -640,6 +646,7 @@ func (hdm *HwDevManager) SignCatch(cancel context.CancelFunc) { cancel() hdm.stopAllSever() hdm.manager.GetDmgr().ShutDown() + hdm.SwitchDevManager.ShutDownSwitch() } } @@ -926,6 +933,27 @@ func (hdm *HwDevManager) execResetChip(logicID int32, isResetExec *bool) error { } func (hdm *HwDevManager) subscribeFaultEvent() { + hdm.subscribeNpuFaultEvent() + hdm.subscribeSwitchFaultEvent() +} + +func (hdm *HwDevManager) subscribeSwitchFaultEvent() { + if common.ParamOption.RealCardType != common.Ascend910A3 || !common.ParamOption.EnableSwitchFault { + return + } + for i := 0; i < common.GeneralSubscribeTime; i++ { + if err := hdm.SwitchDevManager.SubscribeSwitchFaults(); err != nil { + time.Sleep(time.Second) + continue + } + return + } + common.SwitchSubscribeFailed = true + hwlog.RunLog.Error("request Subscribe Switch FaultEvent failed, the subscribe way is closed") +} + +// subscribeNpuFaultEvent subscribe fault happend on npus +func (hdm *HwDevManager) subscribeNpuFaultEvent() { if err := common.LoadFaultCodeFromFile(); err != nil { common.SubscribeFailed = true hwlog.RunLog.Errorf("load faultCode.json failed, the subscribe way is closed, err: %v", err) @@ -1001,6 +1029,10 @@ func (hdm *HwDevManager) pollFaultCodeCM(ctx context.Context) { interval = getFaultCodeCMPollInterval(configMap) resourceVersion = configMap.ResourceVersion loadFaultCode(configMap) + if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault { + loadSwitchFaultCode(configMap) + deviceSwitch.UpdateSwitchFaultLevel() + } loadFaultCustomization(configMap) hwlog.RunLog.Infof("handling '%s' configmap change complete", common.FaultCodeCMName) } @@ -1016,6 +1048,13 @@ func initFaultInfoFromFile() { if err := common.LoadFaultCustomizationFromFile(); err != nil { hwlog.RunLog.Errorf("load fault customization from file failed, err: %v", err) } + if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault { + if err := common.LoadSwitchFaultCodeFromFile(); err != nil { + hwlog.RunLog.Errorf("load switch fault code from file failed, err: %v", err) + return + } + deviceSwitch.UpdateSwitchFaultLevel() + } } func loadFaultCode(configMap *v1.ConfigMap) { @@ -1041,6 +1080,30 @@ func loadFaultCode(configMap *v1.ConfigMap) { hwlog.RunLog.Infof("load fault code from configmap success") } +func loadSwitchFaultCode(configMap *v1.ConfigMap) { + switchFaultCode, ok := configMap.Data[common.SwitchFaultCodeKey] + if !ok { + hwlog.RunLog.Errorf("cannot find key '%s' in CM, try to load SwitchFaultCode.json", common.SwitchFaultCodeKey) + if err := common.LoadSwitchFaultCodeFromFile(); err != nil { + hwlog.RunLog.Errorf("load switch fault code from SwitchFaultCode.json failed, err: %v", err) + return + } + hwlog.RunLog.Info("load switch fault code from file success") + return + } + if err := common.LoadSwitchFaultCode([]byte(switchFaultCode)); err != nil { + hwlog.RunLog.Errorf("failed to load switch fault code from configmap, err: %s, "+ + "will try to load from file", err.Error()) + if err := common.LoadSwitchFaultCodeFromFile(); err != nil { + hwlog.RunLog.Errorf("load switch fault code from SwitchFaultCode.json failed, err: %v", err) + return + } + hwlog.RunLog.Info("load switch fault code from file success") + return + } + hwlog.RunLog.Info("load switch fault code from configmap success") +} + func loadFaultCustomization(configMap *v1.ConfigMap) { faultCustomization, ok := configMap.Data[common.FaultCustomizationKey] if !ok { diff --git a/pkg/server/manager_test.go b/pkg/server/manager_test.go index 5723710713b6a9cd4ad39c683cd44b6beed97ccc..bbf653b706fab2e1b41a2ce18730fee734194a83 100644 --- a/pkg/server/manager_test.go +++ b/pkg/server/manager_test.go @@ -17,6 +17,7 @@ package server import ( "fmt" + "os" "reflect" "testing" "time" @@ -52,11 +53,25 @@ func setPatch() *gomonkey.Patches { return patch } +func createFile(filePath string) error { + f, err := os.Create(filePath) + if err != nil { + return err + } + defer f.Close() + return f.Chmod(common.SocketChmod) +} + // TestTestNewHwDevManager for testTestNewHwDevManager func TestNewHwDevManager(t *testing.T) { patch := setPatch() defer patch.Reset() convey.Convey("test NewHwDevManager", t, func() { + if _, err := os.Stat(common.HiAIManagerDevice); err != nil { + if err = createFile(common.HiAIManagerDevice); err != nil { + t.Fatal("TestGetDefaultDevices Run Failed") + } + } mockGetChipAiCoreCount := gomonkey.ApplyMethod(reflect.TypeOf(new(device.AscendTools)), "GetChipAiCoreCount", func(_ *device.AscendTools) (int32, error) { return common.DeviceNotSupport, nil