diff --git a/build/faultCode.json b/build/faultCode.json index c57f820ad43d654e1136f121777423f93e467cd9..4676858b03743dbca8e57fa0e2b5895bfb7cc768 100644 --- a/build/faultCode.json +++ b/build/faultCode.json @@ -9,7 +9,7 @@ "81078603","8C2FA009","A4025021","A60250C1","A4025081","A214000D","A414000D","A4028801","A4025101","A2140007", "A4140007","A2140008","A4140008","A40250E1","A214000A","A414000A","A4025061","A4025041","A214000B","A414000B", "A414000C","A2140009","A4140009","A4303002","81A3880C","80B78006","80B78005","80E1800F","80E21008","819B8003", - "8C464E00","814D8006" + "8C464E00","814D8006","80E01801" ], "RestartRequestCodes":[ "80C98008","80C98002","80C98003","80C98009","80CB8002","80CB8008","80CB8009","80CF8003","81318008","80D58000", @@ -18,7 +18,7 @@ "RestartBusinessCodes":[ "8C084E00","8C204E00","8C124E00","A8028802","A4302003","A4302004","A4302005","A4302006","A4302009","A430200A", "A6301002","B4060011","B406009C","B4060008","B4060009","B406000E","A60250A1","A2301001","A2301002","A2303001", - "B4060006","B4060007","B406000D","B4060014","B4060010","B4060011","80E01801" + "B4060006","B4060007","B406000D","B4060014","B4060010","B4060011" ], "FreeRestartNPUCodes":[ "8C0E4E00","8C104E00","8C0C4E00","8C044E00","8C064E00","8C17A005","8C1DA005","8C19A005","80E58E03","80E58E02", diff --git a/build/faultCustomization.json b/build/faultCustomization.json index 945a7f6ea090fb15f02ac4ee538f0a009422a6ee..ac7c25a7f0abafd3be63587763b5f7b461713de8 100644 --- a/build/faultCustomization.json +++ b/build/faultCustomization.json @@ -30,6 +30,12 @@ "FaultTimeout": 30, "RecoverTimeout": 60, "FaultHandling": "PreSeparateNPU" + }, + { + "EventId": ["80E01801"], + "FaultTimeout": 30, + "RecoverTimeout": 0, + "FaultHandling": "RestartBusiness" } ] } \ No newline at end of file diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 6b605535da720c29772123dab93b04ebb031fe06..87186fe715927b475c5911641273e4d3b2c96128 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -161,8 +161,6 @@ const ( GeneralSubscribeTime = 3 // Hex hexadecimal Hex = 16 - // LinkupRecoverTime is the linkup duration for restoring NPU network health - LinkupRecoverTime = 60 // SecondMagnification is second-level unit magnification SecondMagnification = 1000 ) @@ -723,9 +721,23 @@ const ( ServerIdAbnormal = -2 ) +const ( + // TimeoutProcess represents fault timeout process + TimeoutProcess = "fault timeout" + // TimeoutRecoverProcess represents fault timeout recover process + TimeoutRecoverProcess = "fault timeout recover" +) + const ( // ChipFaultMode represents chip fault mode ChipFaultMode = "chip fault mode" // NetworkFaultMode represents network fault mode NetworkFaultMode = "network fault mode" ) + +const ( + // Polling represents subscribe mode invalid and polling is used scenario + Polling = "polling" + // Subscribe represents subscribe mode + Subscribe = "subscribe" +) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 9b2b4647317c54d523e079658d3c979b05b6dcac..f497d3b201172d51baaa189440a3eaee0e0afd8e 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -28,7 +28,6 @@ import ( "huawei.com/npu-exporter/v6/common-utils/utils" "huawei.com/npu-exporter/v6/devmanager/common" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) const ( @@ -58,8 +57,6 @@ const ( CardNetworkUnhealthy = "CardNetworkUnhealthy" // LinkDownFaultCode linkdown fault code LinkDownFaultCode = 0x81078603 - // LinkDownFaultCodeStr linkdown fault code string - LinkDownFaultCodeStr = "81078603" // ResetFinishFaultCode reset finish fault code ResetFinishFaultCode = 0x8C2FA009 // CardDropFaultCode card drop fault code @@ -88,12 +85,10 @@ var ( initLogicIDs []int32 // logicIDLock operate initLogicIDs lock logicIDLock sync.Mutex - // UseGetDeviceNetWorkHealthApi for indicating whether to use dcmi_get_device_network_health api - UseGetDeviceNetWorkHealthApi = true - // timeoutFaultInfoMap timeout event info cache - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - // recoverFaultMap fault event info cache + // recoverFaultMap recover fault event info cache recoverFaultMap = make(map[int32][]int64, GeneralMapSize) + // recoverNetworkFaultMap network recover fault event info cache + recoverNetworkFaultMap = make(map[int32][]int64, GeneralMapSize) // recoverFaultFrequencyMap frequency fault info cache recoverFaultFrequencyMap = make(map[int32]string, GeneralMapSize) // devFaultInfoMap save the subscribe interface return fault @@ -128,10 +123,6 @@ var ( faultFrequencyMap = make(map[string]*FaultFrequencyCache, common.MaxErrorCodeCount) // faultFrequencyMapLock is the lock of faultFrequencyMap faultFrequencyMapLock sync.Mutex - // linkDownTimeoutCustomization is the customized timeout for link down event - linkDownTimeoutCustomization = ParamOption.LinkdownTimeout - // linkUpTimeoutCustomization is the customized timeout for link up event - linkUpTimeoutCustomization = int64(DefaultLinkUpTimeout) // faultDurationMap is the cache saving the occur duration of a fault, key is event id faultDurationMap = make(map[string]*FaultDurationCache, common.MaxErrorCodeCount) // faultDurationMapLock is the lock of faultDurationMap @@ -239,6 +230,14 @@ type FaultDuration struct { FaultHandling string } +type handleDurationInputPara struct { + logicID int32 + eventId string + index int + toStatus bool + duration int64 +} + // DevFaultInfoBasedTimeAscend sort fault queue based on alarmRaisedTime in ascending order type DevFaultInfoBasedTimeAscend []common.DevFaultInfo @@ -803,14 +802,6 @@ func GetAndCleanLogicID() []int32 { return oldInitLogicIDs } -// GetInitLogicIDs get init device's logicIDs -func GetInitLogicIDs() []int32 { - if len(initLogicIDs) == 0 { - return nil - } - return initLogicIDs -} - // setAlarmRaisedTime set `AlarmRaisedTime` by device fault code length func setAlarmRaisedTime(device *NpuDevice) { if len(device.FaultCodes) == 0 { @@ -838,7 +829,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF // it must deal with two 'for', because the fault may recover one moment, in this case, // the recover message and occur message both in faultInfos, this fault cannot be reports outside. for _, faultInfo := range faultInfos { - if faultInfo.EventID == LinkDownFaultCode { + if NetworkFaultCodes.Has(faultInfo.EventID) { continue } if faultInfo.Assertion == common.FaultRecover { @@ -853,18 +844,67 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF } } for _, faultInfo := range faultInfos { - if faultInfo.EventID == LinkDownFaultCode { + if NetworkFaultCodes.Has(faultInfo.EventID) { continue } if faultInfo.Assertion == common.FaultOccur || faultInfo.Assertion == common.FaultOnce { device.FaultCodes = append(device.FaultCodes, faultInfo.EventID) - insertFaultFrequency(device.LogicID, faultInfo.EventID) + eventIdStr := strings.ToLower(strconv.FormatInt(faultInfo.EventID, Hex)) + if _, ok := faultDurationMap[eventIdStr]; !ok { + insertFaultFrequency(device.LogicID, faultInfo.EventID) + } } } setAlarmRaisedTime(device) } -// DelOnceRecoverFault delete func 'cacheAfterDelFaultCode' record fault code in the end of cycle +// SetNetworkNewFaultAndCacheOnceRecoverFault set new network fault code and cache once recover network fault +func SetNetworkNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice) { + if device == nil { + hwlog.RunLog.Error("param device is nil in SetNetworkNewFaultAndCacheOnceRecoverFault") + return + } + // it must deal with two 'for', because the fault may recover one moment, in this case, + // the recover message and occur message both in faultInfos, this fault cannot be reports outside. + networkFaultRecoverAndFaultOnceHandle(logicID, faultInfos, device) + networkFaultOccurAndFaultOnceHandle(faultInfos, device) + setNetworkAlarmRaisedTime(device) +} + +func networkFaultRecoverAndFaultOnceHandle(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice) { + for _, faultInfo := range faultInfos { + if !NetworkFaultCodes.Has(faultInfo.EventID) { + continue + } + if faultInfo.Assertion == common.FaultRecover { + if Int64Tool.Index(device.NetworkFaultCodes, faultInfo.EventID) == -1 { + recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], faultInfo.EventID) + } else { + device.NetworkFaultCodes = Int64Tool.Remove(device.NetworkFaultCodes, faultInfo.EventID) + } + } + if faultInfo.Assertion == common.FaultOnce { + recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], faultInfo.EventID) + } + } +} + +func networkFaultOccurAndFaultOnceHandle(faultInfos []common.DevFaultInfo, device *NpuDevice) { + for _, faultInfo := range faultInfos { + if !NetworkFaultCodes.Has(faultInfo.EventID) { + continue + } + if faultInfo.Assertion == common.FaultOccur || faultInfo.Assertion == common.FaultOnce { + device.NetworkFaultCodes = append(device.NetworkFaultCodes, faultInfo.EventID) + eventIdStr := strings.ToLower(strconv.FormatInt(faultInfo.EventID, Hex)) + if _, ok := faultDurationMap[eventIdStr]; !ok { + insertFaultFrequency(device.LogicID, faultInfo.EventID) + } + } + } +} + +// DelOnceRecoverFault delete func 'cacheAfterDelFaultCode' record fault code and network fault code in the end of cycle func DelOnceRecoverFault(groupDevice map[string][]*NpuDevice) { for _, devices := range groupDevice { for _, device := range devices { @@ -873,9 +913,16 @@ func DelOnceRecoverFault(groupDevice map[string][]*NpuDevice) { device.FaultCodes = Int64Tool.Remove(device.FaultCodes, recoverFault) } setAlarmRaisedTime(device) + + recoverNetworkFaults := recoverNetworkFaultMap[device.LogicID] + for _, recoverNetworkFault := range recoverNetworkFaults { + device.NetworkFaultCodes = Int64Tool.Remove(device.NetworkFaultCodes, recoverNetworkFault) + } + setNetworkAlarmRaisedTime(device) } } recoverFaultMap = make(map[int32][]int64, GeneralMapSize) + recoverNetworkFaultMap = make(map[int32][]int64, GeneralMapSize) } // DelOnceFrequencyFault clear all the fault occurrence time in cache when frequency @@ -1021,233 +1068,264 @@ func DeleteManuallyFaultInfo(logicID int32) { } } -// GetLinkdownLinkupFaultEvents get linkdown/linkup events from event subscription interface -func GetLinkdownLinkupFaultEvents(logicID int32, faultInfos []common.DevFaultInfo) { - hwlog.RunLog.Debugf("logicId: %v, faultInfos: %+v", logicID, faultInfos) - if len(faultInfos) == 0 { - UseGetDeviceNetWorkHealthApi = true - return +// CountFaultDuration used to calculate each fault duration +func CountFaultDuration(device *NpuDevice, devFaultInfoMap map[int32][]common.DevFaultInfo) { + // Collect fault events from fault event queue cache to form the fault queue for duration statistics + collectEachFaultEvent(device.LogicID, devFaultInfoMap[device.LogicID]) + faultDurationMapLock.Lock() + defer faultDurationMapLock.Unlock() + + for eventId, _ := range faultDurationMap { + // Sort fault events in the fault queue in ascending order based on fault event AlarmRaisedTime + sortFaultEventsInAscendingOrder(device.LogicID, eventId) + + // Merge consecutive fault events by fault event assertion in the fault queue + // and clear first event according to the fault status of the current fault code + cleanFaultQueue(device.LogicID, eventId) + + // update the fault code timeout status, fault duration time, fault recover duration time + // and clear fault queue cache through timeout judgment and recovery judgment algorithm + handleFaultQueue(device.LogicID, eventId) } - isUseGetDeviceNetWorkHealthApi := true +} + +func collectEachFaultEvent(logicId int32, faultInfos []common.DevFaultInfo) { + faultDurationMapLock.Lock() + defer faultDurationMapLock.Unlock() + for _, faultInfo := range faultInfos { - if faultInfo.EventID == LinkDownFaultCode { - isUseGetDeviceNetWorkHealthApi = false - timeoutFaultInfoMap[logicID] = append(timeoutFaultInfoMap[logicID], faultInfo) + eventIdStr := strings.ToLower(strconv.FormatInt(faultInfo.EventID, Hex)) + if _, ok := faultDurationMap[eventIdStr]; !ok { + continue } + + if faultDurationMap[eventIdStr].Duration == nil { + faultDurationMap[eventIdStr].Duration = make(map[int32]FaultDurationData, 0) + } + + if _, ok := faultDurationMap[eventIdStr].Duration[logicId]; !ok { + faultDurationMap[eventIdStr].Duration[logicId] = FaultDurationData{ + FaultEventQueue: []common.DevFaultInfo{}, // Initializing the slice + } + } + faultDurationData := faultDurationMap[eventIdStr].Duration[logicId] + faultDurationData.FaultEventQueue = append(faultDurationData.FaultEventQueue, faultInfo) + faultDurationMap[eventIdStr].Duration[logicId] = faultDurationData } - if UseGetDeviceNetWorkHealthApi != isUseGetDeviceNetWorkHealthApi { - UseGetDeviceNetWorkHealthApi = isUseGetDeviceNetWorkHealthApi +} + +func sortFaultEventsInAscendingOrder(logicID int32, eventId string) { + if _, ok := faultDurationMap[eventId]; !ok { + return } - if !UseGetDeviceNetWorkHealthApi { - hwlog.RunLog.Info("linkdown event exists in event subscription interface, " + - "dcmi_get_device_network_health api will not be used") + if _, ok := faultDurationMap[eventId].Duration[logicID]; !ok { + return } + + faultQueue := faultDurationMap[eventId].Duration[logicID].FaultEventQueue + sort.Sort(DevFaultInfoBasedTimeAscend(faultQueue)) } -// GetCurrentDeviceNetWorkHealth Query the NPU network status at the current time -func GetCurrentDeviceNetWorkHealth(logicID int32, deviceNetWorkHealth string, getDeviceNetworkStateFail bool) { - // If the NPU network is healthy, the network status is regarded as linkup - // If the NPU network is unhealthy, the network status is regarded as linkdown - if getDeviceNetworkStateFail && len(timeoutFaultInfoMap[logicID]) > 0 { - hwlog.RunLog.Debugf("do not need to add device fault info, because of failing to getDeviceNetworkState") +func cleanFaultQueue(logicID int32, eventId string) { + if _, ok := faultDurationMap[eventId]; !ok { return } - var assertion int8 - if deviceNetWorkHealth == v1beta1.Unhealthy { - assertion = common.FaultOccur - } else { - assertion = common.FaultRecover + if _, ok := faultDurationMap[eventId].Duration[logicID]; !ok { + return } - devFaultInfo := common.DevFaultInfo{ - EventID: LinkDownFaultCode, - LogicID: logicID, - Assertion: assertion, - AlarmRaisedTime: time.Now().UnixMilli(), - } - timeoutFaultInfoMap[logicID] = append(timeoutFaultInfoMap[logicID], devFaultInfo) + faultDurationData := faultDurationMap[eventId].Duration[logicID] + mergeContinuousElementBasedAssertion(&faultDurationData.FaultEventQueue) + clearFirstEventBasedOnFaultStatus(&faultDurationData) + faultDurationMap[eventId].Duration[logicID] = faultDurationData + hwlog.RunLog.Debugf("NPU logic id: %d, %s fault timeout status: %v, fault queue after sort and merge: %v", + logicID, eventId, faultDurationMap[eventId].Duration[logicID].TimeoutStatus, + faultDurationMap[eventId].Duration[logicID].FaultEventQueue) } // mergeContinuousElementBasedAssertion merge continuous element based on assertion func mergeContinuousElementBasedAssertion(devFaultInfo *[]common.DevFaultInfo) { + if devFaultInfo == nil || len(*devFaultInfo) == 0 { + return + } + + previousEvent := (*devFaultInfo)[0] + newDevFaultInfo := []common.DevFaultInfo{previousEvent} for i := 1; i < len(*devFaultInfo); i++ { currentEvent := (*devFaultInfo)[i] - previousEvent := (*devFaultInfo)[i-1] - if currentEvent.Assertion == previousEvent.Assertion { - *devFaultInfo = append((*devFaultInfo)[:i], (*devFaultInfo)[i+1:]...) - i-- + continue } + previousEvent = currentEvent + newDevFaultInfo = append(newDevFaultInfo, currentEvent) } + *devFaultInfo = newDevFaultInfo } -// SortMergeFaultQueue sort fault queue based on alarmRaisedTime and merge continuous element based on assertion -func SortMergeFaultQueue(device *NpuDevice) { - if device == nil { - hwlog.RunLog.Error("param device is nil in SortMergeFaultQueue") - return +func clearFirstEventBasedOnFaultStatus(faultDurationData *FaultDurationData) { + // If the first fault event assertion is fault recover in fault queue when the fault status is healthy, + // clear the first fault event + if !faultDurationData.TimeoutStatus && len(faultDurationData.FaultEventQueue) > 0 && + faultDurationData.FaultEventQueue[0].Assertion == common.FaultRecover { + faultDurationData.FaultEventQueue = faultDurationData.FaultEventQueue[1:] } - faultInfos := timeoutFaultInfoMap[device.LogicID] - - sort.Sort(DevFaultInfoBasedTimeAscend(faultInfos)) - mergeContinuousElementBasedAssertion(&faultInfos) - timeoutFaultInfoMap[device.LogicID] = faultInfos - // If the first element is linkup in fault queue when the NPU network is healthy, clear the first element - if device.NetworkRealHealth == v1beta1.Healthy && len(timeoutFaultInfoMap[device.LogicID]) > 0 && - timeoutFaultInfoMap[device.LogicID][0].Assertion == common.FaultRecover { - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][1:] + // If the first fault event assertion is fault occur in fault queue when the fault status is unhealthy, + // clear the first fault event + if faultDurationData.TimeoutStatus && len(faultDurationData.FaultEventQueue) > 0 && + faultDurationData.FaultEventQueue[0].Assertion == common.FaultOccur { + faultDurationData.FaultEventQueue = faultDurationData.FaultEventQueue[1:] } - - // If the first element is linkdown in fault queue when the NPU network is unhealthy, clear the first element - if device.NetworkRealHealth == v1beta1.Unhealthy && len(timeoutFaultInfoMap[device.LogicID]) > 0 && - timeoutFaultInfoMap[device.LogicID][0].Assertion == common.FaultOccur { - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][1:] - } - - hwlog.RunLog.Debugf("NPU logic id: %v, network health status: %v, fault queue after sort and merge: %v", - device.LogicID, device.NetworkHealth, timeoutFaultInfoMap[device.LogicID]) } -func checkLinkdownTimeoutWhenNetworkHealth(device *NpuDevice) bool { - faultQueueLen := len(timeoutFaultInfoMap[device.LogicID]) - if faultQueueLen == 0 { - hwlog.RunLog.Debugf("NPU logic id: %v, fault queue is empty, "+ - "no need to check whether NPU linkdown timeout when NPU network is healthy", device.LogicID) - return true +func handleFaultQueue(logicID int32, eventId string) { + if _, ok := faultDurationMap[eventId]; !ok { + return } - var i int - for i = 0; i < faultQueueLen/halfDivisor; i++ { - if timeoutFaultInfoMap[device.LogicID][i*halfDivisor+1]. - AlarmRaisedTime-timeoutFaultInfoMap[device.LogicID][i*halfDivisor].AlarmRaisedTime <= - linkDownTimeoutCustomization*SecondMagnification { - continue - } - device.NetworkRealHealth = v1beta1.Unhealthy - hwlog.RunLog.Debugf("in linkdown timeout checking, %v(linkup) - %v(linkdown) > %v, "+ - "NPU %v network health set %v, fault queue: %v", timeoutFaultInfoMap[device.LogicID][i*halfDivisor+1], - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkDownTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i+1:] - return false + if _, ok := faultDurationMap[eventId].Duration[logicID]; !ok { + return } - if i*halfDivisor+1 == faultQueueLen { - currentHostTime := time.Now().UnixMilli() - if currentHostTime-timeoutFaultInfoMap[device.LogicID][i*halfDivisor].AlarmRaisedTime <= - linkDownTimeoutCustomization*SecondMagnification { - hwlog.RunLog.Debugf("in linkdown timeout checking, %v(current host time) - "+ - "%v(linkdown) <= %v, NPU %v network health set %v, fault queue: %v", currentHostTime, - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkDownTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i:] - } else { - device.NetworkRealHealth = v1beta1.Unhealthy - hwlog.RunLog.Debugf("in linkdown timeout checking, %v(current host time) - "+ - "%v(linkdown) > %v, NPU %v network health set %v, fault queue: %v", currentHostTime, - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkDownTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i+1:] + faultDurationData := faultDurationMap[eventId].Duration[logicID] + if len(faultDurationData.FaultEventQueue) == 0 { + hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue is empty, no need to handle fault queue", + logicID, eventId) + return + } + + initTimeoutStatus := faultDurationData.TimeoutStatus + exitTag := false + for !exitTag { + faultDurationData = faultDurationMap[eventId].Duration[logicID] + exitTag = timeoutOrRecoveryAlgorithm(logicID, eventId, !faultDurationData.TimeoutStatus) + } + faultDurationData = faultDurationMap[eventId].Duration[logicID] + hwlog.RunLog.Debugf("NPU logic id: %v, after timeout or recovery algorithm handling, %v fault timeout "+ + "status is %v, fault duration time is %.2f seconds, fault recover duration time is %.2f seconds, "+ + "fault queue is %v", logicID, eventId, faultDurationData.TimeoutStatus, + float64(faultDurationData.FaultDurationTime)/1000.0, float64(faultDurationData.FaultRecoverDurationTime)/1000.0, + faultDurationData.FaultEventQueue) + + if initTimeoutStatus == false && faultDurationData.TimeoutStatus == true { + num, err := strconv.ParseInt(eventId, 16, 0) + if err != nil { + hwlog.RunLog.Errorf(parseHexFailedMsg, eventId) + return } + insertFaultFrequency(logicID, num) } - if halfDivisor*i == faultQueueLen { - hwlog.RunLog.Debugf("in linkdown timeout checking, %v(linkup) - %v(linkdown) <= %v, NPU %v "+ - "network health set %v, fault queue: %v", timeoutFaultInfoMap[device.LogicID][i*halfDivisor-1], - timeoutFaultInfoMap[device.LogicID][i*halfDivisor-halfDivisor], linkDownTimeoutCustomization* - SecondMagnification, device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i:] + + var duration int64 + if faultDurationData.TimeoutStatus { + duration = faultDurationData.FaultDurationTime + } else { + duration = faultDurationData.FaultRecoverDurationTime + } + if initTimeoutStatus != faultDurationData.TimeoutStatus { + hwlog.RunLog.Infof("NPU logic id: %v, after timeout or recovery algorithm handling, %v fault timeout "+ + "status change, now fault timeout status set %v, duration time is %.2f seconds", + logicID, eventId, faultDurationData.TimeoutStatus, float64(duration)/1000.0) } - return true } -func checkLinkupRecoverWhenNetworkUnhealth(device *NpuDevice) bool { - faultQueueLen := len(timeoutFaultInfoMap[device.LogicID]) +func timeoutOrRecoveryAlgorithm(logicID int32, eventId string, toStatus bool) bool { + process := getProcessInFaultDuration(toStatus) + faultQueueLen := len(faultDurationMap[eventId].Duration[logicID].FaultEventQueue) if faultQueueLen == 0 { - hwlog.RunLog.Debugf("NPU logic id: %v, fault queue is empty, "+ - "no need to check whether NPU linkup recover when NPU network is unhealthy", device.LogicID) + hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue is empty, no need to do %v judgment", logicID, + eventId, process) return true } var i int + var duration int64 + timeoutThreshold := getTimeoutThreshold(eventId, toStatus) + faultTimeoutMsg := "NPU logic id: %v, in %v judgment, %v duration is %.2f seconds > %v seconds, %v fault " + + "timeout status set %v" + faultNotTimeoutMsg := "NPU logic id: %v, in %v judgment, %v duration is %.2f seconds <= %v seconds, %v " + + "fault timeout status %v doesn't need to change, continue to perform %v judgment" for i = 0; i < faultQueueLen/halfDivisor; i++ { - if timeoutFaultInfoMap[device.LogicID][i*halfDivisor+1].AlarmRaisedTime- - timeoutFaultInfoMap[device.LogicID][i*halfDivisor].AlarmRaisedTime <= - linkUpTimeoutCustomization*SecondMagnification { + faultDurationData := faultDurationMap[eventId].Duration[logicID] + duration = faultDurationData.FaultEventQueue[i*halfDivisor+1].AlarmRaisedTime - + faultDurationData.FaultEventQueue[i*halfDivisor].AlarmRaisedTime + if duration <= timeoutThreshold*SecondMagnification { continue } - device.NetworkRealHealth = v1beta1.Healthy - hwlog.RunLog.Debugf("in linkup recover checking, %v(linkdown) - %v(linkup) > %v,"+ - " NPU %v network health set %v, fault queue: %v", timeoutFaultInfoMap[device.LogicID][i*halfDivisor+1], - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkUpTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i+1:] - return false + hwlog.RunLog.Debugf(faultTimeoutMsg, logicID, process, process, float64(duration)/1000.0, timeoutThreshold, + eventId, toStatus) + return handleTimeoutCondition(handleDurationInputPara{logicID, eventId, i, + toStatus, duration}) } if i*halfDivisor+1 == faultQueueLen { + faultDurationData := faultDurationMap[eventId].Duration[logicID] currentHostTime := time.Now().UnixMilli() - if currentHostTime-timeoutFaultInfoMap[device.LogicID][i*halfDivisor].AlarmRaisedTime <= - linkUpTimeoutCustomization*SecondMagnification { - hwlog.RunLog.Debugf("in linkup recover checking, %v(current host time) - %v(linkup) <= %v, NPU %v"+ - " network health set %v, fault queue: %v", currentHostTime, - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkUpTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i:] - } else { - device.NetworkRealHealth = v1beta1.Healthy - hwlog.RunLog.Debugf("in linkup recover checking, %v(current host time) - %v(linkup) > %v, NPU %v "+ - "network health set %v, fault queue: %v", currentHostTime, - timeoutFaultInfoMap[device.LogicID][i*halfDivisor], linkUpTimeoutCustomization*SecondMagnification, - device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i+1:] + duration = currentHostTime - faultDurationData.FaultEventQueue[i*halfDivisor].AlarmRaisedTime + if duration <= timeoutThreshold*SecondMagnification { + hwlog.RunLog.Debugf(faultNotTimeoutMsg, logicID, process, process, float64(duration)/1000.0, + timeoutThreshold, eventId, faultDurationData.TimeoutStatus, process) + return handleNotTimeoutCondition(handleDurationInputPara{logicID, eventId, i, + toStatus, duration}) } + hwlog.RunLog.Debugf(faultTimeoutMsg, logicID, process, process, float64(duration)/1000.0, timeoutThreshold, + eventId, toStatus) + return handleTimeoutCondition(handleDurationInputPara{logicID, eventId, i, + toStatus, duration}) } if halfDivisor*i == faultQueueLen { - hwlog.RunLog.Debugf("in linkup recover checking, %v(linkdown) - %v(linkup) <= %v, "+ - "NPU %v network health set %v, fault queue: %v", timeoutFaultInfoMap[device.LogicID][i*halfDivisor-1], - timeoutFaultInfoMap[device.LogicID][i*halfDivisor-halfDivisor], linkUpTimeoutCustomization* - SecondMagnification, device.LogicID, device.NetworkRealHealth, timeoutFaultInfoMap[device.LogicID]) - timeoutFaultInfoMap[device.LogicID] = timeoutFaultInfoMap[device.LogicID][halfDivisor*i:] + hwlog.RunLog.Debugf(faultNotTimeoutMsg, logicID, process, process, float64(duration)/1000.0, timeoutThreshold, + eventId, faultDurationMap[eventId].Duration[logicID].TimeoutStatus, process) + return handleNotTimeoutCondition(handleDurationInputPara{logicID, eventId, i, + toStatus, duration}) } return true } -// LinkDownTimeoutCheck check whether the NPU linkdown timeout happened and NPU network recovered -func LinkDownTimeoutCheck(device *NpuDevice) { - if device == nil { - hwlog.RunLog.Error("param device is nil in LinkDownTimeoutCheck") - return +func getProcessInFaultDuration(toStatus bool) string { + if toStatus { + return TimeoutProcess } - // check whether the NPU linkdown timeout happened based on the fault queue - // check whether the NPU network needs to be restored based on the fault queue - timeoutFaultInfoMapLen := len(timeoutFaultInfoMap[device.LogicID]) + return TimeoutRecoverProcess +} - if timeoutFaultInfoMapLen == 0 && device.NetworkHealth == device.NetworkRealHealth { - hwlog.RunLog.Debugf("NPU logic id: %v, fault queue is empty and NPU network health status not change, "+ - "no need to check whether NPU linkdown timeout, or whether need to recover NPU network health", device.LogicID) - return +func getTimeoutThreshold(eventId string, toStatus bool) int64 { + if _, ok := faultDurationMap[eventId]; !ok { + return MinFaultTimeout } - exitTag := false - - for !exitTag { - if device.NetworkRealHealth == v1beta1.Healthy { - exitTag = checkLinkdownTimeoutWhenNetworkHealth(device) - } else { - exitTag = checkLinkupRecoverWhenNetworkUnhealth(device) - } - } - if device.NetworkRealHealth == v1beta1.Unhealthy && device.NetworkHealth == v1beta1.Healthy { - hwlog.RunLog.Debugf("insert network fault into FaultFrequency, logic id: %d", device.LogicID) - insertFaultFrequency(device.LogicID, LinkDownFaultCode) + if toStatus { + return faultDurationMap[eventId].FaultDuration.FaultTimeout } + return faultDurationMap[eventId].FaultDuration.RecoverTimeout +} - hwlog.RunLog.Debugf("NPU logic id: %v, network health status: %v, fault queue after linkDown timeout "+ - "check and recover: %v", device.LogicID, device.NetworkHealth, timeoutFaultInfoMap[device.LogicID]) +func handleTimeoutCondition(inputPara handleDurationInputPara) bool { + faultDurationData := faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] + faultDurationData.TimeoutStatus = inputPara.toStatus + faultQueueMsg := "NPU logic id: %v, %v fault queue: %v" + if inputPara.toStatus { + faultDurationData.FaultDurationTime = inputPara.duration + faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] = faultDurationData + hwlog.RunLog.Debugf(faultQueueMsg, inputPara.logicID, inputPara.eventId, faultDurationData.FaultEventQueue) + return true + } + faultDurationData.FaultRecoverDurationTime = inputPara.duration + faultDurationData.FaultEventQueue = faultDurationData.FaultEventQueue[halfDivisor*inputPara.index+1:] + faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] = faultDurationData + hwlog.RunLog.Debugf(faultQueueMsg, inputPara.logicID, inputPara.eventId, faultDurationData.FaultEventQueue) + return false +} - if device.NetworkHealth != device.NetworkRealHealth { - hwlog.RunLog.Infof("NPU logic id: %v, after handling, network health status change, now network health set %v", - device.LogicID, device.NetworkRealHealth) +func handleNotTimeoutCondition(inputPara handleDurationInputPara) bool { + faultDurationData := faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] + if inputPara.toStatus { + faultDurationData.FaultDurationTime = inputPara.duration + } else { + faultDurationData.FaultRecoverDurationTime = inputPara.duration } - device.NetworkHealth = device.NetworkRealHealth + faultDurationData.FaultEventQueue = faultDurationData.FaultEventQueue[halfDivisor*inputPara.index:] + faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] = faultDurationData + hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue: %v", inputPara.logicID, inputPara.eventId, + faultDurationData.FaultEventQueue) + return true } // GetFaultAssertionName get assertion name of fault code diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index 478df136385cfc27c86516722de38beafc59bf72..6fd7b4cac064e2941616ef143117f44f9168d2d2 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -28,7 +28,6 @@ import ( "huawei.com/npu-exporter/v6/common-utils/utils" "huawei.com/npu-exporter/v6/devmanager/common" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) // TestLoadFaultCodeFromFile for test LoadFaultCodeFromFile @@ -121,9 +120,13 @@ func TestSetNewFaultAndCacheOnceRecoverFault(t *testing.T) { {Assertion: common.FaultRecover}, {Assertion: common.FaultRecover, EventID: 1}, {Assertion: common.FaultOnce, EventID: 0}, + {Assertion: common.FaultOccur, EventID: LinkDownFaultCode}, + {Assertion: common.FaultRecover, EventID: LinkDownFaultCode}, } device := &NpuDevice{FaultCodes: []int64{1}} expectedFaultCodes, expectedFaultMapLen := []int64{0}, 2 + NetworkFaultCodes = sets.NewInt64() + NetworkFaultCodes.Insert(LinkDownFaultCode) SetNewFaultAndCacheOnceRecoverFault(logicID, faultInfos, device) convey.So(device.FaultCodes, convey.ShouldResemble, expectedFaultCodes) convey.So(len(recoverFaultMap[logicID]), convey.ShouldEqual, expectedFaultMapLen) @@ -131,21 +134,53 @@ func TestSetNewFaultAndCacheOnceRecoverFault(t *testing.T) { }) } +// TestSetNetworkNewFaultAndCacheOnceRecoverFault for test SetNetworkNewFaultAndCacheOnceRecoverFault +func TestSetNetworkNewFaultAndCacheOnceRecoverFault(t *testing.T) { + convey.Convey("test SetNetworkNewFaultAndCacheOnceRecoverFault", t, func() { + convey.Convey("SetNetworkNewFaultAndCacheOnceRecoverFault success", func() { + recoverNetworkFaultMap = make(map[int32][]int64, GeneralMapSize) + logicID := int32(0) + faultInfos := []common.DevFaultInfo{ + {Assertion: common.FaultRecover}, + {Assertion: common.FaultRecover, EventID: 1}, + {Assertion: common.FaultOnce, EventID: 0}, + {Assertion: common.FaultOccur, EventID: LinkDownFaultCode}, + {Assertion: common.FaultRecover, EventID: LinkDownFaultCode}, + {Assertion: common.FaultOnce, EventID: LinkDownFaultCode}, + } + device := &NpuDevice{NetworkFaultCodes: []int64{LinkDownFaultCode}} + expectedNetworkFaultCodes := []int64{LinkDownFaultCode, LinkDownFaultCode} + expectedRecoverNetworkFaultMapLen := 1 + NetworkFaultCodes = sets.NewInt64() + NetworkFaultCodes.Insert(LinkDownFaultCode) + SetNetworkNewFaultAndCacheOnceRecoverFault(logicID, faultInfos, device) + convey.So(device.NetworkFaultCodes, convey.ShouldResemble, expectedNetworkFaultCodes) + convey.So(len(recoverNetworkFaultMap[logicID]), convey.ShouldEqual, expectedRecoverNetworkFaultMapLen) + }) + }) +} + // TestDelOnceRecoverFault for test DelOnceRecoverFault func TestDelOnceRecoverFault(t *testing.T) { convey.Convey("test DelOnceRecoverFault", t, func() { convey.Convey("DelOnceRecoverFault success", func() { faultCodes := []int64{1} - device := &NpuDevice{FaultCodes: faultCodes} + networkFaultCodes := []int64{LinkDownFaultCode} + device := &NpuDevice{LogicID: 0, FaultCodes: faultCodes, NetworkFaultCodes: networkFaultCodes} recoverFaultMap = map[int32][]int64{ 0: faultCodes, } + recoverNetworkFaultMap = map[int32][]int64{ + 0: networkFaultCodes, + } groupDevice := map[string][]*NpuDevice{ "test": {device}, } DelOnceRecoverFault(groupDevice) convey.So(len(device.FaultCodes), convey.ShouldEqual, 0) convey.So(len(recoverFaultMap), convey.ShouldEqual, 0) + convey.So(len(device.NetworkFaultCodes), convey.ShouldEqual, 0) + convey.So(len(recoverNetworkFaultMap), convey.ShouldEqual, 0) }) }) } @@ -288,19 +323,6 @@ func TestQueryManuallyFaultInfoByLogicID(t *testing.T) { }) } -// TestGetLinkdownLinkupFaultEvents for test GetLinkdownLinkupFaultEvents -func TestGetLinkdownLinkupFaultEvents(t *testing.T) { - convey.Convey("test GetLinkdownLinkupFaultEvents success", t, func() { - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - UseGetDeviceNetWorkHealthApi = true - logicID := int32(0) - faultInfos := []common.DevFaultInfo{{EventID: LinkDownFaultCode}} - GetLinkdownLinkupFaultEvents(logicID, faultInfos) - convey.So(len(timeoutFaultInfoMap), convey.ShouldEqual, len(faultInfos)) - convey.So(UseGetDeviceNetWorkHealthApi, convey.ShouldEqual, false) - }) -} - // TestSetManuallyFaultNPUHandled for test SetManuallyFaultNPUHandled func TestSetManuallyFaultNPUHandled(t *testing.T) { convey.Convey("test SetManuallyFaultNPUHandled success", t, func() { @@ -311,20 +333,76 @@ func TestSetManuallyFaultNPUHandled(t *testing.T) { }) } -// TestGetCurrentDeviceNetWorkHealth for test GetCurrentDeviceNetWorkHealth -func TestGetCurrentDeviceNetWorkHealth(t *testing.T) { - convey.Convey("test GetCurrentDeviceNetWorkHealth success", t, func() { +// TestCollectEachFaultEvent for test collectEachFaultEvent +func TestCollectEachFaultEvent(t *testing.T) { + convey.Convey("test collectEachFaultEvent success", t, func() { logicID := int32(0) - convey.Convey("test network status Unhealthy", func() { - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - GetCurrentDeviceNetWorkHealth(logicID, v1beta1.Unhealthy, false) - convey.So(timeoutFaultInfoMap[logicID][0].Assertion, convey.ShouldEqual, common.FaultOccur) - }) - convey.Convey("test network status Healthy", func() { - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - GetCurrentDeviceNetWorkHealth(logicID, v1beta1.Healthy, false) - convey.So(timeoutFaultInfoMap[logicID][0].Assertion, convey.ShouldEqual, common.FaultRecover) - }) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, + } + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur}, + {EventID: CardDropFaultCode, Assertion: common.FaultOccur}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover}, + } + collectEachFaultEvent(logicID, faultInfos) + convey.So(len(faultDurationMap), convey.ShouldEqual, 1) + convey.So(len(faultDurationMap[linkDownFaultCodeStr].Duration[logicID].FaultEventQueue), + convey.ShouldEqual, 2) + }) +} + +// TestSortFaultEventsInAscendingOrder for test sortFaultEventsInAscendingOrder +func TestSortFaultEventsInAscendingOrder(t *testing.T) { + convey.Convey("test sortFaultEventsInAscendingOrder success", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + CardDropFaultCodeStr := strings.ToLower(strconv.FormatInt(CardDropFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, + CardDropFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 120, + RecoverTimeout: 0, + FaultHandling: SeparateNPU, + }, + }, + } + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 4}, + {EventID: CardDropFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 3}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 1}, + {EventID: CardDropFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 2}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 9}, + } + + linkDownFaultExpectVal := []common.DevFaultInfo{{EventID: LinkDownFaultCode, Assertion: common.FaultRecover, + AlarmRaisedTime: 1}, {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 4}, { + EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 9}} + + cardDropFaultExpectVal := []common.DevFaultInfo{{EventID: CardDropFaultCode, Assertion: common.FaultOccur, + AlarmRaisedTime: 2}, {EventID: CardDropFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 3}} + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + sortFaultEventsInAscendingOrder(logicID, CardDropFaultCodeStr) + convey.So(faultDurationMap[linkDownFaultCodeStr].Duration[logicID].FaultEventQueue, convey.ShouldResemble, + linkDownFaultExpectVal) + convey.So(faultDurationMap[CardDropFaultCodeStr].Duration[logicID].FaultEventQueue, convey.ShouldResemble, + cardDropFaultExpectVal) }) } @@ -347,255 +425,354 @@ func TestMergeContinuousElementBasedAssertion(t *testing.T) { convey.Convey("test merge mix fault continuous assertion success", t, func() { devFaultInfo := []common.DevFaultInfo{{Assertion: common.FaultRecover}, {Assertion: common.FaultRecover}, - {Assertion: common.FaultOccur}} + {Assertion: common.FaultOccur}, {Assertion: common.FaultOccur}} expectVal := []common.DevFaultInfo{{Assertion: common.FaultRecover}, {Assertion: common.FaultOccur}} mergeContinuousElementBasedAssertion(&devFaultInfo) convey.So(devFaultInfo, convey.ShouldResemble, expectVal) }) } -// TestSortMergeFaultQueueWhenNetWorkHealthy for test TestSortMergeFaultQueue -func TestSortMergeFaultQueueWhenNetWorkHealthy(t *testing.T) { - convey.Convey("test sort merge fault queue success when network healthy", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Healthy, - } - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ +// TestClearFirstEventBasedOnFaultStatus for test clearFirstEventBasedOnFaultStatus +func TestClearFirstEventBasedOnFaultStatus(t *testing.T) { + convey.Convey("test clearFirstEventBasedOnFaultStatus timeout success", t, func() { + faultDurationData := FaultDurationData{ + TimeoutStatus: false, + FaultEventQueue: []common.DevFaultInfo{{ + EventID: LinkDownFaultCode, + Assertion: common.FaultRecover, + AlarmRaisedTime: 2}, { + EventID: LinkDownFaultCode, + Assertion: common.FaultOccur, + AlarmRaisedTime: 4, + }}} + expectVal := []common.DevFaultInfo{{ EventID: LinkDownFaultCode, - LogicID: 0, Assertion: common.FaultOccur, - AlarmRaisedTime: 165, - }, { + AlarmRaisedTime: 4}} + clearFirstEventBasedOnFaultStatus(&faultDurationData) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + }) + + convey.Convey("test clearFirstEventBasedOnFaultStatus recover success", t, func() { + faultDurationData := FaultDurationData{ + TimeoutStatus: true, + FaultEventQueue: []common.DevFaultInfo{{ + EventID: LinkDownFaultCode, + Assertion: common.FaultOccur, + AlarmRaisedTime: 2}, { + EventID: LinkDownFaultCode, + Assertion: common.FaultRecover, + AlarmRaisedTime: 4, + }}} + expectVal := []common.DevFaultInfo{{ EventID: LinkDownFaultCode, - LogicID: 0, Assertion: common.FaultRecover, - AlarmRaisedTime: 100, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 150, - }} - expectVal := map[int32][]common.DevFaultInfo{0: {{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 150, - }}} - SortMergeFaultQueue(&device) - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + AlarmRaisedTime: 4}} + clearFirstEventBasedOnFaultStatus(&faultDurationData) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) }) } -// TestSortMergeFaultQueueWhenNetWorkUnhealthy for test TestSortMergeFaultQueue -func TestSortMergeFaultQueueWhenNetWorkUnhealthy(t *testing.T) { - convey.Convey("test sort merge fault queue success when network unhealthy", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Unhealthy, +// TestCleanFaultQueue for test cleanFaultQueue when fault time status is false +func TestCleanFaultQueueWhenFaultTimeStatusFalse(t *testing.T) { + convey.Convey("test CleanFaultQueue when fault time status is false", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + CardDropFaultCodeStr := strings.ToLower(strconv.FormatInt(CardDropFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {TimeoutStatus: false}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, + CardDropFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 120, + RecoverTimeout: 0, + FaultHandling: SeparateNPU, + }, + }, + } + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 165}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 100}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 150}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 150}, } - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ + expectVal := []common.DevFaultInfo{{ EventID: LinkDownFaultCode, - LogicID: 0, Assertion: common.FaultOccur, - AlarmRaisedTime: 165, - }, { + AlarmRaisedTime: 150}} + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + convey.So(faultDurationMap[linkDownFaultCodeStr].Duration[logicID].FaultEventQueue, + convey.ShouldResemble, expectVal) + }) +} + +// TestCleanFaultQueue for test cleanFaultQueue when fault time status is true +func TestCleanFaultQueueWhenFaultTimeStatusTrue(t *testing.T) { + convey.Convey("test CleanFaultQueue when fault time status is true", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {TimeoutStatus: true, + FaultEventQueue: []common.DevFaultInfo{}}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, + } + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 165}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 100}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 150}, + } + expectVal := []common.DevFaultInfo{{ EventID: LinkDownFaultCode, - LogicID: 0, Assertion: common.FaultRecover, - AlarmRaisedTime: 100, - }, { + AlarmRaisedTime: 100}, { EventID: LinkDownFaultCode, - LogicID: 0, Assertion: common.FaultOccur, AlarmRaisedTime: 150, }} - expectVal := map[int32][]common.DevFaultInfo{0: {{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 100, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 150, - }}} - SortMergeFaultQueue(&device) - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + convey.So(faultDurationMap[linkDownFaultCodeStr].Duration[logicID].FaultEventQueue, + convey.ShouldResemble, expectVal) }) } -// TestLinkDownTimeoutCheckCase01 for test TestLinkDownTimeoutCheck -func TestLinkDownTimeoutCheckCase01(t *testing.T) { - convey.Convey("test link down timeout check case 01", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Healthy, - NetworkHealth: v1beta1.Healthy, +// TestHandleFaultQueueCase01 for test handleFaultQueue case 01 +func TestHandleFaultQueueCase01(t *testing.T) { + convey.Convey("test handleFaultQueue case 01", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, } - linkDownTimeoutCustomization = 30 - linkUpTimeoutCustomization = 60 - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 50, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 81, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 82, - }} - LinkDownTimeoutCheck(&device) - convey.So(device.NetworkRealHealth, convey.ShouldEqual, v1beta1.Unhealthy) - convey.So(device.NetworkHealth, convey.ShouldEqual, v1beta1.Unhealthy) - expectVal := map[int32][]common.DevFaultInfo{0: {}} - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 81 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 82 * SecondMagnification}, + } + expectVal := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 81 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 82 * SecondMagnification}, + } + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, true) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 31*SecondMagnification) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 0) }) } -// TestLinkDownTimeoutCheckCase02 for test TestLinkDownTimeoutCheck -func TestLinkDownTimeoutCheckCase02(t *testing.T) { - convey.Convey("test link down timeout check case 02", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Healthy, - NetworkHealth: v1beta1.Healthy, +// TestHandleFaultQueueCase02 for test handleFaultQueue case 02 +func TestHandleFaultQueueCase02(t *testing.T) { + convey.Convey("test handleFaultQueue case 02", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {FaultEventQueue: []common.DevFaultInfo{}}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, } - linkDownTimeoutCustomization = 30 - linkUpTimeoutCustomization = 60 - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 50, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 80, - }} - LinkDownTimeoutCheck(&device) - convey.So(device.NetworkRealHealth, convey.ShouldEqual, v1beta1.Healthy) - convey.So(device.NetworkHealth, convey.ShouldEqual, v1beta1.Healthy) - expectVal := map[int32][]common.DevFaultInfo{0: {}} - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 80 * SecondMagnification}, + } + expectVal := make([]common.DevFaultInfo, 0) + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, false) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 30*SecondMagnification) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 0) }) } -// TestLinkDownTimeoutCheckCase03 for test TestLinkDownTimeoutCheck -func TestLinkDownTimeoutCheckCase03(t *testing.T) { - convey.Convey("test link down timeout check case 03", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Healthy, - NetworkHealth: v1beta1.Healthy, +// TestHandleFaultQueueCase03 for test handleFaultQueue case 03 +func TestHandleFaultQueueCase03(t *testing.T) { + convey.Convey("test handleFaultQueue case 03", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, } - linkDownTimeoutCustomization = 30 - linkUpTimeoutCustomization = 60 - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 50, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 80, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 82, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 112, - }} - LinkDownTimeoutCheck(&device) - convey.So(device.NetworkRealHealth, convey.ShouldEqual, v1beta1.Healthy) - convey.So(device.NetworkHealth, convey.ShouldEqual, v1beta1.Healthy) - expectVal := map[int32][]common.DevFaultInfo{0: {}} - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 80 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 82 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 112 * SecondMagnification}, + } + expectVal := make([]common.DevFaultInfo, 0) + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, false) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 30*SecondMagnification) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 0) }) } -// TestLinkDownTimeoutCheckCase04 for test TestLinkDownTimeoutCheck -func TestLinkDownTimeoutCheckCase04(t *testing.T) { - convey.Convey("test link down timeout check case 04", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Unhealthy, - NetworkHealth: v1beta1.Unhealthy, +// TestHandleFaultQueueCase04 for test handleFaultQueue case 04 +func TestHandleFaultQueueCase04(t *testing.T) { + convey.Convey("test handleFaultQueue case 04", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {TimeoutStatus: true}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, } - linkDownTimeoutCustomization = 30 - linkUpTimeoutCustomization = 60 - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 50, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 110, - }} - LinkDownTimeoutCheck(&device) - convey.So(device.NetworkRealHealth, convey.ShouldEqual, v1beta1.Unhealthy) - convey.So(device.NetworkHealth, convey.ShouldEqual, v1beta1.Unhealthy) - expectVal := map[int32][]common.DevFaultInfo{0: {}} - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 110 * SecondMagnification}, + } + expectVal := make([]common.DevFaultInfo, 0) + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, true) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 0) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 60*SecondMagnification) }) } -// TestLinkDownTimeoutCheckCase05 for test TestLinkDownTimeoutCheck -func TestLinkDownTimeoutCheckCase05(t *testing.T) { - convey.Convey("test link down timeout check case 05", t, func() { - device := NpuDevice{ - LogicID: 0, - NetworkRealHealth: v1beta1.Unhealthy, - NetworkHealth: v1beta1.Unhealthy, +// TestHandleFaultQueueCase05 for test handleFaultQueue case 05 +func TestHandleFaultQueueCase05(t *testing.T) { + convey.Convey("test handleFaultQueue case 05", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {TimeoutStatus: true}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, } - linkDownTimeoutCustomization = 30 - linkUpTimeoutCustomization = 60 - timeoutFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize) - timeoutFaultInfoMap[device.LogicID] = []common.DevFaultInfo{{ - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 50, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultOccur, - AlarmRaisedTime: 111, - }, { - EventID: LinkDownFaultCode, - LogicID: 0, - Assertion: common.FaultRecover, - AlarmRaisedTime: 112, - }} - LinkDownTimeoutCheck(&device) - convey.So(device.NetworkRealHealth, convey.ShouldEqual, v1beta1.Healthy) - convey.So(device.NetworkHealth, convey.ShouldEqual, v1beta1.Healthy) - expectVal := map[int32][]common.DevFaultInfo{0: {}} - convey.So(timeoutFaultInfoMap, convey.ShouldResemble, expectVal) + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 111 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 112 * SecondMagnification}, + } + expectVal := make([]common.DevFaultInfo, 0) + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, false) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 1*SecondMagnification) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 61*SecondMagnification) + }) +} + +// TestHandleFaultQueueCase06 for test handleFaultQueue case 06 +func TestHandleFaultQueueCase06(t *testing.T) { + convey.Convey("test handleFaultQueue case 06", t, func() { + logicID := int32(0) + linkDownFaultCodeStr := strings.ToLower(strconv.FormatInt(LinkDownFaultCode, Hex)) + faultDurationMap = map[string]*FaultDurationCache{ + linkDownFaultCodeStr: { + Duration: map[int32]FaultDurationData{logicID: {TimeoutStatus: true, + FaultEventQueue: []common.DevFaultInfo{}}}, + FaultDuration: FaultDuration{ + FaultTimeout: 30, + RecoverTimeout: 60, + FaultHandling: PreSeparateNPU, + }, + }, + } + + faultInfos := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 50 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 111 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 142 * SecondMagnification}, + } + expectVal := []common.DevFaultInfo{ + {EventID: LinkDownFaultCode, Assertion: common.FaultOccur, AlarmRaisedTime: 111 * SecondMagnification}, + {EventID: LinkDownFaultCode, Assertion: common.FaultRecover, AlarmRaisedTime: 142 * SecondMagnification}, + } + + collectEachFaultEvent(logicID, faultInfos) + sortFaultEventsInAscendingOrder(logicID, linkDownFaultCodeStr) + cleanFaultQueue(logicID, linkDownFaultCodeStr) + handleFaultQueue(logicID, linkDownFaultCodeStr) + + faultDurationData := faultDurationMap[linkDownFaultCodeStr].Duration[logicID] + convey.So(faultDurationData.TimeoutStatus, convey.ShouldEqual, true) + convey.So(faultDurationData.FaultEventQueue, convey.ShouldResemble, expectVal) + convey.So(faultDurationData.FaultDurationTime, convey.ShouldEqual, 31*SecondMagnification) + convey.So(faultDurationData.FaultRecoverDurationTime, convey.ShouldEqual, 61*SecondMagnification) }) } @@ -1272,19 +1449,6 @@ func TestGetChangedDevFaultInfo(t *testing.T) { }) } -// TestGetInitLogicIDs for test GetInitLogicIDs -func TestGetInitLogicIDs(t *testing.T) { - convey.Convey("test GetInitLogicIDs success case1", t, func() { - initLogicIDs = []int32{} - convey.So(GetInitLogicIDs(), convey.ShouldEqual, nil) - }) - - convey.Convey("test GetInitLogicIDs success case2", t, func() { - initLogicIDs = []int32{0, 2} - convey.So(GetInitLogicIDs(), convey.ShouldResemble, initLogicIDs) - }) -} - // TestGetTimeoutFaultCodes for test GetTimeoutFaultCodes func TestGetTimeoutFaultCodes(t *testing.T) { convey.Convey("test GetTimeoutFaultCodes success", t, func() { diff --git a/pkg/common/proto.go b/pkg/common/proto.go index d8a061d0c0e9e13895e8481e3b6cf5d07bcebbb3..af85648725b6fe66a67cb53ccbd06c6a9aa403b4 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -66,7 +66,6 @@ type NpuDevice struct { DevType string DeviceName string Health string - NetworkRealHealth string NetworkHealth string CardDrop bool IP string diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 53d3c8e89a5ab2fdfbe1a8002c587e87011443fe..c283e50636628bbab46704cbfb6a40bcb4437771 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -53,8 +53,6 @@ const ( ipv6LinkTypePrefix = "fe80" checkNodeLabelPolling = 60 * 60 - polling = "polling" - subscribe = "subscribe" ) // AscendTools struct definition @@ -107,6 +105,10 @@ type DevManager interface { GetIfCardsInResetting(int32) bool GetResetFailedTimes(int32) int SetResetFailedTimes(int32, int) + HandleDropCardFaultEvents(*common.NpuDevice) + HandleLostChipFaultEvents(*common.NpuDevice, []int32) + HandleLostNetworkFaultEvents(*common.NpuDevice, []int32) + LogFaultModeChange(*common.NpuDevice, []int32, string) } // SetDmgr set devmanager @@ -626,9 +628,11 @@ func setAICoreHealthyIfVNpu(groupDevice map[string][]*common.NpuDevice, aiCoreDe } for _, device := range aiCoreDevs { device.Health = logicDeviceMap[device.LogicID].Health + device.NetworkHealth = logicDeviceMap[device.LogicID].NetworkHealth device.FaultCodes = logicDeviceMap[device.LogicID].FaultCodes device.AlarmRaisedTime = logicDeviceMap[device.LogicID].AlarmRaisedTime - device.NetworkHealth = logicDeviceMap[device.LogicID].NetworkHealth + device.NetworkFaultCodes = logicDeviceMap[device.LogicID].NetworkFaultCodes + device.NetworkAlarmRaisedTime = logicDeviceMap[device.LogicID].NetworkAlarmRaisedTime } } @@ -696,6 +700,15 @@ func (tool *AscendTools) isHealthy(device *common.NpuDevice) string { return v1beta1.Unhealthy } +func (tool *AscendTools) isNetworkHealthy(device *common.NpuDevice) string { + faultType := common.GetNetworkFaultType(device.NetworkFaultCodes, device.LogicID) + if faultType == common.NormalNPU || faultType == common.NotHandleFault { + return v1beta1.Healthy + } + + return v1beta1.Unhealthy +} + func (tool *AscendTools) npuIsUsedNow(deviceName string) bool { podList := tool.client.GetActivePodListCache() for _, pod := range podList { @@ -870,27 +883,27 @@ func (tool *AscendTools) getAiCoreCount(cgoVDevInfo npuCommon.VirtualDevInfo) (i // writeNewFaultCode writes fault code and health to device func (tool *AscendTools) writeNewFaultCode(deviceMap map[string][]*common.NpuDevice, runMode string) { - initLogicIDs := common.GetAndCleanLogicID() devFaultInfoMap := common.GetAndCleanFaultInfo() for _, devices := range deviceMap { for _, device := range devices { - tool.flushFaultCodesWithInit(device, initLogicIDs, devFaultInfoMap) + tool.flushFaultCodesWithInit(device, devFaultInfoMap) + common.CountFaultDuration(device, devFaultInfoMap) device.Health = tool.isHealthy(device) if runMode == common.Ascend910 && tool.deviceUsage == common.Train { - tool.handleDeviceNetworkFault(device, devFaultInfoMap) + device.NetworkHealth = tool.isNetworkHealthy(device) } } } isFirstFlushFault = false } -func (tool *AscendTools) flushFaultCodesWithInit(device *common.NpuDevice, initLogicIDs []int32, +func (tool *AscendTools) flushFaultCodesWithInit(device *common.NpuDevice, devFaultInfoMap map[int32][]npuCommon.DevFaultInfo) { if devFaultInfo, ok := devFaultInfoMap[device.LogicID]; ok { tool.writeFaultToEvent(devFaultInfo) } common.SetNewFaultAndCacheOnceRecoverFault(device.LogicID, devFaultInfoMap[device.LogicID], device) - logFaultModeChange(device, initLogicIDs, subscribe) + common.SetNetworkNewFaultAndCacheOnceRecoverFault(device.LogicID, devFaultInfoMap[device.LogicID], device) } func moreThanFiveMin(device *common.NpuDevice) bool { @@ -900,7 +913,14 @@ func moreThanFiveMin(device *common.NpuDevice) bool { return time.Now().UnixMilli()-device.AlarmRaisedTime > subscribeToPollingTime } -func logFaultModeChange(device *common.NpuDevice, initLogicIDs []int32, newMode string) { +func networkMoreThanFiveMin(device *common.NpuDevice) bool { + if device.NetworkAlarmRaisedTime == 0 { + return false + } + return time.Now().UnixMilli()-device.NetworkAlarmRaisedTime > subscribeToPollingTime +} + +func (tool *AscendTools) LogFaultModeChange(device *common.NpuDevice, initLogicIDs []int32, newMode string) { var oldMode string var ok bool if oldMode, ok = faultMode[device.LogicID]; !ok { @@ -911,10 +931,12 @@ func logFaultModeChange(device *common.NpuDevice, initLogicIDs []int32, newMode return } faultMode[device.LogicID] = newMode - if newMode == polling { + if newMode == common.Polling { var reason string if device.Health == v1beta1.Unhealthy && moreThanFiveMin(device) { reason = "fault raised more than five minutes" + } else if device.NetworkHealth == v1beta1.Unhealthy && networkMoreThanFiveMin(device) { + reason = "network fault raised more than five minutes" } else if common.Int32Tool.Contains(initLogicIDs, device.LogicID) { reason = "device reset" } else if common.SubscribeFailed { @@ -1032,29 +1054,6 @@ func (tool *AscendTools) SetResetFailedTimes(deviceLogicId int32, count int) { tool.resetFailedTimesMap[deviceLogicId] = count } -// handleDeviceNetworkFault handling network fault -func (tool *AscendTools) handleDeviceNetworkFault(device *common.NpuDevice, - devFaultInfoMap map[int32][]npuCommon.DevFaultInfo) { - if isFirstFlushFault { - device.NetworkHealth = v1beta1.Healthy - device.NetworkRealHealth = v1beta1.Healthy - } - - common.GetLinkdownLinkupFaultEvents(device.LogicID, devFaultInfoMap[device.LogicID]) - - if common.UseGetDeviceNetWorkHealthApi { - deviceNetworkHealth, err := tool.getDeviceNetworkState(device.LogicID, device.NetworkHealth) - if err != nil { - hwlog.RunLog.Errorf("failed to getDeviceNetworkState, err: %v", err) - } - common.GetCurrentDeviceNetWorkHealth(device.LogicID, deviceNetworkHealth, err != nil) - } - - common.SortMergeFaultQueue(device) - - common.LinkDownTimeoutCheck(device) -} - func (tool *AscendTools) writeFaultToEvent(devFaultInfo []npuCommon.DevFaultInfo) { for _, faultInfo := range devFaultInfo { if err := tool.doWriteFaultToEvent(faultInfo); err != nil { @@ -1082,13 +1081,16 @@ func (tool *AscendTools) doWriteFaultToEvent(faultInfo npuCommon.DevFaultInfo) e if assertionName == "" { return fmt.Errorf("failed to get name of assertion: %d", faultInfo.Assertion) } - faultLevelName := common.GetFaultTypeByCode([]int64{faultInfo.EventID}) + var faultLevelName string + if !common.NetworkFaultCodes.Has(faultInfo.EventID) { + faultLevelName = common.GetFaultTypeByCode([]int64{faultInfo.EventID}) + } else { + faultLevelName = common.GetNetworkFaultTypeByCode([]int64{faultInfo.EventID}) + } faultInfo.AlarmRaisedTime = time.Now().UnixMilli() - event := &v1.Event{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: common.DeviceInfoCMNameSpace, - Name: fmt.Sprintf("%s.%d%d", podName, faultInfo.AlarmRaisedTime, faultInfo.LogicID), + ObjectMeta: metav1.ObjectMeta{Namespace: common.DeviceInfoCMNameSpace, + Name: fmt.Sprintf("%s.%d%d", podName, faultInfo.AlarmRaisedTime, faultInfo.LogicID), }, Type: v1.EventTypeWarning, Message: fmt.Sprintf("device fault, nodeName:%s, assertion:%s, cardID:%d, deviceID:%d, "+ @@ -1096,16 +1098,12 @@ func (tool *AscendTools) doWriteFaultToEvent(faultInfo npuCommon.DevFaultInfo) e strings.ToUpper(strconv.FormatInt(faultInfo.EventID, common.Hex)), faultLevelName, time.UnixMilli(faultInfo.AlarmRaisedTime).Format(common.TimeFormat)), EventTime: metav1.MicroTime{Time: time.UnixMilli(faultInfo.AlarmRaisedTime)}, - Reason: assertionName, - Action: faultLevelName, - Source: v1.EventSource{Component: common.Component, Host: nodeName}, + Reason: assertionName, Action: faultLevelName, + Source: v1.EventSource{Component: common.Component, Host: nodeName}, InvolvedObject: v1.ObjectReference{ - Kind: common.ResourceKindPod, - Namespace: common.DeviceInfoCMNameSpace, - Name: podName, + Kind: common.ResourceKindPod, Namespace: common.DeviceInfoCMNameSpace, Name: podName, }, - ReportingController: common.Component, - ReportingInstance: podName, + ReportingController: common.Component, ReportingInstance: podName, } if faultInfo.Assertion != npuCommon.FaultOccur { event.Type = v1.EventTypeNormal @@ -1135,3 +1133,111 @@ func (tool *AscendTools) SetServerIndex(serverIndex int32) { func (tool *AscendTools) GetServerIndex() int32 { return tool.serverIndex } + +// HandleDropCardFaultEvents handle drop card fault events that may be lost by the fault subscription interface +func (tool *AscendTools) HandleDropCardFaultEvents(npuDevice *common.NpuDevice) { + if common.SubscribeFailed { + return + } + tool.generateCardDropFaultEvents(npuDevice) +} + +func (tool *AscendTools) generateCardDropFaultEvents(npuDevice *common.NpuDevice) { + if !npuDevice.CardDrop && tool.checkCardDropFault(npuDevice.LogicID) { + faultInfo := npuCommon.DevFaultInfo{ + EventID: common.CardDropFaultCode, + LogicID: npuDevice.LogicID, + Assertion: npuCommon.FaultOccur, + AlarmRaisedTime: time.Now().UnixMilli(), + } + npuDevice.CardDrop = true + hwlog.RunLog.Info("generate card drop occur fault event") + common.SaveDevFaultInfo(faultInfo) + } + + if npuDevice.CardDrop && !tool.checkCardDropFault(npuDevice.LogicID) { + faultInfo := npuCommon.DevFaultInfo{ + EventID: common.CardDropFaultCode, + LogicID: npuDevice.LogicID, + Assertion: npuCommon.FaultRecover, + AlarmRaisedTime: time.Now().UnixMilli(), + } + npuDevice.CardDrop = false + hwlog.RunLog.Info("generate card drop recover fault event") + common.SaveDevFaultInfo(faultInfo) + } +} + +func (tool *AscendTools) checkCardDropFault(logicID int32) bool { + _, err := tool.dmgr.GetDeviceHealth(logicID) + if common.CheckErrorMessage(err, npuCommon.DeviceNotReadyErrCodeStr) { + hwlog.RunLog.Errorf("logic id %d, error message contains %s, device does not ready, "+ + "the card may be dropped", logicID, npuCommon.DeviceNotReadyErrCodeStr) + return true + } + + return false +} + +// HandleLostChipFaultEvents handle chip fault events that may be lost by the fault subscription interface +func (tool *AscendTools) HandleLostChipFaultEvents(device *common.NpuDevice, initLogicIDs []int32) { + needHandleLostChipFaultCondition := isFirstFlushFault || (common.Int32Tool.Contains(initLogicIDs, + device.LogicID)) || common.SubscribeFailed || (device.Health == v1beta1.Unhealthy && moreThanFiveMin(device)) + if !needHandleLostChipFaultCondition { + return + } + tool.generateChipFaultEventsBasedOnFaultCacheChange(device) +} + +func (tool *AscendTools) generateChipFaultEventsBasedOnFaultCacheChange(device *common.NpuDevice) { + _, errCodes, err := tool.dmgr.GetDeviceAllErrorCode(device.LogicID) + if err != nil { + hwlog.RunLog.Errorf("get device fault failed logic: %d, err: %v", device.LogicID, err) + return + } + chipFaultCodes := make([]int64, 0, npuCommon.MaxErrorCodeCount) + for _, faultCode := range errCodes { + if common.NetworkFaultCodes.Has(faultCode) { + continue + } + chipFaultCodes = append(chipFaultCodes, faultCode) + } + + chipFaultEvents := common.GetChangedDevFaultInfo(device, device.FaultCodes, chipFaultCodes) + for _, chipFaultEvent := range chipFaultEvents { + hwlog.RunLog.Info("generate chip fault event based on chip fault cache change") + common.SaveDevFaultInfo(chipFaultEvent) + } +} + +// HandleLostNetworkFaultEvents handle network fault events that may be lost by the fault subscription interface +func (tool *AscendTools) HandleLostNetworkFaultEvents(device *common.NpuDevice, initLogicIDs []int32) { + needHandleLostNetworkFaultCondition := isFirstFlushFault || (common.Int32Tool.Contains(initLogicIDs, + device.LogicID)) || common.SubscribeFailed || (device.NetworkHealth == v1beta1.Unhealthy && + networkMoreThanFiveMin(device)) + if !needHandleLostNetworkFaultCondition { + return + } + tool.generateNetworkFaultEventsBasedOnFaultCacheChange(device) +} + +func (tool *AscendTools) generateNetworkFaultEventsBasedOnFaultCacheChange(device *common.NpuDevice) { + _, errCodes, err := tool.dmgr.GetDeviceAllErrorCode(device.LogicID) + if err != nil { + hwlog.RunLog.Errorf("get device fault failed logic: %d, err: %v", device.LogicID, err) + return + } + networkFaultCodes := make([]int64, 0, npuCommon.MaxErrorCodeCount) + for _, faultCode := range errCodes { + if !common.NetworkFaultCodes.Has(faultCode) { + continue + } + networkFaultCodes = append(networkFaultCodes, faultCode) + } + + networkFaultEvents := common.GetChangedDevFaultInfo(device, device.NetworkFaultCodes, networkFaultCodes) + for _, networkFaultEvent := range networkFaultEvents { + hwlog.RunLog.Info("generate network fault event based on network fault cache change") + common.SaveDevFaultInfo(networkFaultEvent) + } +} diff --git a/pkg/server/manager.go b/pkg/server/manager.go index ea3de8ff9fb3a9b1e8fdfae1dfb8cdbc00759552..061d6505dd5001b726afb290195f0e92cdd1b6a0 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -290,6 +290,8 @@ func (hdm *HwDevManager) updateDeviceHealth(curAllDevs []common.NpuDevice) { curAllDevs[i].NetworkHealth = hdm.allInfo.AllDevs[index].NetworkHealth curAllDevs[i].FaultCodes = hdm.allInfo.AllDevs[index].FaultCodes curAllDevs[i].AlarmRaisedTime = hdm.allInfo.AllDevs[index].AlarmRaisedTime + curAllDevs[i].NetworkFaultCodes = hdm.allInfo.AllDevs[index].NetworkFaultCodes + curAllDevs[i].NetworkAlarmRaisedTime = hdm.allInfo.AllDevs[index].NetworkAlarmRaisedTime } } } @@ -365,7 +367,7 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { continue } // complete the fault codes that cannot be reported by the event subscribe interface - hdm.mendSubscribeFaultEvent() + hdm.mendSubscribeFaultEvents() hdm.notifyToK8s(&initTime) hdm.useVolcanoNotify() @@ -383,16 +385,18 @@ func deepCopyGroupDevice(groupDevice map[string][]*common.NpuDevice) map[string] newNpuDevices := make([]*common.NpuDevice, 0, len(npuDevices)) for _, npuDevice := range npuDevices { newNpuDevice := &common.NpuDevice{ - FaultCodes: npuDevice.FaultCodes, - AlarmRaisedTime: npuDevice.AlarmRaisedTime, - DevType: npuDevice.DevType, - DeviceName: npuDevice.DeviceName, - Health: npuDevice.Health, - NetworkHealth: npuDevice.NetworkHealth, - IP: npuDevice.IP, - LogicID: npuDevice.LogicID, - PhyID: npuDevice.PhyID, - CardID: npuDevice.CardID, + FaultCodes: npuDevice.FaultCodes, + AlarmRaisedTime: npuDevice.AlarmRaisedTime, + NetworkFaultCodes: npuDevice.NetworkFaultCodes, + NetworkAlarmRaisedTime: npuDevice.NetworkAlarmRaisedTime, + DevType: npuDevice.DevType, + DeviceName: npuDevice.DeviceName, + Health: npuDevice.Health, + NetworkHealth: npuDevice.NetworkHealth, + IP: npuDevice.IP, + LogicID: npuDevice.LogicID, + PhyID: npuDevice.PhyID, + CardID: npuDevice.CardID, } newNpuDevices = append(newNpuDevices, newNpuDevice) } @@ -1083,51 +1087,19 @@ func getFaultCodeCMPollInterval(configMap *v1.ConfigMap) int { return interval } -func (hdm *HwDevManager) mendSubscribeFaultEvent() { - hdm.handleDropCardFault() -} - -func (hdm *HwDevManager) handleDropCardFault() { - if common.SubscribeFailed { - return - } - +func (hdm *HwDevManager) mendSubscribeFaultEvents() { + initLogicIDs := common.GetAndCleanLogicID() for _, npuDevices := range hdm.groupDevice { for _, npuDevice := range npuDevices { - hdm.generateCardDropFaultEvent(npuDevice) - } - } -} - -func (hdm *HwDevManager) generateCardDropFaultEvent(npuDevice *common.NpuDevice) { - if !npuDevice.CardDrop && hdm.checkCardDropFault(npuDevice.LogicID) { - faultInfo := npuCommon.DevFaultInfo{ - EventID: common.CardDropFaultCode, - LogicID: npuDevice.LogicID, - Assertion: npuCommon.FaultOccur, - } - npuDevice.CardDrop = true - common.SaveDevFaultInfo(faultInfo) - } + if common.SubscribeFailed { + hdm.manager.LogFaultModeChange(npuDevice, initLogicIDs, common.Polling) + } else { + hdm.manager.LogFaultModeChange(npuDevice, initLogicIDs, common.Subscribe) + } - if npuDevice.CardDrop && !hdm.checkCardDropFault(npuDevice.LogicID) { - faultInfo := npuCommon.DevFaultInfo{ - EventID: common.CardDropFaultCode, - LogicID: npuDevice.LogicID, - Assertion: npuCommon.FaultRecover, + hdm.manager.HandleDropCardFaultEvents(npuDevice) + hdm.manager.HandleLostChipFaultEvents(npuDevice, initLogicIDs) + hdm.manager.HandleLostNetworkFaultEvents(npuDevice, initLogicIDs) } - npuDevice.CardDrop = false - common.SaveDevFaultInfo(faultInfo) } } - -func (hdm *HwDevManager) checkCardDropFault(logicID int32) bool { - _, err := hdm.manager.GetDmgr().GetDeviceHealth(logicID) - if common.CheckErrorMessage(err, npuCommon.DeviceNotReadyErrCodeStr) { - hwlog.RunLog.Errorf("logic id %d, error message contains %s, device does not ready, "+ - "the card may be dropped", logicID, npuCommon.DeviceNotReadyErrCodeStr) - return true - } - - return false -}