diff --git a/pkg/common/constants.go b/pkg/common/constants.go
index 3e8e0570c578f3b1c7b1df9e4b7fe6df8daf3037..9891ff236b62a896089fa1533755771d4159e16b 100644
--- a/pkg/common/constants.go
+++ b/pkg/common/constants.go
@@ -770,3 +770,16 @@ const (
 	// SubHealthyAnnotationKey sub-healthy annotation key on node
 	SubHealthyAnnotationKey = "subHealthy"
 )
+
+const (
+	// HbmDoubleBitFaultCode indicate 0x80E01801
+	HbmDoubleBitFaultCode = 2162169857
+	// AivBusFaultCode indicate 0x80CB8009
+	AivBusFaultCode = 2160820233
+	// AicBusFaultCode indicate 0x80C98009
+	AicBusFaultCode = 2160689161
+	// AssociatedFaultDiagnosisTime is the window, in seconds, used to associate aic/aiv faults with a hbm fault
+	AssociatedFaultDiagnosisTime = 5
+	// TimeMilliseconds indicate how many milliseconds are there in a second
+	TimeMilliseconds = 1000
+)
diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go
index 84a1f7f51c975d38e4ded3d04050330078c22fdb..d0eee21fb5fc7fcba3090e68eafbd873541682e4 100644
--- a/pkg/common/fault_code.go
+++ b/pkg/common/fault_code.go
@@ -131,11 +131,11 @@ var (
 	WaitFaultSelfHealingTime time.Duration = DefaultWaitFaultSelfHealingTime
 	// WaitDeviceResetTime is the time used in waiting device reset
 	WaitDeviceResetTime time.Duration = DefaultWaitDeviceResetTime
-	// faultFrequencyMap is the cache saving the occur frequency of a fault, key is event id
+	// faultFrequencyMap is the cache saving the occurrence frequency of a fault, key is event id
 	faultFrequencyMap = make(map[string]*FaultFrequencyCache, common.MaxErrorCodeCount)
 	// faultFrequencyMapLock is the lock of faultFrequencyMap
 	faultFrequencyMapLock sync.Mutex
-	// faultDurationMap is the cache saving the occur duration of a fault, key is event id
+	// faultDurationMap is the cache saving the occurrence duration of a fault, key is event id
 	faultDurationMap = make(map[string]*FaultDurationCache, common.MaxErrorCodeCount)
 	// faultDurationMapLock is the lock of faultDurationMap
 	faultDurationMapLock sync.Mutex
@@ -143,6 +143,7 @@ var (
 	parseHexFailedMsg = "parse hex int failed and skip it, string: %s"
 	networkFaultConfigureFailedMsg = "%x is a network fault and cannot be configured to %s now, " +
 		"fault handling policy is set to NotHandleFault"
+	hbmTool = NewHbmFaultManager()
 )
 
 // ManuallyFaultInfo save the info of ManuallySeparateNPU
@@ -285,6 +286,74 @@ func (devFault DevFaultInfoBasedTimeAscend) Less(i, j int) bool {
 	return devFault[i].AlarmRaisedTime < devFault[j].AlarmRaisedTime
 }
 
+// HbmFaultManager manage the accompanying faults of aic error and hbm error
+type HbmFaultManager struct {
+	// HbmOccurTimeCache saves the last recorded hbm fault AlarmRaisedTime, key is device logic id
+	HbmOccurTimeCache map[int32]int64
+	// AicFaultEventQue saves the pending aic/aiv fault events sorted by AlarmRaisedTime, key is device logic id
+	AicFaultEventQue map[int32][]common.DevFaultInfo
+}
+
+// NewHbmFaultManager return a hbm fault manager
+func NewHbmFaultManager() *HbmFaultManager {
+	return &HbmFaultManager{
+		HbmOccurTimeCache: make(map[int32]int64, GeneralMapSize),
+		AicFaultEventQue:  make(map[int32][]common.DevFaultInfo, GeneralMapSize),
+	}
+}
+
+// updateHbmOccurTime records the AlarmRaisedTime of a hbm fault for the device of faultInfo
+func (h *HbmFaultManager) updateHbmOccurTime(faultInfo common.DevFaultInfo) {
+	h.HbmOccurTimeCache[faultInfo.LogicID] = faultInfo.AlarmRaisedTime
+	hwlog.RunLog.Debugf("hbm fault occur, device %d update occur time: %d",
+		faultInfo.LogicID, h.HbmOccurTimeCache[faultInfo.LogicID])
+}
+
+// aicFaultEventInQue puts an aic/aiv fault event into the queue of its device and keeps the queue time-ascending
+func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) {
+	h.AicFaultEventQue[faultInfo.LogicID] = append(h.AicFaultEventQue[faultInfo.LogicID], faultInfo)
+	sort.Sort(DevFaultInfoBasedTimeAscend(h.AicFaultEventQue[faultInfo.LogicID]))
+	hwlog.RunLog.Debugf("aic/aiv fault event %d in que, device %d new event que:%#v",
+		faultInfo.EventID, faultInfo.LogicID, h.AicFaultEventQue[faultInfo.LogicID])
+}
+
+// aicFaultEventOutQue takes the aic/aiv fault events of the device out of the queue.
+// Events raised within the association window of the last hbm fault are dropped, events older than the
+// window are returned for reporting, and the remaining events stay in the queue for the next call.
+func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultInfo {
+	var faultInfoList []common.DevFaultInfo
+	faultEventQue, ok := h.AicFaultEventQue[logicId]
+	if !ok {
+		return faultInfoList
+	}
+	if _, ok := h.HbmOccurTimeCache[logicId]; !ok {
+		h.HbmOccurTimeCache[logicId] = 0
+	}
+	var newFaultEventQue []common.DevFaultInfo
+	// the association window below is in milliseconds, so the current time must be in milliseconds as well
+	nowTime := time.Now().UnixNano() / int64(time.Millisecond)
+	for i := 0; i < len(faultEventQue); i++ {
+		// The aic error raised within the association window before or after the hbm error should be deleted,
+		if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) <
+			AssociatedFaultDiagnosisTime*TimeMilliseconds {
+			hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d hbm event time %d",
+				logicId, faultEventQue[i].AlarmRaisedTime, h.HbmOccurTimeCache[logicId])
+			continue
+		}
+		// aic error should be reported if no hbm error occurs within the association window,
+		// and the event in this outbound queue should also be deleted
+		if nowTime-faultEventQue[i].AlarmRaisedTime > AssociatedFaultDiagnosisTime*TimeMilliseconds {
+			hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d now time %d",
+				logicId, faultEventQue[i].AlarmRaisedTime, nowTime)
+			faultInfoList = append(faultInfoList, faultEventQue[i])
+			continue
+		}
+		newFaultEventQue = append(newFaultEventQue, faultEventQue[i])
+	}
+	h.AicFaultEventQue[logicId] = newFaultEventQue
+	return faultInfoList
+}
+
 // LoadFaultCodeFromFile load fault code and fault type from faultCode.json
 func LoadFaultCodeFromFile() error {
 	faultCodeBytes, err := utils.LoadFile(faultCodeFilePath)
@@ -904,6 +973,21 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF
 		hwlog.RunLog.Error("param device is nil in SetNewFaultAndCacheOnceRecoverFault")
 		return
 	}
+
+	var newFaultInfos []common.DevFaultInfo
+	// dealing with Hbm and Aic/Aiv associated faults
+	for i := 0; i < len(faultInfos); i++ {
+		if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover {
+			hbmTool.updateHbmOccurTime(faultInfos[i])
+		}
+		if faultInfos[i].EventID == AicBusFaultCode || faultInfos[i].EventID == AivBusFaultCode {
+			hbmTool.aicFaultEventInQue(faultInfos[i])
+			continue
+		}
+		newFaultInfos = append(newFaultInfos, faultInfos[i])
+	}
+	faultInfos = append(newFaultInfos, hbmTool.aicFaultEventOutQue(logicID)...)
+
 	// it must deal with two 'for', because the fault may recover one moment, in this case,
 	// the recover message and occur message both in faultInfos, this fault cannot be reports outside.
 	for _, faultInfo := range faultInfos {
diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go
index 28e27b39471cd4441cb81c85b82d796208c4e065..28507d399aac42bbbda6e1146f6112bd27d5e831 100644
--- a/pkg/common/fault_code_test.go
+++ b/pkg/common/fault_code_test.go
@@ -1573,3 +1573,49 @@ func TestLoadSwitchFaultCode(t *testing.T) {
 		convey.So(NotHandleFaultCodes[firstFaultIdx] == generalFaultCode, convey.ShouldBeTrue)
 	})
 }
+
+// TestHbmFaultManager Test HbmFaultManager
+func TestHbmFaultManager(t *testing.T) {
+	convey.Convey("test new fault manager", t, func() {
+		hbmFaultManager := NewHbmFaultManager()
+		convey.So(hbmFaultManager, convey.ShouldNotBeNil)
+	})
+	convey.Convey("test update hbm occur time ", t, func() {
+		hbmFaultManager := NewHbmFaultManager()
+		faultInfo := common.DevFaultInfo{
+			LogicID:         1,
+			EventID:         HbmDoubleBitFaultCode,
+			AlarmRaisedTime: 123456789,
+		}
+		hbmFaultManager.updateHbmOccurTime(faultInfo)
+		hbmOccurTime, ok := hbmFaultManager.HbmOccurTimeCache[1]
+		convey.So(ok, convey.ShouldBeTrue)
+		convey.So(hbmOccurTime, convey.ShouldEqual, 123456789)
+	})
+	convey.Convey("test fault event in que ", t, func() {
+		hbmFaultManager := NewHbmFaultManager()
+		faultInfo := common.DevFaultInfo{
+			LogicID:         1,
+			EventID:         AivBusFaultCode,
+			AlarmRaisedTime: 123456789,
+		}
+		hbmFaultManager.aicFaultEventInQue(faultInfo)
+		faultInfoList, ok := hbmFaultManager.AicFaultEventQue[1]
+		convey.So(ok, convey.ShouldBeTrue)
+		convey.So(len(faultInfoList), convey.ShouldEqual, 1)
+		convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 123456789)
+	})
+	convey.Convey("test fault event out que ", t, func() {
+		hbmFaultManager := NewHbmFaultManager()
+		faultInfo := common.DevFaultInfo{
+			LogicID:         1,
+			EventID:         AicBusFaultCode,
+			AlarmRaisedTime: 100000000,
+		}
+		hbmFaultManager.aicFaultEventInQue(faultInfo)
+		faultInfoList := hbmFaultManager.aicFaultEventOutQue(1)
+		convey.So(len(faultInfoList), convey.ShouldEqual, 1)
+		convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 100000000)
+		convey.So(faultInfoList[0].EventID, convey.ShouldEqual, AicBusFaultCode)
+	})
+}
diff --git a/pkg/common/slice_common.go b/pkg/common/slice_common.go
index 607b7a29edeef867c2a0934827e64bf5b9e63878..48bd55c05424da954cc3ea209de3195ed420e0d7 100644
--- a/pkg/common/slice_common.go
+++ b/pkg/common/slice_common.go
@@ -96,6 +96,14 @@ func (i int64Tool) ToHexString(sources []int64) string {
 	return target
 }
 
+// Abs returns the absolute difference between two int64 values
+func (i int64Tool) Abs(var1, var2 int64) int64 {
+	if var1 > var2 {
+		return var1 - var2
+	}
+	return var2 - var1
+}
+
 // Index slice for string search the index with target
 func (s stringTool) Index(sources []string, target string) int {
 	for i, source := range sources {
diff --git a/pkg/common/slice_common_test.go b/pkg/common/slice_common_test.go
index b227c9e9171380c0e6767c2f222706db5ab81c0f..ad1c920331329bf2540db0a1f0ccf66b578e9e09 100644
--- a/pkg/common/slice_common_test.go
+++ b/pkg/common/slice_common_test.go
@@ -127,3 +127,17 @@ func TestInt64ToolIndex(t *testing.T) {
 		convey.So(tool.Index(testVal, unExistVal), convey.ShouldEqual, -1)
 	})
 }
+
+// TestInt64ToolAbs for test int64Tool.Abs
+func TestInt64ToolAbs(t *testing.T) {
+	convey.Convey("test int64Tool.Abs case positive number", t, func() {
+		tool := int64Tool{}
+		testVal1, testVal2 := int64(1), int64(2)
+		convey.So(tool.Abs(testVal1, testVal2), convey.ShouldEqual, 1)
+	})
+	convey.Convey("test int64Tool.Abs case negative number", t, func() {
+		tool := int64Tool{}
+		testVal1, testVal2 := int64(1), int64(-1)
+		convey.So(tool.Abs(testVal1, testVal2), convey.ShouldEqual, 2)
+	})
+}