From 8ed1e15e3b182e13efcdad5a49eacae243f60aac Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Sun, 8 Sep 2024 15:03:08 +0800 Subject: [PATCH 1/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 5 +++ pkg/common/fault_code.go | 70 ++++++++++++++++++++++++++++++++++++-- pkg/common/slice_common.go | 8 +++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 3e8e0570..bf50b4e9 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -770,3 +770,8 @@ const ( // SubHealthyAnnotationKey sub-healthy annotation key on node SubHealthyAnnotationKey = "subHealthy" ) + +const ( + HbmDoubleBitFaultCode = 2146441217 + AicAiVFaultCode = 2146402313 +) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 84a1f7f5..641497e8 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -131,11 +131,11 @@ var ( WaitFaultSelfHealingTime time.Duration = DefaultWaitFaultSelfHealingTime // WaitDeviceResetTime is the time used in waiting device reset WaitDeviceResetTime time.Duration = DefaultWaitDeviceResetTime - // faultFrequencyMap is the cache saving the occur frequency of a fault, key is event id + // faultFrequencyMap is the cache saving the occurrence frequency of a fault, key is event id faultFrequencyMap = make(map[string]*FaultFrequencyCache, common.MaxErrorCodeCount) // faultFrequencyMapLock is the lock of faultFrequencyMap faultFrequencyMapLock sync.Mutex - // faultDurationMap is the cache saving the occur duration of a fault, key is event id + // faultDurationMap is the cache saving the occurrence duration of a fault, key is event id faultDurationMap = 
make(map[string]*FaultDurationCache, common.MaxErrorCodeCount) // faultDurationMapLock is the lock of faultDurationMap faultDurationMapLock sync.Mutex @@ -143,6 +143,7 @@ var ( parseHexFailedMsg = "parse hex int failed and skip it, string: %s" networkFaultConfigureFailedMsg = "%x is a network fault and cannot be configured to %s now, " + "fault handling policy is set to NotHandleFault" + hbmTool = NewHbmFaultManager() ) // ManuallyFaultInfo save the info of ManuallySeparateNPU @@ -285,6 +286,61 @@ func (devFault DevFaultInfoBasedTimeAscend) Less(i, j int) bool { return devFault[i].AlarmRaisedTime < devFault[j].AlarmRaisedTime } +// HbmFaultManager manage the accompanying faults of aic error and hbm error +type HbmFaultManager struct { + HbmOccurTimeCache map[int32]int64 + AicFaultEventQue map[int32][]common.DevFaultInfo +} + +func NewHbmFaultManager() *HbmFaultManager { + return &HbmFaultManager{ + HbmOccurTimeCache: make(map[int32]int64, GeneralMapSize), + AicFaultEventQue: make(map[int32][]common.DevFaultInfo, GeneralMapSize), + } +} + +func (h *HbmFaultManager) updateHbmOccurTime(faultInfo common.DevFaultInfo) { + if _, ok := h.HbmOccurTimeCache[faultInfo.LogicID]; !ok { + h.HbmOccurTimeCache[faultInfo.LogicID] = 0 + } + h.HbmOccurTimeCache[faultInfo.LogicID] = faultInfo.AlarmRaisedTime +} + +func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) { + _, ok := h.AicFaultEventQue[faultInfo.LogicID] + if !ok { + h.AicFaultEventQue[faultInfo.LogicID] = []common.DevFaultInfo{} + } + h.AicFaultEventQue[faultInfo.LogicID] = append(h.AicFaultEventQue[faultInfo.LogicID], faultInfo) + sort.Sort(DevFaultInfoBasedTimeAscend(h.AicFaultEventQue[faultInfo.LogicID])) +} + +func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultInfo { + var faultInfoList []common.DevFaultInfo + faultEventQue, ok := h.AicFaultEventQue[logicId] + if !ok { + return faultInfoList + } + if _, ok := h.HbmOccurTimeCache[logicId]; !ok { + 
h.HbmOccurTimeCache[logicId] = 0 + } + nowTime := time.Now().Unix() + for i := 0; i < len(faultEventQue); i++ { + // aic error should report if hbm error does not occur within ten seconds + if nowTime-faultEventQue[i].AlarmRaisedTime > 10 { + faultInfoList = append(faultInfoList, faultEventQue[i]) + } + // The fault aic error occurring ten seconds before and after the occurrence of hbm error should be deleted, + // and the event in this outbound queue should also be deleted + if Int64Tool.Int64Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 || + nowTime-faultEventQue[i].AlarmRaisedTime > 10 { + faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) + i-- + } + } + return faultInfoList +} + // LoadFaultCodeFromFile load fault code and fault type from faultCode.json func LoadFaultCodeFromFile() error { faultCodeBytes, err := utils.LoadFile(faultCodeFilePath) @@ -904,6 +960,16 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF hwlog.RunLog.Error("param device is nil in SetNewFaultAndCacheOnceRecoverFault") return } + for i := 0; i < len(faultInfos); i++ { + if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover { + hbmTool.updateHbmOccurTime(faultInfos[i]) + } + if faultInfos[i].EventID == AicAiVFaultCode { + hbmTool.aicFaultEventInQue(faultInfos[i]) + faultInfos = append(faultInfos[:i], faultInfos[i+1:]...) + } + } + faultInfos = append(faultInfos, hbmTool.aicFaultEventOutQue(logicID)...) // it must deal with two 'for', because the fault may recover one moment, in this case, // the recover message and occur message both in faultInfos, this fault cannot be reports outside. 
for _, faultInfo := range faultInfos { diff --git a/pkg/common/slice_common.go b/pkg/common/slice_common.go index 607b7a29..aee75d80 100644 --- a/pkg/common/slice_common.go +++ b/pkg/common/slice_common.go @@ -96,6 +96,14 @@ func (i int64Tool) ToHexString(sources []int64) string { return target } +// Int64Abs return a absolute value between two int 64 value +func (i int64Tool) Int64Abs(var1, var2 int64) int64 { + if var1 > var2 { + return var1 - var2 + } + return var2 - var1 +} + // Index slice for string search the index with target func (s stringTool) Index(sources []string, target string) int { for i, source := range sources { -- Gitee From f855dd25f8663aac08dd464bf5618c724695372e Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Sun, 8 Sep 2024 15:13:20 +0800 Subject: [PATCH 2/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/fault_code.go | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 641497e8..e834ed93 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -292,6 +292,7 @@ type HbmFaultManager struct { AicFaultEventQue map[int32][]common.DevFaultInfo } +// NewHbmFaultManager return a hbm fault manager func NewHbmFaultManager() *HbmFaultManager { return &HbmFaultManager{ HbmOccurTimeCache: make(map[int32]int64, GeneralMapSize), @@ -304,6 +305,8 @@ func (h *HbmFaultManager) updateHbmOccurTime(faultInfo common.DevFaultInfo) { h.HbmOccurTimeCache[faultInfo.LogicID] = 0 } h.HbmOccurTimeCache[faultInfo.LogicID] = faultInfo.AlarmRaisedTime + hwlog.RunLog.Debugf("hbm occur, device %d update occur time: %d", + faultInfo.LogicID, 
h.HbmOccurTimeCache[faultInfo.LogicID]) } func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) { @@ -313,6 +316,8 @@ func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) { } h.AicFaultEventQue[faultInfo.LogicID] = append(h.AicFaultEventQue[faultInfo.LogicID], faultInfo) sort.Sort(DevFaultInfoBasedTimeAscend(h.AicFaultEventQue[faultInfo.LogicID])) + hwlog.RunLog.Debugf("aic fault event in que, device %d new event que:%#v", + faultInfo.LogicID, h.AicFaultEventQue[faultInfo.LogicID]) } func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultInfo { @@ -326,14 +331,19 @@ func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultIn } nowTime := time.Now().Unix() for i := 0; i < len(faultEventQue); i++ { - // aic error should report if hbm error does not occur within ten seconds - if nowTime-faultEventQue[i].AlarmRaisedTime > 10 { - faultInfoList = append(faultInfoList, faultEventQue[i]) - } // The fault aic error occurring ten seconds before and after the occurrence of hbm error should be deleted, + if Int64Tool.Int64Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 { + hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d hbm event time %d", + logicId, faultEventQue[i].AlarmRaisedTime, h.HbmOccurTimeCache[logicId]) + faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) 
+ i-- + } + // aic error should report if hbm error does not occur within ten seconds, // and the event in this outbound queue should also be deleted - if Int64Tool.Int64Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 || - nowTime-faultEventQue[i].AlarmRaisedTime > 10 { + if nowTime-faultEventQue[i].AlarmRaisedTime > 10 { + hwlog.RunLog.Infof("device % delete event in fault event que, aic event time %d now time %d", + logicId, faultEventQue[i].AlarmRaisedTime, nowTime) + faultInfoList = append(faultInfoList, faultEventQue[i]) faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) i-- } -- Gitee From b21b77dc3086a4dfa5fdca44131ef2613562a174 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Sun, 8 Sep 2024 18:13:24 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/fault_code.go | 1 + pkg/common/fault_code_test.go | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index e834ed93..868c2e86 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -977,6 +977,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF if faultInfos[i].EventID == AicAiVFaultCode { hbmTool.aicFaultEventInQue(faultInfos[i]) faultInfos = append(faultInfos[:i], faultInfos[i+1:]...) + i-- } } faultInfos = append(faultInfos, hbmTool.aicFaultEventOutQue(logicID)...) 
diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index 28e27b39..0a745e97 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -1573,3 +1573,36 @@ func TestLoadSwitchFaultCode(t *testing.T) { convey.So(NotHandleFaultCodes[firstFaultIdx] == generalFaultCode, convey.ShouldBeTrue) }) } + +// TestHbmFaultManager Test HbmFaultManager +func TestHbmFaultManager(t *testing.T) { + convey.Convey("test new fault manager", t, func() { + hbmFaultManager := NewHbmFaultManager() + convey.So(hbmFaultManager, convey.ShouldNotBeNil) + }) + convey.Convey("test update hbm occur time ", t, func() { + hbmFaultManager := NewHbmFaultManager() + faultInfo := common.DevFaultInfo{ + LogicID: 1, + EventID: HbmDoubleBitFaultCode, + AlarmRaisedTime: 123456789, + } + hbmFaultManager.updateHbmOccurTime(faultInfo) + hbmOccurTime, ok := hbmFaultManager.HbmOccurTimeCache[1] + convey.So(ok, convey.ShouldBeTrue) + convey.So(hbmOccurTime, convey.ShouldEqual, 123456789) + }) + convey.Convey("test fault event in que ", t, func() { + hbmFaultManager := NewHbmFaultManager() + faultInfo := common.DevFaultInfo{ + LogicID: 1, + EventID: AicAiVFaultCode, + AlarmRaisedTime: 123456789, + } + hbmFaultManager.updateHbmOccurTime(faultInfo) + faultInfoList, ok := hbmFaultManager.AicFaultEventQue[1] + convey.So(ok, convey.ShouldBeTrue) + convey.So(len(faultInfoList), convey.ShouldEqual, 1) + convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 123456789) + }) +} -- Gitee From cb68276868e42066f41757859f16a122f71d36f8 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Sun, 8 Sep 2024 18:46:17 +0800 Subject: [PATCH 4/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
pkg/common/fault_code.go | 2 +- pkg/common/fault_code_test.go | 15 ++++++++++++++- pkg/common/slice_common.go | 4 ++-- pkg/common/slice_common_test.go | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 868c2e86..ec31617f 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -332,7 +332,7 @@ func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultIn nowTime := time.Now().Unix() for i := 0; i < len(faultEventQue); i++ { // The fault aic error occurring ten seconds before and after the occurrence of hbm error should be deleted, - if Int64Tool.Int64Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 { + if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 { hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d hbm event time %d", logicId, faultEventQue[i].AlarmRaisedTime, h.HbmOccurTimeCache[logicId]) faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) 
diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index 0a745e97..343046ad 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -1599,10 +1599,23 @@ func TestHbmFaultManager(t *testing.T) { EventID: AicAiVFaultCode, AlarmRaisedTime: 123456789, } - hbmFaultManager.updateHbmOccurTime(faultInfo) + hbmFaultManager.aicFaultEventInQue(faultInfo) faultInfoList, ok := hbmFaultManager.AicFaultEventQue[1] convey.So(ok, convey.ShouldBeTrue) convey.So(len(faultInfoList), convey.ShouldEqual, 1) convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 123456789) }) + convey.Convey("test fault event in que ", t, func() { + hbmFaultManager := NewHbmFaultManager() + faultInfo := common.DevFaultInfo{ + LogicID: 1, + EventID: AicAiVFaultCode, + AlarmRaisedTime: 100000000, + } + hbmFaultManager.aicFaultEventInQue(faultInfo) + faultInfoList := hbmFaultManager.aicFaultEventOutQue(1) + convey.So(len(faultInfoList), convey.ShouldEqual, 1) + convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 100000000) + convey.So(faultInfoList[0].EventID, convey.ShouldEqual, AicAiVFaultCode) + }) } diff --git a/pkg/common/slice_common.go b/pkg/common/slice_common.go index aee75d80..48bd55c0 100644 --- a/pkg/common/slice_common.go +++ b/pkg/common/slice_common.go @@ -96,8 +96,8 @@ func (i int64Tool) ToHexString(sources []int64) string { return target } -// Int64Abs return a absolute value between two int 64 value -func (i int64Tool) Int64Abs(var1, var2 int64) int64 { +// Abs returns the absolute difference between two int64 values +func (i int64Tool) Abs(var1, var2 int64) int64 { if var1 > var2 { return var1 - var2 } diff --git a/pkg/common/slice_common_test.go b/pkg/common/slice_common_test.go index b227c9e9..ad1c9203 100644 --- a/pkg/common/slice_common_test.go +++ b/pkg/common/slice_common_test.go @@ -127,3 +127,17 @@ func TestInt64ToolIndex(t *testing.T) { convey.So(tool.Index(testVal, unExistVal), convey.ShouldEqual, -1) }) } + +// 
TestInt64ToolAbs for test int64Tool.Abs +func TestInt64ToolAbs(t *testing.T) { + convey.Convey("test int64Tool.Abs case positive number", t, func() { + tool := int64Tool{} + testVal1, testVal2 := int64(1), int64(2) + convey.So(tool.Abs(testVal1, testVal2), convey.ShouldEqual, 1) + }) + convey.Convey("test int64Tool.Abs case negative number", t, func() { + tool := int64Tool{} + testVal1, testVal2 := int64(1), int64(-1) + convey.So(tool.Abs(testVal1, testVal2), convey.ShouldEqual, 2) + }) +} -- Gitee From 2e8a7eb6873f59807ca3bc0d17a23e6591280a37 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Tue, 10 Sep 2024 09:07:56 +0800 Subject: [PATCH 5/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 3 ++- pkg/common/fault_code.go | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index bf50b4e9..6799b098 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -773,5 +773,6 @@ const ( const ( HbmDoubleBitFaultCode = 2146441217 - AicAiVFaultCode = 2146402313 + AivBusFaultCode = 2146402313 + AicBusFaultCode = 2147057673 ) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index ec31617f..84bde5c9 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -316,7 +316,7 @@ func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) { } h.AicFaultEventQue[faultInfo.LogicID] = append(h.AicFaultEventQue[faultInfo.LogicID], faultInfo) sort.Sort(DevFaultInfoBasedTimeAscend(h.AicFaultEventQue[faultInfo.LogicID])) - hwlog.RunLog.Debugf("aic fault event in que, device %d new event que:%#v", + hwlog.RunLog.Debugf("aic/aiv fault event in que, device 
%d new event que:%#v", faultInfo.LogicID, h.AicFaultEventQue[faultInfo.LogicID]) } @@ -974,7 +974,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover { hbmTool.updateHbmOccurTime(faultInfos[i]) } - if faultInfos[i].EventID == AicAiVFaultCode { + if faultInfos[i].EventID == AicBusFaultCode || faultInfos[i].EventID == AivBusFaultCode { hbmTool.aicFaultEventInQue(faultInfos[i]) faultInfos = append(faultInfos[:i], faultInfos[i+1:]...) i-- -- Gitee From f26aa94a3ae2f9b391326ed3877995aeb1351304 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Tue, 10 Sep 2024 14:29:48 +0800 Subject: [PATCH 6/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 9 +++++++-- pkg/common/fault_code.go | 13 ++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 6799b098..7c4f2fde 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -772,7 +772,12 @@ const ( ) const ( + // HbmDoubleBitFaultCode indicate 0x80E01801 HbmDoubleBitFaultCode = 2146441217 - AivBusFaultCode = 2146402313 - AicBusFaultCode = 2147057673 + // AivBusFaultCode indicate 0x80CB8009 + AivBusFaultCode = 2146402313 + // AicBusFaultCode indicate 0x80C98009 + AicBusFaultCode = 2147057673 + // AssociatedFaultDiagnosisTime associated fault diagnosis + AssociatedFaultDiagnosisTime = 5 ) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 84bde5c9..1a8155d2 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -305,7 +305,7 @@ func (h *HbmFaultManager) 
updateHbmOccurTime(faultInfo common.DevFaultInfo) { h.HbmOccurTimeCache[faultInfo.LogicID] = 0 } h.HbmOccurTimeCache[faultInfo.LogicID] = faultInfo.AlarmRaisedTime - hwlog.RunLog.Debugf("hbm occur, device %d update occur time: %d", + hwlog.RunLog.Debugf("hbm fault occur, device %d update occur time: %d", faultInfo.LogicID, h.HbmOccurTimeCache[faultInfo.LogicID]) } @@ -316,8 +316,8 @@ func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) { } h.AicFaultEventQue[faultInfo.LogicID] = append(h.AicFaultEventQue[faultInfo.LogicID], faultInfo) sort.Sort(DevFaultInfoBasedTimeAscend(h.AicFaultEventQue[faultInfo.LogicID])) - hwlog.RunLog.Debugf("aic/aiv fault event in que, device %d new event que:%#v", - faultInfo.LogicID, h.AicFaultEventQue[faultInfo.LogicID]) + hwlog.RunLog.Debugf("aic/aiv fault event %d in que, device %d new event que:%#v", + faultInfo.EventID, faultInfo.LogicID, h.AicFaultEventQue[faultInfo.LogicID]) } func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultInfo { @@ -332,7 +332,7 @@ func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultIn nowTime := time.Now().Unix() for i := 0; i < len(faultEventQue); i++ { // The fault aic error occurring ten seconds before and after the occurrence of hbm error should be deleted, - if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < 10 { + if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < AssociatedFaultDiagnosisTime { hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d hbm event time %d", logicId, faultEventQue[i].AlarmRaisedTime, h.HbmOccurTimeCache[logicId]) faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) 
@@ -340,7 +340,7 @@ func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultIn } // aic error should report if hbm error does not occur within ten seconds, // and the event in this outbound queue should also be deleted - if nowTime-faultEventQue[i].AlarmRaisedTime > 10 { + if nowTime-faultEventQue[i].AlarmRaisedTime > AssociatedFaultDiagnosisTime { hwlog.RunLog.Infof("device % delete event in fault event que, aic event time %d now time %d", logicId, faultEventQue[i].AlarmRaisedTime, nowTime) faultInfoList = append(faultInfoList, faultEventQue[i]) @@ -970,6 +970,8 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF hwlog.RunLog.Error("param device is nil in SetNewFaultAndCacheOnceRecoverFault") return } + + // dealing with HBM and AIC associated faults for i := 0; i < len(faultInfos); i++ { if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover { hbmTool.updateHbmOccurTime(faultInfos[i]) @@ -981,6 +983,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF } } faultInfos = append(faultInfos, hbmTool.aicFaultEventOutQue(logicID)...) + // it must deal with two 'for', because the fault may recover one moment, in this case, // the recover message and occur message both in faultInfos, this fault cannot be reports outside. 
for _, faultInfo := range faultInfos { -- Gitee From 5bd3fd3f537186a73662b9b72abd1779ab8ec7c6 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Tue, 10 Sep 2024 14:37:31 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/fault_code.go | 2 +- pkg/common/fault_code_test.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 1a8155d2..2043814b 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -971,7 +971,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF return } - // dealing with HBM and AIC associated faults + // dealing with Hbm and Aic/Aiv associated faults for i := 0; i < len(faultInfos); i++ { if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover { hbmTool.updateHbmOccurTime(faultInfos[i]) diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index 343046ad..28507d39 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -1596,7 +1596,7 @@ func TestHbmFaultManager(t *testing.T) { hbmFaultManager := NewHbmFaultManager() faultInfo := common.DevFaultInfo{ LogicID: 1, - EventID: AicAiVFaultCode, + EventID: AivBusFaultCode, AlarmRaisedTime: 123456789, } hbmFaultManager.aicFaultEventInQue(faultInfo) @@ -1609,13 +1609,13 @@ func TestHbmFaultManager(t *testing.T) { hbmFaultManager := NewHbmFaultManager() faultInfo := common.DevFaultInfo{ LogicID: 1, - EventID: AicAiVFaultCode, + EventID: AicBusFaultCode, AlarmRaisedTime: 100000000, } hbmFaultManager.aicFaultEventInQue(faultInfo) faultInfoList := 
hbmFaultManager.aicFaultEventOutQue(1) convey.So(len(faultInfoList), convey.ShouldEqual, 1) convey.So(faultInfoList[0].AlarmRaisedTime, convey.ShouldEqual, 100000000) - convey.So(faultInfoList[0].EventID, convey.ShouldEqual, AicAiVFaultCode) + convey.So(faultInfoList[0].EventID, convey.ShouldEqual, AicBusFaultCode) }) } -- Gitee From 3d7a8d91ca42e8db0691e77560a6e8075cf71f0c Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Tue, 10 Sep 2024 20:40:36 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 8 +++++--- pkg/common/fault_code.go | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 7c4f2fde..9891ff23 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -773,11 +773,13 @@ const ( const ( // HbmDoubleBitFaultCode indicate 0x80E01801 - HbmDoubleBitFaultCode = 2146441217 + HbmDoubleBitFaultCode = 2162169857 // AivBusFaultCode indicate 0x80CB8009 - AivBusFaultCode = 2146402313 + AivBusFaultCode = 2160820233 // AicBusFaultCode indicate 0x80C98009 - AicBusFaultCode = 2147057673 + AicBusFaultCode = 2160689161 // AssociatedFaultDiagnosisTime associated fault diagnosis AssociatedFaultDiagnosisTime = 5 + // TimeMilliseconds indicate how many milliseconds are there in a second + TimeMilliseconds = 1000 ) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 2043814b..473213d7 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -329,25 +329,27 @@ func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultIn if _, ok := h.HbmOccurTimeCache[logicId]; !ok { 
h.HbmOccurTimeCache[logicId] = 0 } + var newFaultEventQue []common.DevFaultInfo nowTime := time.Now().Unix() for i := 0; i < len(faultEventQue); i++ { // The fault aic error occurring ten seconds before and after the occurrence of hbm error should be deleted, - if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < AssociatedFaultDiagnosisTime { + if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], faultEventQue[i].AlarmRaisedTime) < + AssociatedFaultDiagnosisTime*TimeMilliseconds { hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d hbm event time %d", logicId, faultEventQue[i].AlarmRaisedTime, h.HbmOccurTimeCache[logicId]) - faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) - i-- + continue } // aic error should report if hbm error does not occur within ten seconds, // and the event in this outbound queue should also be deleted - if nowTime-faultEventQue[i].AlarmRaisedTime > AssociatedFaultDiagnosisTime { + if nowTime-faultEventQue[i].AlarmRaisedTime > AssociatedFaultDiagnosisTime*TimeMilliseconds { hwlog.RunLog.Infof("device % delete event in fault event que, aic event time %d now time %d", logicId, faultEventQue[i].AlarmRaisedTime, nowTime) faultInfoList = append(faultInfoList, faultEventQue[i]) - faultEventQue = append(faultEventQue[:i], faultEventQue[i+1:]...) 
- i-- + continue } + newFaultEventQue = append(newFaultEventQue, faultEventQue[i]) } + h.AicFaultEventQue[logicId] = newFaultEventQue return faultInfoList } @@ -971,6 +973,7 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF return } + var newFaultInfos []common.DevFaultInfo // dealing with Hbm and Aic/Aiv associated faults for i := 0; i < len(faultInfos); i++ { if faultInfos[i].EventID == HbmDoubleBitFaultCode && faultInfos[i].Assertion != common.FaultRecover { @@ -978,11 +981,11 @@ func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevF } if faultInfos[i].EventID == AicBusFaultCode || faultInfos[i].EventID == AivBusFaultCode { hbmTool.aicFaultEventInQue(faultInfos[i]) - faultInfos = append(faultInfos[:i], faultInfos[i+1:]...) - i-- + continue } + newFaultInfos = append(newFaultInfos, faultInfos[i]) } - faultInfos = append(faultInfos, hbmTool.aicFaultEventOutQue(logicID)...) + faultInfos = append(newFaultInfos, hbmTool.aicFaultEventOutQue(logicID)...) // it must deal with two 'for', because the fault may recover one moment, in this case, // the recover message and occur message both in faultInfos, this fault cannot be reports outside. 
-- Gitee From c7ff9f40f689bbca19d226c9f3a6c01092cfa305 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Tue, 10 Sep 2024 20:48:30 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E5=A2=9E=E5=8A=A0AIC?= =?UTF-8?q?=E5=92=8CHBM=E6=95=85=E9=9A=9C=E4=BC=B4=E7=94=9F=E4=B8=8A?= =?UTF-8?q?=E6=8A=A5=E9=80=BB=E8=BE=91=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/fault_code.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 473213d7..d0eee21f 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -301,9 +301,6 @@ func NewHbmFaultManager() *HbmFaultManager { } func (h *HbmFaultManager) updateHbmOccurTime(faultInfo common.DevFaultInfo) { - if _, ok := h.HbmOccurTimeCache[faultInfo.LogicID]; !ok { - h.HbmOccurTimeCache[faultInfo.LogicID] = 0 - } h.HbmOccurTimeCache[faultInfo.LogicID] = faultInfo.AlarmRaisedTime hwlog.RunLog.Debugf("hbm fault occur, device %d update occur time: %d", faultInfo.LogicID, h.HbmOccurTimeCache[faultInfo.LogicID]) -- Gitee