From 416de2b171fd4543d3c20fa3187fa6619b97c176 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Tue, 3 Sep 2024 20:39:15 +0800 Subject: [PATCH 1/6] fix --- pkg/common/constants.go | 7 +++++++ pkg/device/ascendcommon.go | 24 ++++++++++++++++++++++-- pkg/kubeclient/kubeclient.go | 26 ++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 42802cd9..d1b6532a 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -755,3 +755,10 @@ const ( // Subscribe represents subscribe mode Subscribe = "subscribe" ) + +const ( + // UpdateAnnotationRetryTimes update annotation retry times + UpdateAnnotationRetryTimes = 3 + // SubHealthyAnnotationKey sub-healthy annotation key on node + SubHealthyAnnotationKey = "subHealthy" +) diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 452ef350..d47aa717 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -46,6 +46,8 @@ var ( faultMode = make(map[int32]string, common.GeneralMapSize) lastCheckNodeLabel int64 useIpv4 = true + preSubHealthy = false + firstUpdate = true ) const ( @@ -229,6 +231,24 @@ func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, } tool.delVirDevInfo(newDeviceList) + curSubHealthyStatus := false + for _, df := range devStatusSet.DeviceFault { + if df.FaultLevel == common.PreSeparateNPU { + curSubHealthyStatus = true + break + } + } + if firstUpdate || preSubHealthy != curSubHealthyStatus { + preSubHealthy = curSubHealthyStatus + err := tool.client.UpdateNodeAnnotation(common.SubHealthyAnnotationKey, + strconv.FormatBool(curSubHealthyStatus), common.UpdateAnnotationRetryTimes) + if err == nil { + firstUpdate = true + } else { + hwlog.RunLog.Warnf("update node annotation failed, err: %v", err) + } + } + manuallySeparateNPU := tool.handleManuallySeparateNPUFaultInfo() // if subscribe failed, will use get interface if common.SwitchSubscribeFailed && common.ParamOption.EnableSwitchFault { @@ -715,8 +735,8 @@ func (tool *AscendTools) isHealthy(device *common.NpuDevice) string { tool.npuIsUsedNow(device.DeviceName) && common.ParamOption.GraceToleranceOn == true) { return v1beta1.Healthy } - if faultType == common.PreSeparateNPU && tool.npuIsUsedNow(device.DeviceName) { - hwlog.RunLog.Infof("detect %s but device is in use, device name: %s", faultType, device.DeviceName) + if faultType == common.PreSeparateNPU { + hwlog.RunLog.Infof("detect %s, device name: %s", faultType, device.DeviceName) return v1beta1.Healthy } return v1beta1.Unhealthy diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index 53ee9d64..d0eb7591 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -21,6 +21,7 @@ import ( "os" "reflect" "strings" + "time" "huawei.com/npu-exporter/v6/common-utils/hwlog" "k8s.io/api/core/v1" @@ -335,3 +336,28 @@ func (ki *ClientK8s) ResourceEventHandler(res ResourceType, filter func(obj inte func (ki *ClientK8s) FlushPodCacheNextQuerying() { ki.IsApiErr = true } + +func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) error { + node, err := ki.GetNode() + retry := 0 + for err != nil && retry < retryTimes { + node, err = ki.GetNode() + retry++ + time.Sleep(time.Duration(retry) * time.Second) + } + if err != nil { + return err + } + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } + node.Labels[key] = value + _, err = ki.Clientset.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + retry = 0 + for err != nil && retry < retryTimes { + _, err = ki.Clientset.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + retry++ + time.Sleep(time.Duration(retry) * time.Second) + } + return err +} -- Gitee From 3711941be9d5bfd8425f71d0fee5882614a203ab Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 4 Sep 2024 15:40:36 +0800 Subject: [PATCH 2/6] fix --- pkg/common/constants.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 33ad56e3..3e8e0570 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -763,10 +763,10 @@ const ( ) const ( + NPUNormalStatus = "normal" + NPUResettingStatus = "resetting" // UpdateAnnotationRetryTimes update annotation retry times UpdateAnnotationRetryTimes = 3 // SubHealthyAnnotationKey sub-healthy annotation key on node SubHealthyAnnotationKey = "subHealthy" - NPUNormalStatus = "normal" - NPUResettingStatus = "resetting" ) -- Gitee From cd7bdee608a2dd04047969346831da42670569ae Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 4 Sep 2024 18:11:44 +0800 Subject: [PATCH 3/6] fix --- pkg/device/ascendcommon.go | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 76223bfc..2a6f641a 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -231,23 +231,7 @@ func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, } tool.delVirDevInfo(newDeviceList) - curSubHealthyStatus := false - for _, df := range devStatusSet.DeviceFault { - if df.FaultLevel == common.PreSeparateNPU { - curSubHealthyStatus = true - break - } - } - if firstUpdate || preSubHealthy != curSubHealthyStatus { - preSubHealthy = curSubHealthyStatus - err := tool.client.UpdateNodeAnnotation(common.SubHealthyAnnotationKey, - strconv.FormatBool(curSubHealthyStatus), common.UpdateAnnotationRetryTimes) - if err == nil { - firstUpdate = true - } else { - hwlog.RunLog.Warnf("update node annotation failed, err: %v", err) - } - } + tool.annotateWithSubHealthy(devStatusSet) manuallySeparateNPU := tool.handleManuallySeparateNPUFaultInfo() // if subscribe failed, will use get interface @@ -272,6 +256,26 @@ func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, return waitErr } +func (tool *AscendTools) annotateWithSubHealthy(devStatusSet common.DevStatusSet) { + curSubHealthyStatus := false + for _, df := range devStatusSet.DeviceFault { + if df.FaultLevel == common.PreSeparateNPU { + curSubHealthyStatus = true + break + } + } + if firstUpdate || preSubHealthy != curSubHealthyStatus { + preSubHealthy = curSubHealthyStatus + err := tool.client.UpdateNodeAnnotation(common.SubHealthyAnnotationKey, + strconv.FormatBool(curSubHealthyStatus), common.UpdateAnnotationRetryTimes) + if err == nil { + firstUpdate = true + } else { + hwlog.RunLog.Warnf("update node annotation failed, err: %v", err) + } + } +} + func (tool *AscendTools) delVirDevInfo(newDeviceList map[string]string) { for annotationTag := range common.GetAllDeviceInfoTypeList() { if _, ok := newDeviceList[annotationTag]; !ok { -- Gitee From afb3c77e577dc4cb0b9c430a3154caacd7a7cb76 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 4 Sep 2024 18:23:55 +0800 Subject: [PATCH 4/6] fix --- pkg/kubeclient/kubeclient.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index d0eb7591..e1bd8ba1 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -17,6 +17,7 @@ package kubeclient import ( "context" + "errors" "fmt" "os" "reflect" @@ -338,6 +339,9 @@ func (ki *ClientK8s) FlushPodCacheNextQuerying() { } func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) error { + if ki == nil { + return errors.New("ClientK8s is nil") + } node, err := ki.GetNode() retry := 0 for err != nil && retry < retryTimes { -- Gitee From afea204f36928d4b55eb42489fe48d8ca13842c8 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 4 Sep 2024 18:46:38 +0800 Subject: [PATCH 5/6] fix dt --- pkg/device/ascend310_test.go | 6 +++++- pkg/device/ascend310p_test.go | 6 +++++- pkg/device/ascend910_test.go | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pkg/device/ascend310_test.go b/pkg/device/ascend310_test.go index cf372604..8eaeb2e7 100644 --- a/pkg/device/ascend310_test.go +++ b/pkg/device/ascend310_test.go @@ -79,7 +79,10 @@ func TestDoWithVolcanoListAndWatch310(t *testing.T) { allInfo, err := manager.GetNPUs() convey.So(err, convey.ShouldBeNil) groupDevice := ClassifyDevices(allInfo.AllDevs, allInfo.AllDevTypes) - + mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), + "annotateWithSubHealthy", func(_ common.DevStatusSet) { + return + }) mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { return nil @@ -106,6 +109,7 @@ func TestDoWithVolcanoListAndWatch310(t *testing.T) { mockGetPodsUsedNpu.Reset() mockGetConfigMap.Reset() mockCreateConfigMap.Reset() + mockAnnotation.Reset() }() manager.client.SetNodeDeviceInfoCache(createFakeDeviceInfo()) manager.DoWithVolcanoListAndWatch(groupDevice) diff --git a/pkg/device/ascend310p_test.go b/pkg/device/ascend310p_test.go index bab53b2e..683658db 100644 --- a/pkg/device/ascend310p_test.go +++ b/pkg/device/ascend310p_test.go @@ -55,7 +55,10 @@ func TestDoWithVolcanoListAndWatch310p(t *testing.T) { allInfo, err := manager.GetNPUs() convey.So(err, convey.ShouldBeNil) groupDevice := ClassifyDevices(allInfo.AllDevs, allInfo.AllDevTypes) - + mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), + "annotateWithSubHealthy", func(_ common.DevStatusSet) { + return + }) mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { return nil @@ -81,6 +84,7 @@ func TestDoWithVolcanoListAndWatch310p(t *testing.T) { mockGetPodsUsedNpu.Reset() mockGetConfigMap.Reset() mockCreateConfigMap.Reset() + mockAnnotation.Reset() }() manager.client.SetNodeDeviceInfoCache(createFakeDeviceInfo()) manager.DoWithVolcanoListAndWatch(groupDevice) diff --git a/pkg/device/ascend910_test.go b/pkg/device/ascend910_test.go index 9241d89a..7799a6f6 100644 --- a/pkg/device/ascend910_test.go +++ b/pkg/device/ascend910_test.go @@ -89,7 +89,10 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { allInfo, err := manager.GetNPUs() convey.So(err, convey.ShouldBeNil) groupDevice := ClassifyDevices(allInfo.AllDevs, allInfo.AllDevTypes) - + mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), + "annotateWithSubHealthy", func(_ common.DevStatusSet) { + return + }) mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { return nil @@ -125,6 +128,7 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { mockPatchNodeState.Reset() mockCreateConfigMap.Reset() mockNodeBack.Reset() + mockAnnotation.Reset() }() manager.client.SetNodeDeviceInfoCache(createFakeDeviceInfo()) manager.DoWithVolcanoListAndWatch(groupDevice) -- Gitee From a16bc1a9c82da81245bb29e303d706ab557e9978 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 4 Sep 2024 21:07:20 +0800 Subject: [PATCH 6/6] fix --- pkg/device/ascendcommon.go | 2 +- pkg/kubeclient/kubeclient.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 2a6f641a..86d4eb14 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -269,7 +269,7 @@ func (tool *AscendTools) annotateWithSubHealthy(devStatusSet common.DevStatusSet err := tool.client.UpdateNodeAnnotation(common.SubHealthyAnnotationKey, strconv.FormatBool(curSubHealthyStatus), common.UpdateAnnotationRetryTimes) if err == nil { - firstUpdate = true + firstUpdate = false } else { hwlog.RunLog.Warnf("update node annotation failed, err: %v", err) } diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index e1bd8ba1..e367d77b 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -355,7 +355,7 @@ func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) err if node.Annotations == nil { node.Annotations = make(map[string]string) } - node.Labels[key] = value + node.Annotations[key] = value _, err = ki.Clientset.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) retry = 0 for err != nil && retry < retryTimes { -- Gitee