From 97a49f53857fbc9b13bae3b9375d80e061e25b89 Mon Sep 17 00:00:00 2001 From: dongpeng Date: Fri, 6 Sep 2024 16:59:54 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91cleanCode=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 4 +- pkg/device/ascend910_test.go | 96 +++++++++++++++++++++------------ pkg/device/ascendcommon.go | 3 +- pkg/device/ascendcommon_test.go | 42 ++++++++++----- pkg/kubeclient/kubeclient.go | 3 +- pkg/server/manager.go | 19 +++---- pkg/server/plugin.go | 3 +- 7 files changed, 112 insertions(+), 58 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 3e8e0570..b58782d2 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -763,7 +763,9 @@ const ( ) const ( - NPUNormalStatus = "normal" + // NPUNormalStatus represents normal status + NPUNormalStatus = "normal" + // NPUResettingStatus represents resetting status NPUResettingStatus = "resetting" // UpdateAnnotationRetryTimes update annotation retry times UpdateAnnotationRetryTimes = 3 diff --git a/pkg/device/ascend910_test.go b/pkg/device/ascend910_test.go index 7799a6f6..4a23ed85 100644 --- a/pkg/device/ascend910_test.go +++ b/pkg/device/ascend910_test.go @@ -89,39 +89,12 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { allInfo, err := manager.GetNPUs() convey.So(err, convey.ShouldBeNil) groupDevice := ClassifyDevices(allInfo.AllDevs, allInfo.AllDevTypes) - mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), - "annotateWithSubHealthy", func(_ common.DevStatusSet) { - return - }) - mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { - return nil - }) - mockGetConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "GetDeviceInfoCMCache", func(_ *kubeclient.ClientK8s) *common.NodeDeviceInfoCache { - nodeDeviceData := common.NodeDeviceInfoCache{DeviceInfo: common.NodeDeviceInfo{ - DeviceList: map[string]string{common.Ascend910: ascend910LogicID1}, - UpdateTime: time.Now().Unix()}} - nodeDeviceData.CheckCode = common.MakeDataHash(nodeDeviceData.DeviceInfo) - return &nodeDeviceData - }) - mockPatchNodeState := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "PatchNodeState", func(_ *kubeclient.ClientK8s, curNode, - newNode *v1.Node) (*v1.Node, []byte, error) { - return &v1.Node{}, nil, nil - }) - mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, - deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, - serverIndex int32) (*common.NodeDeviceInfoCache, error) { - return &common.NodeDeviceInfoCache{}, nil - }) - mockNodeBack := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetNode", - func(_ *kubeclient.ClientK8s) (*v1.Node, error) { - curNode := &v1.Node{} - curNode.Labels = make(map[string]string, 1) - return curNode, nil - }) + mockAnnotation := mockAnnotateWithSubHealthy() + mockGetPodsUsedNpu := mockGetPodsUsedNpu() + mockGetConfigMap := mockGetDeviceInfoCMCache() + mockPatchNodeState := mockPatchNodeState() + mockCreateConfigMap := mockWriteDeviceInfoDataIntoCM() + mockNodeBack := mockGetNode() defer func() { mockGetPodsUsedNpu.Reset() mockGetConfigMap.Reset() @@ -135,6 +108,63 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { }) } +func mockGetNode() *gomonkey.Patches { + mockNodeBack := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetNode", + func(_ *kubeclient.ClientK8s) (*v1.Node, error) { + curNode := &v1.Node{} + curNode.Labels = make(map[string]string, 1) + return curNode, nil + }) + return mockNodeBack +} + +func mockWriteDeviceInfoDataIntoCM() *gomonkey.Patches { + mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, + deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, + serverIndex int32) (*common.NodeDeviceInfoCache, error) { + return &common.NodeDeviceInfoCache{}, nil + }) + return mockCreateConfigMap +} + +func mockPatchNodeState() *gomonkey.Patches { + mockPatchNodeState := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "PatchNodeState", func(_ *kubeclient.ClientK8s, curNode, + newNode *v1.Node) (*v1.Node, []byte, error) { + return &v1.Node{}, nil, nil + }) + return mockPatchNodeState +} + +func mockGetDeviceInfoCMCache() *gomonkey.Patches { + mockGetConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "GetDeviceInfoCMCache", func(_ *kubeclient.ClientK8s) *common.NodeDeviceInfoCache { + nodeDeviceData := common.NodeDeviceInfoCache{DeviceInfo: common.NodeDeviceInfo{ + DeviceList: map[string]string{common.Ascend910: ascend910LogicID1}, + UpdateTime: time.Now().Unix()}} + nodeDeviceData.CheckCode = common.MakeDataHash(nodeDeviceData.DeviceInfo) + return &nodeDeviceData + }) + return mockGetConfigMap +} + +func mockGetPodsUsedNpu() *gomonkey.Patches { + mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { + return nil + }) + return mockGetPodsUsedNpu +} + +func mockAnnotateWithSubHealthy() *gomonkey.Patches { + mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), + "annotateWithSubHealthy", func(_ common.DevStatusSet) { + return + }) + return mockAnnotation +} + func TestToStandardDeviceFmt(t *testing.T) { convey.Convey("910 test toStandardDeviceFmt", t, func() { hnm := NewHwAscend910Manager() diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 86d4eb14..dd6d8a49 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -633,7 +633,8 @@ func (tool *AscendTools) AddPodAnnotation(podDev *common.PodDeviceInfo, deviceTy DeviceType: deviceType, SuperPodID: tool.GetSuperPodID(), } - configuration := common.GetPodConfiguration(phyDevMapVirtualDev, ascendVisibleDevices, podDev.Pod.Name, info, allDevices) + configuration := common.GetPodConfiguration(phyDevMapVirtualDev, ascendVisibleDevices, + podDev.Pod.Name, info, allDevices) if !common.ParamOption.PresetVDevice { tool.AppendVGroupInfo(podDev.RealDevice) } diff --git a/pkg/device/ascendcommon_test.go b/pkg/device/ascendcommon_test.go index 241e21c6..19caef24 100644 --- a/pkg/device/ascendcommon_test.go +++ b/pkg/device/ascendcommon_test.go @@ -135,8 +135,10 @@ func TestAddPodAnnotation1(t *testing.T) { convey.Convey("test AddPodAnnotation 1", t, func() { convey.Convey("GetDeviceListID failed", func() { err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend910}}, common.Ascend910c2, - "", nil) + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910}, + }, common.Ascend910c2, "", nil) convey.So(err, convey.ShouldNotBeNil) }) mockTryUpdatePodAnnotation := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), @@ -148,20 +150,27 @@ func TestAddPodAnnotation1(t *testing.T) { convey.Convey("physical device 310P", func() { tool.name = common.Ascend310P err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend310P + "-0"}, + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend310P + "-0"}, }, common.Ascend310P, "", nil) convey.So(err, convey.ShouldBeNil) }) convey.Convey("virtual device", func() { err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend310Pc2 + "-100-0"}, + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend310Pc2 + "-100-0"}, }, common.Ascend310Pc2, "", nil) convey.So(err, convey.ShouldBeNil) }) convey.Convey("ParseInt failed", func() { tool.name = common.Ascend910 - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-a"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-a"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) convey.Convey("GetLogicIDFromPhysicID failed", func() { @@ -170,8 +179,11 @@ func TestAddPodAnnotation1(t *testing.T) { return 0, fmt.Errorf("error") }) defer mockGetLogicIDFromPhysicID.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) }) @@ -200,8 +212,11 @@ func TestAddPodAnnotation2(t *testing.T) { return "", fmt.Errorf("error") }) defer mockGetDeviceIPAddress.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) convey.Convey("GetDeviceIPAddress ok", func() { @@ -211,8 +226,11 @@ func TestAddPodAnnotation2(t *testing.T) { return "", nil }) defer mockGetDeviceIPAddress.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldBeNil) }) }) diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index e367d77b..53e0773b 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -338,9 +338,10 @@ func (ki *ClientK8s) FlushPodCacheNextQuerying() { ki.IsApiErr = true } +// UpdateNodeAnnotation update node annotation func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) error { if ki == nil { - return errors.New("ClientK8s is nil") + return errors.New("clientK8s is nil") } node, err := ki.GetNode() retry := 0 diff --git a/pkg/server/manager.go b/pkg/server/manager.go index 34f6a4aa..c5f32ad3 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -252,15 +252,16 @@ func (hdm *HwDevManager) getSuperPodInfo() common.SuperPodInfo { } hwlog.RunLog.Infof("get super pod info: %v", superPodInfo) npuDevice.SuperDeviceID = superPodInfo.SdId - if result.ScaleType == common.ScaleTypeAbnormal { - result = common.SuperPodInfo{ - ScaleType: int32(superPodInfo.ScaleType), - SuperPodId: int32(superPodInfo.SuperPodId), - ServerId: int32(superPodInfo.ServerId), - } - for i := 0; i < len(superPodInfo.Reserve); i++ { - result.Reserve = append(result.Reserve, int32(superPodInfo.Reserve[i])) - } + if result.ScaleType != common.ScaleTypeAbnormal { + continue + } + result = common.SuperPodInfo{ + ScaleType: int32(superPodInfo.ScaleType), + SuperPodId: int32(superPodInfo.SuperPodId), + ServerId: int32(superPodInfo.ServerId), + } + for i := 0; i < len(superPodInfo.Reserve); i++ { + result.Reserve = append(result.Reserve, int32(superPodInfo.Reserve[i])) } } } diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index b8eccd05..1ce21d64 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -485,7 +485,7 @@ func (ps *PluginServer) GetKltAndRealAllocateDev(podList []v1.Pod) ([]*common.Po if err != nil { return nil, fmt.Errorf("get pod resource failed, %v", err) } - var podDeviceInfo []*common.PodDeviceInfo + var podDeviceInfo = make([]*common.PodDeviceInfo, 0) for _, pod := range podList { podKey := pod.Namespace + common.UnderLine + pod.Name podResource, exist := podDevice[podKey] @@ -523,6 +523,7 @@ func (ps *PluginServer) GetKltAndRealAllocateDev(podList []v1.Pod) ([]*common.Po return podDeviceInfo, nil } +// GetRealAllocateDevices get real allocate devices from klt allocate devices func (ps *PluginServer) GetRealAllocateDevices(pod v1.Pod, kltAllocate []string) ([]string, error) { realDeviceList, err := ps.GetRealAllocateDevicesFromMap(kltAllocate) if err == nil { -- Gitee From 5989fda537cfa37fba41b96ef285e1551182cde5 Mon Sep 17 00:00:00 2001 From: dongpeng Date: Fri, 6 Sep 2024 17:48:42 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91cleanCode=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/kubeclient/kubeclient.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index 53e0773b..44a75870 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -347,6 +347,9 @@ func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) err retry := 0 for err != nil && retry < retryTimes { node, err = ki.GetNode() + if err == nil { + break + } retry++ time.Sleep(time.Duration(retry) * time.Second) } -- Gitee