diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 3e8e0570c578f3b1c7b1df9e4b7fe6df8daf3037..b58782d2e787f2bc2bf7ec97a9f95f2de753dee5 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -763,7 +763,9 @@ const ( ) const ( - NPUNormalStatus = "normal" + // NPUNormalStatus represents normal status + NPUNormalStatus = "normal" + // NPUResettingStatus represents resetting status NPUResettingStatus = "resetting" // UpdateAnnotationRetryTimes update annotation retry times UpdateAnnotationRetryTimes = 3 diff --git a/pkg/device/ascend910_test.go b/pkg/device/ascend910_test.go index 7799a6f6fc707677654aeaf11fe1ea961dec148a..4a23ed858213a0082630bfd95709e1db782302a6 100644 --- a/pkg/device/ascend910_test.go +++ b/pkg/device/ascend910_test.go @@ -89,39 +89,12 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { allInfo, err := manager.GetNPUs() convey.So(err, convey.ShouldBeNil) groupDevice := ClassifyDevices(allInfo.AllDevs, allInfo.AllDevTypes) - mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), - "annotateWithSubHealthy", func(_ common.DevStatusSet) { - return - }) - mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { - return nil - }) - mockGetConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "GetDeviceInfoCMCache", func(_ *kubeclient.ClientK8s) *common.NodeDeviceInfoCache { - nodeDeviceData := common.NodeDeviceInfoCache{DeviceInfo: common.NodeDeviceInfo{ - DeviceList: map[string]string{common.Ascend910: ascend910LogicID1}, - UpdateTime: time.Now().Unix()}} - nodeDeviceData.CheckCode = common.MakeDataHash(nodeDeviceData.DeviceInfo) - return &nodeDeviceData - }) - mockPatchNodeState := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "PatchNodeState", func(_ *kubeclient.ClientK8s, curNode, - newNode *v1.Node) (*v1.Node, []byte, error) { - return &v1.Node{}, nil, nil - }) - mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), - "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, - deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, - serverIndex int32) (*common.NodeDeviceInfoCache, error) { - return &common.NodeDeviceInfoCache{}, nil - }) - mockNodeBack := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetNode", - func(_ *kubeclient.ClientK8s) (*v1.Node, error) { - curNode := &v1.Node{} - curNode.Labels = make(map[string]string, 1) - return curNode, nil - }) + mockAnnotation := mockAnnotateWithSubHealthy() + mockGetPodsUsedNpu := mockGetPodsUsedNpu() + mockGetConfigMap := mockGetDeviceInfoCMCache() + mockPatchNodeState := mockPatchNodeState() + mockCreateConfigMap := mockWriteDeviceInfoDataIntoCM() + mockNodeBack := mockGetNode() defer func() { mockGetPodsUsedNpu.Reset() mockGetConfigMap.Reset() @@ -135,6 +108,63 @@ func TestDoWithVolcanoListAndWatch910(t *testing.T) { }) } +func mockGetNode() *gomonkey.Patches { + mockNodeBack := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), "GetNode", + func(_ *kubeclient.ClientK8s) (*v1.Node, error) { + curNode := &v1.Node{} + curNode.Labels = make(map[string]string, 1) + return curNode, nil + }) + return mockNodeBack +} + +func mockWriteDeviceInfoDataIntoCM() *gomonkey.Patches { + mockCreateConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "WriteDeviceInfoDataIntoCM", func(_ *kubeclient.ClientK8s, + deviceInfo map[string]string, manuallySeparateNPU string, _ common.SwitchFaultInfo, superPodID, + serverIndex int32) (*common.NodeDeviceInfoCache, error) { + return &common.NodeDeviceInfoCache{}, nil + }) + return mockCreateConfigMap +} + +func mockPatchNodeState() *gomonkey.Patches { + mockPatchNodeState := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "PatchNodeState", func(_ *kubeclient.ClientK8s, curNode, + newNode *v1.Node) (*v1.Node, []byte, error) { + return &v1.Node{}, nil, nil + }) + return mockPatchNodeState +} + +func mockGetDeviceInfoCMCache() *gomonkey.Patches { + mockGetConfigMap := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "GetDeviceInfoCMCache", func(_ *kubeclient.ClientK8s) *common.NodeDeviceInfoCache { + nodeDeviceData := common.NodeDeviceInfoCache{DeviceInfo: common.NodeDeviceInfo{ + DeviceList: map[string]string{common.Ascend910: ascend910LogicID1}, + UpdateTime: time.Now().Unix()}} + nodeDeviceData.CheckCode = common.MakeDataHash(nodeDeviceData.DeviceInfo) + return &nodeDeviceData + }) + return mockGetConfigMap +} + +func mockGetPodsUsedNpu() *gomonkey.Patches { + mockGetPodsUsedNpu := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), + "GetPodsUsedNpu", func(_ *kubeclient.ClientK8s) sets.String { + return nil + }) + return mockGetPodsUsedNpu +} + +func mockAnnotateWithSubHealthy() *gomonkey.Patches { + mockAnnotation := gomonkey.ApplyPrivateMethod(reflect.TypeOf(new(AscendTools)), + "annotateWithSubHealthy", func(_ common.DevStatusSet) { + return + }) + return mockAnnotation +} + func TestToStandardDeviceFmt(t *testing.T) { convey.Convey("910 test toStandardDeviceFmt", t, func() { hnm := NewHwAscend910Manager() diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 86d4eb14d426e52a297aaf0fd637934c61a47c12..dd6d8a49c9afd1b0cd6012ff77a910c48c0d2677 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -633,7 +633,8 @@ func (tool *AscendTools) AddPodAnnotation(podDev *common.PodDeviceInfo, deviceTy DeviceType: deviceType, SuperPodID: tool.GetSuperPodID(), } - configuration := common.GetPodConfiguration(phyDevMapVirtualDev, ascendVisibleDevices, podDev.Pod.Name, info, allDevices) + configuration := common.GetPodConfiguration(phyDevMapVirtualDev, ascendVisibleDevices, + podDev.Pod.Name, info, allDevices) if !common.ParamOption.PresetVDevice { tool.AppendVGroupInfo(podDev.RealDevice) } diff --git a/pkg/device/ascendcommon_test.go b/pkg/device/ascendcommon_test.go index 241e21c6e97c7dac5222ffade2e3b2d442e0df90..19caef241a91f76ec806935047e70a70add1bb46 100644 --- a/pkg/device/ascendcommon_test.go +++ b/pkg/device/ascendcommon_test.go @@ -135,8 +135,10 @@ func TestAddPodAnnotation1(t *testing.T) { convey.Convey("test AddPodAnnotation 1", t, func() { convey.Convey("GetDeviceListID failed", func() { err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend910}}, common.Ascend910c2, - "", nil) + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910}, + }, common.Ascend910c2, "", nil) convey.So(err, convey.ShouldNotBeNil) }) mockTryUpdatePodAnnotation := gomonkey.ApplyMethod(reflect.TypeOf(new(kubeclient.ClientK8s)), @@ -148,20 +150,27 @@ func TestAddPodAnnotation1(t *testing.T) { convey.Convey("physical device 310P", func() { tool.name = common.Ascend310P err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend310P + "-0"}, + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend310P + "-0"}, }, common.Ascend310P, "", nil) convey.So(err, convey.ShouldBeNil) }) convey.Convey("virtual device", func() { err := tool.AddPodAnnotation(&common.PodDeviceInfo{ - v1.Pod{}, nil, []string{common.Ascend310Pc2 + "-100-0"}, + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend310Pc2 + "-100-0"}, }, common.Ascend310Pc2, "", nil) convey.So(err, convey.ShouldBeNil) }) convey.Convey("ParseInt failed", func() { tool.name = common.Ascend910 - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-a"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-a"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) convey.Convey("GetLogicIDFromPhysicID failed", func() { @@ -170,8 +179,11 @@ func TestAddPodAnnotation1(t *testing.T) { return 0, fmt.Errorf("error") }) defer mockGetLogicIDFromPhysicID.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) }) @@ -200,8 +212,11 @@ func TestAddPodAnnotation2(t *testing.T) { return "", fmt.Errorf("error") }) defer mockGetDeviceIPAddress.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldNotBeNil) }) convey.Convey("GetDeviceIPAddress ok", func() { @@ -211,8 +226,11 @@ func TestAddPodAnnotation2(t *testing.T) { return "", nil }) defer mockGetDeviceIPAddress.Reset() - err := tool.AddPodAnnotation(&common.PodDeviceInfo{v1.Pod{}, nil, []string{common.Ascend910 + "-0"}}, - common.Ascend910, "", nil) + err := tool.AddPodAnnotation(&common.PodDeviceInfo{ + Pod: v1.Pod{}, + KltDevice: nil, + RealDevice: []string{common.Ascend910 + "-0"}, + }, common.Ascend910, "", nil) convey.So(err, convey.ShouldBeNil) }) }) diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index e367d77b6bb4efe5211b5ba44c5fa559736515d5..44a75870e9ae8e92ed32e6363267024cf1b7092a 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -338,14 +338,18 @@ func (ki *ClientK8s) FlushPodCacheNextQuerying() { ki.IsApiErr = true } +// UpdateNodeAnnotation update node annotation func (ki *ClientK8s) UpdateNodeAnnotation(key, value string, retryTimes int) error { if ki == nil { - return errors.New("ClientK8s is nil") + return errors.New("clientK8s is nil") } node, err := ki.GetNode() retry := 0 for err != nil && retry < retryTimes { node, err = ki.GetNode() + if err == nil { + break + } retry++ time.Sleep(time.Duration(retry) * time.Second) } diff --git a/pkg/server/manager.go b/pkg/server/manager.go index 34f6a4aa1bc6e3b4eaeacdd0e191802b94ed843a..c5f32ad3b18c9228ff00ea4b554aeaaad5386c77 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -252,15 +252,16 @@ func (hdm *HwDevManager) getSuperPodInfo() common.SuperPodInfo { } hwlog.RunLog.Infof("get super pod info: %v", superPodInfo) npuDevice.SuperDeviceID = superPodInfo.SdId - if result.ScaleType == common.ScaleTypeAbnormal { - result = common.SuperPodInfo{ - ScaleType: int32(superPodInfo.ScaleType), - SuperPodId: int32(superPodInfo.SuperPodId), - ServerId: int32(superPodInfo.ServerId), - } - for i := 0; i < len(superPodInfo.Reserve); i++ { - result.Reserve = append(result.Reserve, int32(superPodInfo.Reserve[i])) - } + if result.ScaleType != common.ScaleTypeAbnormal { + continue + } + result = common.SuperPodInfo{ + ScaleType: int32(superPodInfo.ScaleType), + SuperPodId: int32(superPodInfo.SuperPodId), + ServerId: int32(superPodInfo.ServerId), + } + for i := 0; i < len(superPodInfo.Reserve); i++ { + result.Reserve = append(result.Reserve, int32(superPodInfo.Reserve[i])) } } } diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index b8eccd054b8bd75c0615065cafe4b57b81c57684..1ce21d64300e18e7fbc65732962a2a4f383d0e90 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -485,7 +485,7 @@ func (ps *PluginServer) GetKltAndRealAllocateDev(podList []v1.Pod) ([]*common.Po if err != nil { return nil, fmt.Errorf("get pod resource failed, %v", err) } - var podDeviceInfo []*common.PodDeviceInfo + var podDeviceInfo = make([]*common.PodDeviceInfo, 0) for _, pod := range podList { podKey := pod.Namespace + common.UnderLine + pod.Name podResource, exist := podDevice[podKey] @@ -523,6 +523,7 @@ func (ps *PluginServer) GetKltAndRealAllocateDev(podList []v1.Pod) ([]*common.Po return podDeviceInfo, nil } +// GetRealAllocateDevices get real allocate devices from klt allocate devices func (ps *PluginServer) GetRealAllocateDevices(pod v1.Pod, kltAllocate []string) ([]string, error) { realDeviceList, err := ps.GetRealAllocateDevicesFromMap(kltAllocate) if err == nil {