From 1dc3007989da3dc2793daa05f76807ea2cdb9f5d Mon Sep 17 00:00:00 2001 From: w00631655 Date: Wed, 6 Sep 2023 16:54:24 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=95=85=E9=9A=9C=E7=A0=81=E5=AF=B9?= =?UTF-8?q?=E5=A4=96=E5=91=88=E7=8E=B0=E4=BD=BF=E7=94=A8=E5=A4=A7=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/proto.go | 13 +++++++------ pkg/device/ascendcommon.go | 2 +- pkg/kubeclient/client_server.go | 23 ++++++++++++++++------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/pkg/common/proto.go b/pkg/common/proto.go index 3b21b49a..c8a4ab88 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -113,12 +113,12 @@ func GetAllDeviceInfoTypeList() map[string]struct{} { return map[string]struct{}{HuaweiUnHealthAscend910: {}, HuaweiNetworkUnHealthAscend910: {}, ResourceNamePrefix + Ascend910: {}, ResourceNamePrefix + Ascend910c2: {}, ResourceNamePrefix + Ascend910c4: {}, ResourceNamePrefix + Ascend910c8: {}, - ResourceNamePrefix + Ascend910c16: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb8: {}, + ResourceNamePrefix + Ascend910c16: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb8: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb16: {}, ResourceNamePrefix + Ascend910c6Cpu1Gb16: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb16: {}, ResourceNamePrefix + Ascend910c3Cpu1Gb8: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb16Ndvpp: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb32: {}, ResourceNamePrefix + Ascend910c10Cpu4Gb16Dvpp: {}, - ResourceNamePrefix + Ascend910c12Cpu3Gb32: {}, ResourceNamePrefix + Ascend310: {}, + ResourceNamePrefix + Ascend910c12Cpu3Gb32: {}, ResourceNamePrefix + Ascend310: {}, ResourceNamePrefix + Ascend310P: {}, ResourceNamePrefix + Ascend310Pc1: {}, ResourceNamePrefix + Ascend310Pc2: {}, ResourceNamePrefix + Ascend310Pc4: {}, ResourceNamePrefix + Ascend310Pc2Cpu1: {}, ResourceNamePrefix + Ascend310Pc4Cpu3: {}, @@ -169,10 +169,11 @@ type TaskDevInfo struct { // DevFaultInfo is the fault info of device type DevFaultInfo struct { - LogicId int32 - Status string - Policy string - ErrorCode []int64 + LogicId int32 + Status string + Policy string + ErrorCode []int64 + ErrorCodeHex string } // TaskFaultInfoCache record task fault rank information cache diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 3b183edd..d82313ae 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -316,7 +316,7 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, NPUName: device.DeviceName, LargeModelFaultLevel: common.GetLargeModelFaultTypeByCode(device.FaultCodes), FaultLevel: common.GetFaultTypeByCode(device.FaultCodes), - FaultCode: common.Int64Tool.ToHexString(device.FaultCodes), + FaultCode: strings.ToUpper(common.Int64Tool.ToHexString(device.FaultCodes)), }) } if device.Health == v1beta1.Healthy { diff --git a/pkg/kubeclient/client_server.go b/pkg/kubeclient/client_server.go index 0a05ceda..e75c85e4 100644 --- a/pkg/kubeclient/client_server.go +++ b/pkg/kubeclient/client_server.go @@ -118,22 +118,20 @@ func (ki *ClientK8s) WriteResetInfoDataIntoCM(taskName string, namespace string, hwlog.RunLog.Errorf("failed to get reset cm of task %s, err: %#v", taskName, err) return nil, err } - taskResetInfo := &common.TaskResetInfoCache{ - ResetInfo: taskInfo, - } oldResetInfoData, ok := oldCM.Data[common.ResetInfoCMDataKey] if !ok { return nil, fmt.Errorf("invalid reset info data") } - if strings.Contains(oldResetInfoData, common.IsolateError) && len(taskResetInfo.ResetInfo.RankList) != 0 { + if strings.Contains(oldResetInfoData, common.IsolateError) && len(taskInfo.RankList) != 0 { return nil, fmt.Errorf("task should be rescheduled") } - taskResetInfo.ResetInfo.UpdateTime = time.Now().Unix() - checkCode := common.MakeDataHash(taskResetInfo.ResetInfo) + newTaskInfo := setNewTaskInfoWithHexString(taskInfo) + newTaskInfo.UpdateTime = time.Now().Unix() + checkCode := common.MakeDataHash(newTaskInfo) var data []byte - if data = common.MarshalData(taskResetInfo.ResetInfo); len(data) == 0 { + if data = common.MarshalData(newTaskInfo); len(data) == 0 { return nil, fmt.Errorf("marshal task reset data failed") } resetInfoCM := &v1.ConfigMap{ @@ -149,6 +147,17 @@ func (ki *ClientK8s) WriteResetInfoDataIntoCM(taskName string, namespace string, return ki.UpdateConfigMap(resetInfoCM) } +func setNewTaskInfoWithHexString(taskInfo *common.TaskResetInfo) *common.TaskResetInfo { + var newTaskInfo common.TaskResetInfo + for _, deviceInfo := range taskInfo.RankList { + newDeviceInfo := *deviceInfo + newDeviceInfo.ErrorCodeHex = strings.ToUpper(common.Int64Tool.ToHexString(newDeviceInfo.ErrorCode)) + newDeviceInfo.ErrorCode = []int64{} + newTaskInfo.RankList = append(newTaskInfo.RankList, &newDeviceInfo) + } + return &newTaskInfo +} + // WriteFaultInfoDataIntoCM write fault info into config map func (ki *ClientK8s) WriteFaultInfoDataIntoCM(taskName string, namespace string, faultInfo *common.TaskFaultInfo) (*v1.ConfigMap, error) { -- Gitee From b3ce0833e72d112e0c15d04e83de4fdac79a9c66 Mon Sep 17 00:00:00 2001 From: w00631655 Date: Tue, 12 Sep 2023 10:26:03 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=8E=BB=E9=99=A4k8s=E8=B5=84=E6=BA=90?= =?UTF-8?q?=E8=BF=87=E6=9C=9F=E6=97=B6=E9=97=B4=E9=99=90=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.go | 14 -------------- pkg/common/proto.go | 1 - 2 files changed, 15 deletions(-) diff --git a/main.go b/main.go index 5177018a..e55f05bf 100644 --- a/main.go +++ b/main.go @@ -40,13 +40,6 @@ const ( // minListWatchPeriod is the min listening device state's period minListWatchPeriod = 3 maxLogLineLength = 1024 - - // defaultCacheExpirePeriod is the default k8s cache expire period - defaultCacheExpirePeriod = 30 - // maxCacheExpirePeriod is the max k8s cache expire period - maxCacheExpirePeriod = 60 - // minCacheExpirePeriod is the min k8s cache expire period - minCacheExpirePeriod = 0 ) var ( @@ -77,8 +70,6 @@ var ( useLargeModel = flag.Bool("useLargeModel", false, "Whether to use large model") shareDevCount = flag.Uint("shareDevCount", 1, "share device function, enable the func by setting "+ "a value greater than 1, range is [1, 100], only support 310B") - cacheExpirePeriod = flag.Int64("cacheExpirePeriod", defaultCacheExpirePeriod, "k8s resource cache expire period, "+ - "second unit, 0 means not to use cache, range [0, 60]") ) var ( @@ -152,10 +143,6 @@ func checkParam() bool { hwlog.RunLog.Error("unSupport build scene, only support edge and center") return false } - if (*cacheExpirePeriod) < minCacheExpirePeriod || (*cacheExpirePeriod) > maxCacheExpirePeriod { - hwlog.RunLog.Warn("cacheExpirePeriod period out of range") - return false - } return checkShareDevCount() } @@ -218,7 +205,6 @@ func setParameters() { PresetVDevice: *presetVirtualDevice, Use310PMixedInsert: *use310PMixedInsert, HotReset: *hotReset, - CacheExpirePeriod: *cacheExpirePeriod, UseLargeModel: *useLargeModel, BuildScene: BuildScene, ShareCount: *shareDevCount, diff --git a/pkg/common/proto.go b/pkg/common/proto.go index c8a4ab88..d83494c2 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -105,7 +105,6 @@ type Option struct { BuildScene string // build scene judge device-plugin start scene ProductTypes []string // all product types RealCardType string // real card type - CacheExpirePeriod int64 // k8s resource expire period } // GetAllDeviceInfoTypeList Get All Device Info Type List -- Gitee