diff --git a/main.go b/main.go index 5177018a89e8e0b8a1992ec0510e1eab34e0f336..e55f05bff59c32fb1348110678bcc1f179102873 100644 --- a/main.go +++ b/main.go @@ -40,13 +40,6 @@ const ( // minListWatchPeriod is the min listening device state's period minListWatchPeriod = 3 maxLogLineLength = 1024 - - // defaultCacheExpirePeriod is the default k8s cache expire period - defaultCacheExpirePeriod = 30 - // maxCacheExpirePeriod is the max k8s cache expire period - maxCacheExpirePeriod = 60 - // minCacheExpirePeriod is the min k8s cache expire period - minCacheExpirePeriod = 0 ) var ( @@ -77,8 +70,6 @@ var ( useLargeModel = flag.Bool("useLargeModel", false, "Whether to use large model") shareDevCount = flag.Uint("shareDevCount", 1, "share device function, enable the func by setting "+ "a value greater than 1, range is [1, 100], only support 310B") - cacheExpirePeriod = flag.Int64("cacheExpirePeriod", defaultCacheExpirePeriod, "k8s resource cache expire period, "+ - "second unit, 0 means not to use cache, range [0, 60]") ) var ( @@ -152,10 +143,6 @@ func checkParam() bool { hwlog.RunLog.Error("unSupport build scene, only support edge and center") return false } - if (*cacheExpirePeriod) < minCacheExpirePeriod || (*cacheExpirePeriod) > maxCacheExpirePeriod { - hwlog.RunLog.Warn("cacheExpirePeriod period out of range") - return false - } return checkShareDevCount() } @@ -218,7 +205,6 @@ func setParameters() { PresetVDevice: *presetVirtualDevice, Use310PMixedInsert: *use310PMixedInsert, HotReset: *hotReset, - CacheExpirePeriod: *cacheExpirePeriod, UseLargeModel: *useLargeModel, BuildScene: BuildScene, ShareCount: *shareDevCount, diff --git a/pkg/common/proto.go b/pkg/common/proto.go index 3b21b49abd3eba80952bebf2a372af4bc7c8ba2c..d83494c2bb5ac47060c6070a0d61a7d340d7c2a6 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -105,7 +105,6 @@ type Option struct { BuildScene string // build scene judge device-plugin start scene ProductTypes []string // all product types RealCardType string // real card type - CacheExpirePeriod int64 // k8s resource expire period } // GetAllDeviceInfoTypeList Get All Device Info Type List @@ -113,12 +112,12 @@ func GetAllDeviceInfoTypeList() map[string]struct{} { return map[string]struct{}{HuaweiUnHealthAscend910: {}, HuaweiNetworkUnHealthAscend910: {}, ResourceNamePrefix + Ascend910: {}, ResourceNamePrefix + Ascend910c2: {}, ResourceNamePrefix + Ascend910c4: {}, ResourceNamePrefix + Ascend910c8: {}, - ResourceNamePrefix + Ascend910c16: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb8: {}, + ResourceNamePrefix + Ascend910c16: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb8: {}, ResourceNamePrefix + Ascend910c5Cpu1Gb16: {}, ResourceNamePrefix + Ascend910c6Cpu1Gb16: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb16: {}, ResourceNamePrefix + Ascend910c3Cpu1Gb8: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb16Ndvpp: {}, ResourceNamePrefix + Ascend910c10Cpu3Gb32: {}, ResourceNamePrefix + Ascend910c10Cpu4Gb16Dvpp: {}, - ResourceNamePrefix + Ascend910c12Cpu3Gb32: {}, ResourceNamePrefix + Ascend310: {}, + ResourceNamePrefix + Ascend910c12Cpu3Gb32: {}, ResourceNamePrefix + Ascend310: {}, ResourceNamePrefix + Ascend310P: {}, ResourceNamePrefix + Ascend310Pc1: {}, ResourceNamePrefix + Ascend310Pc2: {}, ResourceNamePrefix + Ascend310Pc4: {}, ResourceNamePrefix + Ascend310Pc2Cpu1: {}, ResourceNamePrefix + Ascend310Pc4Cpu3: {}, @@ -169,10 +168,11 @@ type TaskDevInfo struct { // DevFaultInfo is the fault info of device type DevFaultInfo struct { - LogicId int32 - Status string - Policy string - ErrorCode []int64 + LogicId int32 + Status string + Policy string + ErrorCode []int64 + ErrorCodeHex string } // TaskFaultInfoCache record task fault rank information cache diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 3b183eddbff8f11e9ff2aef7a8536a25c3e49ece..d82313aef9a9e4502c90811da38ca2831ca34e34 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -316,7 +316,7 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, NPUName: device.DeviceName, LargeModelFaultLevel: common.GetLargeModelFaultTypeByCode(device.FaultCodes), FaultLevel: common.GetFaultTypeByCode(device.FaultCodes), - FaultCode: common.Int64Tool.ToHexString(device.FaultCodes), + FaultCode: strings.ToUpper(common.Int64Tool.ToHexString(device.FaultCodes)), }) } if device.Health == v1beta1.Healthy { diff --git a/pkg/kubeclient/client_server.go b/pkg/kubeclient/client_server.go index 0a05ceda8467e8fa3c34e5b969320530e831571a..e75c85e4889e42de8675b1c295f7090a694a25d4 100644 --- a/pkg/kubeclient/client_server.go +++ b/pkg/kubeclient/client_server.go @@ -118,22 +118,20 @@ func (ki *ClientK8s) WriteResetInfoDataIntoCM(taskName string, namespace string, hwlog.RunLog.Errorf("failed to get reset cm of task %s, err: %#v", taskName, err) return nil, err } - taskResetInfo := &common.TaskResetInfoCache{ - ResetInfo: taskInfo, - } oldResetInfoData, ok := oldCM.Data[common.ResetInfoCMDataKey] if !ok { return nil, fmt.Errorf("invalid reset info data") } - if strings.Contains(oldResetInfoData, common.IsolateError) && len(taskResetInfo.ResetInfo.RankList) != 0 { + if strings.Contains(oldResetInfoData, common.IsolateError) && len(taskInfo.RankList) != 0 { return nil, fmt.Errorf("task should be rescheduled") } - taskResetInfo.ResetInfo.UpdateTime = time.Now().Unix() - checkCode := common.MakeDataHash(taskResetInfo.ResetInfo) + newTaskInfo := setNewTaskInfoWithHexString(taskInfo) + newTaskInfo.UpdateTime = time.Now().Unix() + checkCode := common.MakeDataHash(newTaskInfo) var data []byte - if data = common.MarshalData(taskResetInfo.ResetInfo); len(data) == 0 { + if data = common.MarshalData(newTaskInfo); len(data) == 0 { return nil, fmt.Errorf("marshal task reset data failed") } resetInfoCM := &v1.ConfigMap{ @@ -149,6 +147,17 @@ func (ki *ClientK8s) WriteResetInfoDataIntoCM(taskName string, namespace string, return ki.UpdateConfigMap(resetInfoCM) } +func setNewTaskInfoWithHexString(taskInfo *common.TaskResetInfo) *common.TaskResetInfo { + var newTaskInfo common.TaskResetInfo + for _, deviceInfo := range taskInfo.RankList { + newDeviceInfo := *deviceInfo + newDeviceInfo.ErrorCodeHex = strings.ToUpper(common.Int64Tool.ToHexString(newDeviceInfo.ErrorCode)) + newDeviceInfo.ErrorCode = []int64{} + newTaskInfo.RankList = append(newTaskInfo.RankList, &newDeviceInfo) + } + return &newTaskInfo +} + // WriteFaultInfoDataIntoCM write fault info into config map func (ki *ClientK8s) WriteFaultInfoDataIntoCM(taskName string, namespace string, faultInfo *common.TaskFaultInfo) (*v1.ConfigMap, error) {