diff --git a/main.go b/main.go index b0cbc2c8b11c48cc1c485b5d934231c9b3d112ef..6d3dd55c68087da16eaf35bc48c61dffcf759e89 100644 --- a/main.go +++ b/main.go @@ -25,7 +25,7 @@ import ( "huawei.com/npu-exporter/v6/devmanager" "Ascend-device-plugin/pkg/common" - "Ascend-device-plugin/pkg/device/deviceSwitch" + "Ascend-device-plugin/pkg/device/deviceswitch" "Ascend-device-plugin/pkg/server" ) @@ -207,7 +207,7 @@ func InitFunction() (*server.HwDevManager, error) { hwlog.RunLog.Info("init device manager success") common.ParamOption.EnableSwitchFault = true if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault { - switchDevMgr := deviceSwitch.NewSwitchDevManager() + switchDevMgr := deviceswitch.NewSwitchDevManager() if err := switchDevMgr.InitSwitchDev(); err != nil { hwlog.RunLog.Warnf("failed to init switch switch device manager, will not deal with switch fault, "+ "err: %s", err.Error()) diff --git a/pkg/common/device.go b/pkg/common/device.go index a80175372d0d5410beadb9a961ea9814e7dd79d8..a01c4a1e752e27331ab9ecdd8adadca31e547679 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -112,6 +112,8 @@ func GetSwitchFaultInfo() SwitchFaultInfo { faultLevel, NodeStatus = PreSeparateFaultLevelStr, "SubHealthy" case SeparateFaultLevel: faultLevel, NodeStatus = SeparateFaultLevelStr, "UnHealthy" + default: + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" } // keep those none zero codes reportFaultCodes := make([]string, 0) diff --git a/pkg/device/ascend910.go b/pkg/device/ascend910.go index 534168bbe2f5f7e0527a18451bdef96ec0fafe94..0f87f8a0634a06b02e2ff01129b7c5f2e51d01f4 100644 --- a/pkg/device/ascend910.go +++ b/pkg/device/ascend910.go @@ -1424,6 +1424,9 @@ func (hnm *HwAscend910Manager) refreshDevFaultInfo(devFaultInfo []*common.TaskDe return false, nil } for _, npuDevice := range devStatusList { + if int(npuDevice.LogicID) >= len(devFaultInfo) { + continue + } devFaultInfo[npuDevice.LogicID].ErrorCode = 
npuDevice.FaultCodes devFaultInfo[npuDevice.LogicID].Policy = hnm.hotResetManager. GetDevProcessPolicy(common.GetFaultType(npuDevice.FaultCodes, npuDevice.LogicID)) diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 08c7848f903acc31b6a1b0aa0f41f3ec5ae12eac..2b1698e9354e79f39413442a9b67052697d33c4a 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -35,7 +35,7 @@ import ( "k8s.io/utils/strings/slices" "Ascend-device-plugin/pkg/common" - "Ascend-device-plugin/pkg/device/deviceSwitch" + "Ascend-device-plugin/pkg/device/deviceswitch" "Ascend-device-plugin/pkg/kubeclient" ) @@ -233,7 +233,7 @@ func (tool *AscendTools) UpdateNodeDeviceInfo(devStatusSet common.DevStatusSet, // if subscribe failed, will use get interface if common.SwitchSubscribeFailed && common.ParamOption.EnableSwitchFault { var err error - newFaults, err := deviceSwitch.GetSwitchFaults() + newFaults, err := deviceswitch.GetSwitchFaults() common.SetSwitchFaultCode(newFaults) if err != nil { hwlog.RunLog.Error("failed to query all fault codes of switch") diff --git a/pkg/device/deviceSwitch/ascendSwitch.go b/pkg/device/deviceswitch/ascend_switch.go similarity index 98% rename from pkg/device/deviceSwitch/ascendSwitch.go rename to pkg/device/deviceswitch/ascend_switch.go index 4c93f061832beb48f0247535a0ad471226d0610a..d106d0961c5cb65dce43b3294e30979e2f29dc20 100644 --- a/pkg/device/deviceSwitch/ascendSwitch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -12,8 +12,8 @@ limitations under the License. 
*/ -// Package deviceSwitch functions of getting switch faults code -package deviceSwitch +// Package deviceswitch functions of getting switch faults code +package deviceswitch import ( "context" @@ -245,7 +245,8 @@ func (sdm *SwitchDevManager) SubscribeSwitchFaults() error { func GetSwitchFaults() ([]int64, error) { var errCount C.uint var errInfoArray [maxFaultNum]C.struct_LqDcmiEvent - if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { + if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, + &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { return []int64{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) } if int32(errCount) < 0 || int32(errCount) > maxFaultNum { diff --git a/pkg/device/deviceSwitch/library.h b/pkg/device/deviceswitch/library.h similarity index 100% rename from pkg/device/deviceSwitch/library.h rename to pkg/device/deviceswitch/library.h diff --git a/pkg/kubeclient/client_server.go b/pkg/kubeclient/client_server.go index 85da291d20e3e2a58c24707e950cad7ab0309bb7..ef7682934f51ee1c0318a2c5da8bf512d78be97e 100644 --- a/pkg/kubeclient/client_server.go +++ b/pkg/kubeclient/client_server.go @@ -169,7 +169,8 @@ func (ki *ClientK8s) GetManuallySeparateNPUIDFromDeviceInfo(deviceInfoCMName, de // WriteDeviceInfoDataIntoCM write deviceinfo into config map func (ki *ClientK8s) WriteDeviceInfoDataIntoCM(deviceInfo map[string]string, - manuallySeparateNPU string, switchInfo common.SwitchFaultInfo, superPodID, serverIndex int32) (*common.NodeDeviceInfoCache, error) { + manuallySeparateNPU string, switchInfo common.SwitchFaultInfo, superPodID, + serverIndex int32) (*common.NodeDeviceInfoCache, error) { var nodeDeviceData = common.NodeDeviceInfoCache{ DeviceInfo: common.NodeDeviceInfo{ diff --git a/pkg/kubeclient/client_server_test.go b/pkg/kubeclient/client_server_test.go index 
8e2dc861d6a6278ce502dace9f432d6f23b5f6a6..da751a3c208aabfefc52d5aa870ff0acbaaffe8d 100644 --- a/pkg/kubeclient/client_server_test.go +++ b/pkg/kubeclient/client_server_test.go @@ -374,7 +374,8 @@ func resetMock(resetMockList ...*gomonkey.Patches) { func annotationResetMock(devErr, stateErr, nodeErr error) (*gomonkey.Patches, *gomonkey.Patches, *gomonkey.Patches) { node := getMockNode(common.HuaweiAscend910, npuChip910PhyID0) mockWrite := gomonkey.ApplyMethod(reflect.TypeOf(new(ClientK8s)), "WriteDeviceInfoDataIntoCM", - func(_ *ClientK8s, _ map[string]string, _ string, _ common.SwitchFaultInfo, _, _ int32) (*common.NodeDeviceInfoCache, error) { + func(_ *ClientK8s, _ map[string]string, _ string, _ common.SwitchFaultInfo, _, _ int32) ( + *common.NodeDeviceInfoCache, error) { return nil, devErr }) mockPatchNode := gomonkey.ApplyMethod(reflect.TypeOf(new(ClientK8s)), "PatchNodeState", diff --git a/pkg/kubeclient/kube_cache.go b/pkg/kubeclient/kube_cache.go index 09b80e3ab5edb36c731682da3fd39edfb9e5fc8f..f542f059f9025b1ee7279f466d51a8baa9e08ee6 100644 --- a/pkg/kubeclient/kube_cache.go +++ b/pkg/kubeclient/kube_cache.go @@ -190,7 +190,8 @@ func (ki *ClientK8s) GetDeviceInfoCMCache() *common.NodeDeviceInfoCache { // WriteDeviceInfoDataIntoCMCache write deviceinfo into config map with cache func (ki *ClientK8s) WriteDeviceInfoDataIntoCMCache(deviceInfo map[string]string, manuallySeparateNPU string, switchInfo common.SwitchFaultInfo, superPodID, serverIndex int32) error { - newNodeDeviceInfoCache, err := ki.WriteDeviceInfoDataIntoCM(deviceInfo, manuallySeparateNPU, switchInfo, superPodID, serverIndex) + newNodeDeviceInfoCache, err := ki.WriteDeviceInfoDataIntoCM(deviceInfo, manuallySeparateNPU, switchInfo, + superPodID, serverIndex) if err != nil { return err } diff --git a/pkg/server/manager.go b/pkg/server/manager.go index 5647471db33e76b663dba86bdc0cc7641a1a8e5f..8f5ba3102c0292c8d011f3425471f8345abe069c 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ 
-36,7 +36,7 @@ import (
 
 	"Ascend-device-plugin/pkg/common"
 	"Ascend-device-plugin/pkg/device"
-	"Ascend-device-plugin/pkg/device/deviceSwitch"
+	"Ascend-device-plugin/pkg/device/deviceswitch"
 	"Ascend-device-plugin/pkg/kubeclient"
 )
 
@@ -44,7 +44,7 @@ var lastStatus = common.NewAtomicBool(false)
 
 // HwDevManager manages huawei device devices.
 type HwDevManager struct {
-	SwitchDevManager *deviceSwitch.SwitchDevManager
+	SwitchDevManager *deviceswitch.SwitchDevManager
 	groupDevice      map[string][]*common.NpuDevice
 	ServerMap        map[string]InterfaceServer
 	allInfo          common.NpuAllInfo
@@ -1024,23 +1024,31 @@ func (hdm *HwDevManager) pollFaultCodeCM(ctx context.Context) {
 				time.Sleep(time.Duration(common.PollFaultCodeCMInterval) * time.Second)
 				continue
 			}
-			if resourceVersion != configMap.ResourceVersion {
-				hwlog.RunLog.Infof("detect '%s' configmap changed", common.FaultCodeCMName)
-				interval = getFaultCodeCMPollInterval(configMap)
-				resourceVersion = configMap.ResourceVersion
-				loadFaultCode(configMap)
-				if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault {
-					loadSwitchFaultCode(configMap)
-					deviceSwitch.UpdateSwitchFaultLevel()
-				}
-				loadFaultCustomization(configMap)
-				hwlog.RunLog.Infof("handling '%s' configmap change complete", common.FaultCodeCMName)
-			}
+			interval = getFaultCodeCMPollInterval(configMap)
+			resourceVersion = updateFaultConfigFromCm(resourceVersion, configMap)
 			time.Sleep(time.Duration(interval) * time.Second)
 		}
 	}
 }
 
+// updateFaultConfigFromCm reloads the fault code, switch fault code and fault customization
+// settings from configMap when its ResourceVersion differs from resourceVersion. It returns the
+// resource version the caller must remember for the next poll (strings are passed by value).
+func updateFaultConfigFromCm(resourceVersion string, configMap *v1.ConfigMap) string {
+	if resourceVersion == configMap.ResourceVersion {
+		return resourceVersion
+	}
+	hwlog.RunLog.Infof("detect '%s' configmap changed", common.FaultCodeCMName)
+	loadFaultCode(configMap)
+	if common.ParamOption.RealCardType == common.Ascend910A3 && common.ParamOption.EnableSwitchFault {
+		loadSwitchFaultCode(configMap)
+		deviceswitch.UpdateSwitchFaultLevel()
+	}
+	loadFaultCustomization(configMap)
+	hwlog.RunLog.Infof("handling '%s' configmap change complete", common.FaultCodeCMName)
+	return configMap.ResourceVersion
+}
+
 func initFaultInfoFromFile() {
 	if err := common.LoadFaultCodeFromFile(); err != nil {
 		hwlog.RunLog.Errorf("load fault code from file failed, err: %v", err)
@@ -1053,7 +1061,7 @@ func initFaultInfoFromFile() {
 			hwlog.RunLog.Errorf("load switch fault code from file failed, err: %v", err)
 			return
 		}
-		deviceSwitch.UpdateSwitchFaultLevel()
+		deviceswitch.UpdateSwitchFaultLevel()
 	}
 }
 