// RandomInt64 returns a cryptographically secure random number drawn uniformly
// from the half-open range [min, max).
//
// If the range is empty (max <= min) there is nothing to draw from, so min is
// returned instead of panicking on a zero modulus. If the system random source
// fails, min is returned as well, so the result never falls outside the
// requested range.
func RandomInt64(min, max int64) int64 {
	if max <= min {
		return min
	}

	// Compute the span with big.Int so that max-min cannot overflow int64
	// (e.g. min = math.MinInt64, max = math.MaxInt64).
	span := new(big.Int).Sub(big.NewInt(max), big.NewInt(min))

	// rand.Int yields a uniform value in [0, span); span is strictly positive
	// here, which is the precondition rand.Int requires.
	offset, err := rand.Int(rand.Reader, span)
	if err != nil {
		return min
	}

	// offset + min lies in [min, max), so it always fits in an int64.
	return offset.Add(offset, big.NewInt(min)).Int64()
}
subscribe failed, will use get interface if common.SwitchSubscribeFailed && common.ParamOption.EnableSwitchFault { @@ -264,14 +264,18 @@ func (tool *AscendTools) annotateWithSubHealthy(devStatusSet common.DevStatusSet break } } - if firstUpdate || preSubHealthy != curSubHealthyStatus { - preSubHealthy = curSubHealthyStatus + if firstUpdate || (preSubHealthy != curSubHealthyStatus) { + if firstUpdate { + sleepTime := common.RandomInt64(0, common.FirstUpdateMaxSleepMilliSecond) + time.Sleep(time.Duration(sleepTime) * time.Millisecond) + } err := tool.client.UpdateNodeAnnotation(common.SubHealthyAnnotationKey, strconv.FormatBool(curSubHealthyStatus), common.UpdateAnnotationRetryTimes) if err == nil { + preSubHealthy = curSubHealthyStatus firstUpdate = false } else { - hwlog.RunLog.Warnf("update node annotation failed, err: %v", err) + hwlog.RunLog.Errorf("update node annotation failed, err: %v", err) } } }