From 410c76449a70b04690c5d2dc9d3b54e68f10bef8 Mon Sep 17 00:00:00 2001 From: fengjianqing <1416100064@qq.com> Date: Tue, 30 Jul 2024 20:13:55 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=8A=A8=E6=80=81=E7=AE=97?= =?UTF-8?q?=E5=8A=9B=E5=88=87=E5=88=86=E5=9C=BA=E6=99=AF=E3=80=82local-wor?= =?UTF-8?q?ld-size=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E7=94=B1dp=E6=9D=A5?= =?UTF-8?q?=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/common.go | 8 +++++++- pkg/common/constants.go | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/common/common.go b/pkg/common/common.go index 33969a9d..f0c2563b 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -95,7 +95,13 @@ func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp *v1bet if ParamOption.RealCardType == Ascend310B { (*resp).Envs[ascendAllowLinkEnv] = "True" } - + // dynamic cut, dp write the env which job use npu num to container + if !ParamOption.PresetVDevice { + (*resp).Envs[msWorkerNumEnv] = strconv.Itoa(len(deviceStr)) + (*resp).Envs[ptWorldSizeEnv] = strconv.Itoa(len(deviceStr)) + (*resp).Envs[ptLocalWorldSizeEnv] = strconv.Itoa(len(deviceStr)) + (*resp).Envs[tfWorkerSizeEnv] = strconv.Itoa(len(deviceStr)) + } hwlog.RunLog.Infof("allocate resp env: %s; %s", (*resp).Envs[AscendVisibleDevicesEnv], ascendRuntimeOptions) } diff --git a/pkg/common/constants.go b/pkg/common/constants.go index d580943b..89ebef62 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -165,6 +165,10 @@ const ( SecondMagnification = 1000 // SecondMagnificationFloat is second-level unit magnification float SecondMagnificationFloat = 1000.0 + ptWorldSizeEnv = "WORLD_SIZE" + ptLocalWorldSizeEnv = "Local_WORLD_SIZE" + tfWorkerSizeEnv = "CM_WORKER_SIZE" + msWorkerNumEnv = "MS_WORKER_NUM" ) const ( -- Gitee From 
73dcd53a1cbadadf76de8cfaf2a52d0d272ec97a Mon Sep 17 00:00:00 2001 From: fengjianqing <1416100064@qq.com> Date: Wed, 31 Jul 2024 11:55:36 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=8A=A8=E6=80=81=E7=AE=97?= =?UTF-8?q?=E5=8A=9B=E5=88=87=E5=88=86=E5=9C=BA=E6=99=AF=E3=80=82local-wor?= =?UTF-8?q?ld-size=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E7=94=B1dp=E6=9D=A5?= =?UTF-8?q?=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 89ebef62..15b1c3bc 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -166,7 +166,7 @@ const ( // SecondMagnificationFloat is second-level unit magnification float SecondMagnificationFloat = 1000.0 ptWorldSizeEnv = "WORLD_SIZE" - ptLocalWorldSizeEnv = "Local_WORLD_SIZE" + ptLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" tfWorkerSizeEnv = "CM_WORKER_SIZE" msWorkerNumEnv = "MS_WORKER_NUM" ) -- Gitee From bf83db7b834fb26c6c37d6b1d777e9a3f2945de8 Mon Sep 17 00:00:00 2001 From: fengjianqing <1416100064@qq.com> Date: Wed, 31 Jul 2024 14:38:18 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=8A=A8=E6=80=81=E7=AE=97?= =?UTF-8?q?=E5=8A=9B=E5=88=87=E5=88=86=E5=9C=BA=E6=99=AF=E3=80=82local-wor?= =?UTF-8?q?ld-size=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E7=94=B1dp=E6=9D=A5?= =?UTF-8?q?=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/common.go | 2 ++ pkg/common/constants.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg/common/common.go b/pkg/common/common.go index f0c2563b..d7c018dc 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -97,10 +97,12 @@ func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp 
*v1bet } // dynamic cut, dp write the env which job use npu num to container if !ParamOption.PresetVDevice { + (*resp).Envs[msLocalWorker] = strconv.Itoa(len(deviceStr)) (*resp).Envs[msWorkerNumEnv] = strconv.Itoa(len(deviceStr)) (*resp).Envs[ptWorldSizeEnv] = strconv.Itoa(len(deviceStr)) (*resp).Envs[ptLocalWorldSizeEnv] = strconv.Itoa(len(deviceStr)) (*resp).Envs[tfWorkerSizeEnv] = strconv.Itoa(len(deviceStr)) + (*resp).Envs[tfLocalWorker] = strconv.Itoa(len(deviceStr)) } hwlog.RunLog.Infof("allocate resp env: %s; %s", (*resp).Envs[AscendVisibleDevicesEnv], ascendRuntimeOptions) } diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 15b1c3bc..22b8b76c 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -168,7 +168,9 @@ const ( ptWorldSizeEnv = "WORLD_SIZE" ptLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" tfWorkerSizeEnv = "CM_WORKER_SIZE" + tfLocalWorker = "CM_LOCAL_WORKER" msWorkerNumEnv = "MS_WORKER_NUM" + msLocalWorker = "MS_LOCAL_WORKER" ) const ( -- Gitee From 107fd325457d7226eaa9dfedfad1370476cb3aff Mon Sep 17 00:00:00 2001 From: fengjianqing <1416100064@qq.com> Date: Wed, 31 Jul 2024 14:58:18 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=8A=A8=E6=80=81=E7=AE=97?= =?UTF-8?q?=E5=8A=9B=E5=88=87=E5=88=86=E5=9C=BA=E6=99=AF=E3=80=82local-wor?= =?UTF-8?q?ld-size=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E7=94=B1dp=E6=9D=A5?= =?UTF-8?q?=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/common.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/common.go b/pkg/common/common.go index d7c018dc..7e4828d4 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -95,7 +95,7 @@ func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp *v1bet if ParamOption.RealCardType == Ascend310B { (*resp).Envs[ascendAllowLinkEnv] = "True" } - // dynamic cut, dp write the 
env which job use npu num to container + // npu dynamic cut: dp, instead of ascend-operator, writes the env holding the number of npus the job uses into the container if !ParamOption.PresetVDevice { (*resp).Envs[msLocalWorker] = strconv.Itoa(len(deviceStr)) (*resp).Envs[msWorkerNumEnv] = strconv.Itoa(len(deviceStr)) -- Gitee From 262f0f3f67994f97bb474791d58103b5d07461a6 Mon Sep 17 00:00:00 2001 From: fengjianqing <1416100064@qq.com> Date: Wed, 31 Jul 2024 18:47:29 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=8A=A8=E6=80=81=E7=AE=97?= =?UTF-8?q?=E5=8A=9B=E5=88=87=E5=88=86=E5=9C=BA=E6=99=AF=E3=80=82local-wor?= =?UTF-8?q?ld-size=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E7=94=B1dp=E6=9D=A5?= =?UTF-8?q?=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/common.go | 21 +++++++++++++++++++++ pkg/common/constants.go | 6 ------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pkg/common/common.go b/pkg/common/common.go index 7e4828d4..e28a1182 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -51,6 +51,17 @@ var ( } ) +// the env keys of the pt, ms and tf frameworks, used when the pod is a dynamic cut job +const ( + ptWorldSizeEnv = "WORLD_SIZE" + ptLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" + tfWorkerSizeEnv = "CM_WORKER_SIZE" + tfLocalWorker = "CM_LOCAL_WORKER" + msWorkerNumEnv = "MS_WORKER_NUM" + msLocalWorker = "MS_LOCAL_WORKER" + ptLocalRank = "LOCAL_RANK" +) + // ServerInfo used for pass parameters type ServerInfo struct { ServerID string @@ -101,12 +112,22 @@ func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp *v1bet (*resp).Envs[msWorkerNumEnv] = strconv.Itoa(len(deviceStr)) (*resp).Envs[ptWorldSizeEnv] = strconv.Itoa(len(deviceStr)) (*resp).Envs[ptLocalWorldSizeEnv] = strconv.Itoa(len(deviceStr)) + (*resp).Envs[ptLocalRank] = localRankStr(len(deviceStr)) (*resp).Envs[tfWorkerSizeEnv] = strconv.Itoa(len(deviceStr)) 
(*resp).Envs[tfLocalWorker] = strconv.Itoa(len(deviceStr)) } hwlog.RunLog.Infof("allocate resp env: %s; %s", (*resp).Envs[AscendVisibleDevicesEnv], ascendRuntimeOptions) } +func localRankStr(req int) string { + rankStr := "" + for i := 0; i < req-1; i++ { + rankStr += strconv.Itoa(i) + "," + } + rankStr += strconv.Itoa(req - 1) + return rankStr +} + // MakeDataHash Make Data Hash func MakeDataHash(data interface{}) string { var dataBuffer []byte diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 22b8b76c..d580943b 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -165,12 +165,6 @@ const ( SecondMagnification = 1000 // SecondMagnificationFloat is second-level unit magnification float SecondMagnificationFloat = 1000.0 - ptWorldSizeEnv = "WORLD_SIZE" - ptLocalWorldSizeEnv = "LOCAL_WORLD_SIZE" - tfWorkerSizeEnv = "CM_WORKER_SIZE" - tfLocalWorker = "CM_LOCAL_WORKER" - msWorkerNumEnv = "MS_WORKER_NUM" - msLocalWorker = "MS_LOCAL_WORKER" ) const ( -- Gitee