From ae27971669a3d49be844b1fa58dac6b2b4d1be70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Tue, 28 Oct 2025 20:05:07 +0800 Subject: [PATCH 1/7] open source rectification --- .../kernel_impl/faster_gelu_custom.h | 10 +++---- .../normalize/kernel_impl/normalize_custom.h | 20 +++++++------- .../welford_finalize_custom_tiling.h | 2 +- .../kernel_impl/welford_finalize_custom.h | 14 +++++----- .../welford_update_custom_tiling.h | 4 +-- .../kernel_impl/welford_update_custom.h | 14 +++++----- .../op_host/welford_update_custom_tiling.h | 4 +-- examples/reduce/sum/main.cpp | 2 +- .../reduce/sum/op_host/sum_custom_tiling.cpp | 16 ++++++------ .../reduce/sum/op_host/sum_custom_tiling.h | 2 +- .../reduce/sum/op_kernel/sum_custom_impl.h | 10 +++---- examples/sort/topk/kernel_impl/topk_custom.h | 16 ++++++------ .../topk_custom_tiling.cpp | 2 +- .../kernel_impl/init_global_memory_custom.h | 2 +- lib/quantization/ascend_antiquant.h | 24 ++++++++--------- .../groupnorm/test_operator_groupnorm.cpp | 9 ++++--- .../test_operator_welfordfinalize.cpp | 6 ++--- .../reduce_all/test_operator_reduce_all.cpp | 2 +- .../reduce_any/test_operator_reduce_any.cpp | 2 +- .../reduce_max/test_operator_reduce_max.cpp | 2 +- .../reduce_mean/test_operator_reduce_mean.cpp | 2 +- .../reduce_min/test_operator_reduce_min.cpp | 2 +- .../reduce_prod/test_operator_reduce_prod.cpp | 2 +- .../reduce_sum/test_operator_reduce_sum.cpp | 2 +- tests/reduce/sum/test_operator_sum.cpp | 16 +++++++----- .../test_operator_confusion_transpose.cpp | 26 +++++++++---------- 26 files changed, 109 insertions(+), 104 deletions(-) diff --git a/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h b/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h index ab84dacd..32602ded 100644 --- a/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h +++ b/examples/activation/fastergelu/kernel_impl/faster_gelu_custom.h @@ -21,12 +21,12 @@ struct VecTiling { template class KernelFasterGelu { public: __aicore__ inline KernelFasterGelu() {} - __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR dst_gm, uint32_t inputSize) + __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize) { dataSize = inputSize; - srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(src_gm), dataSize); - dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dst_gm), dataSize); + srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(srcGm), dataSize); + dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dstGm), dataSize); pipe.InitBuffer(inQueueX, 1, dataSize * sizeof(srcType)); pipe.InitBuffer(outQueue, 1, dataSize * sizeof(srcType)); @@ -50,8 +50,8 @@ private: AscendC::LocalTensor dstLocal = outQueue.AllocTensor(); AscendC::LocalTensor srcLocal = inQueueX.DeQue(); AscendC::FasterGelu(dstLocal, srcLocal, dataSize); - // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //开启高精度模式 - // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //开启高性能模式 + // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //Enable high precision mode + // AscendC::FasterGelu(dstLocal, srcLocal, dataSize); //Enable high performance mode outQueue.EnQue(dstLocal); inQueueX.FreeTensor(srcLocal); } diff --git a/examples/normalization/normalize/kernel_impl/normalize_custom.h b/examples/normalization/normalize/kernel_impl/normalize_custom.h index 89b1b960..5de70bb8 100644 --- a/examples/normalization/normalize/kernel_impl/normalize_custom.h +++ b/examples/normalization/normalize/kernel_impl/normalize_custom.h @@ 
-28,21 +28,21 @@ template class KernelNormalize { public: __aicore__ inline KernelNormalize() {} - __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gamma_gm, - GM_ADDR beta_gm, GM_ADDR output_gm, GM_ADDR outputRstd_gm, NormalizeTiling tilingData) { + __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMeanGm, GM_ADDR inputVarGm, GM_ADDR gammaGm, + GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR outputRstdGm, NormalizeTiling tilingData) { aLength = tilingData.aLength; rLength = tilingData.rLength; rLengthWithPadding = tilingData.rLengthWithPadding; tmpLocalBytes = tilingData.tmpLocalSize; uint32_t totalLength = aLength * rLengthWithPadding; - inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), totalLength); // [A, R] - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputMean_gm), aLength); // [A] - inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputVar_gm), aLength); // [A] - inputGamma_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gamma_gm), rLengthWithPadding); // [R] - inputBeta_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(beta_gm), rLengthWithPadding); // [R] - - output_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(output_gm), totalLength); - outputRstd_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(outputRstd_gm), aLength); + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputXGm), totalLength); // [A, R] + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputMeanGm), aLength); // [A] + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputVarGm), aLength); // [A] + inputGamma_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gammaGm), rLengthWithPadding); // [R] + inputBeta_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(betaGm), rLengthWithPadding); // [R] + + output_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(outputGm), totalLength); + outputRstd_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(outputRstdGm), aLength); pipe.InitBuffer(inQueueX, 1, sizeof(T) * totalLength); pipe.InitBuffer(inQueueMean, 1, sizeof(float) * aLength); diff --git a/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h b/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h index 13ac35cf..6a91d8ca 100644 --- a/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h +++ b/examples/normalization/welford_finalize/host_tiling/welford_finalize_custom_tiling.h @@ -38,7 +38,7 @@ void ComputeTiling(uint32_t rnLength, uint32_t abLength, uint32_t head, uint32_t ge::Shape srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesize = 4; // float类型 + uint32_t dtypesize = 4; // float type tiling.set_rnLength(rnLength); tiling.set_abLength(abLength); diff --git a/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h b/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h index f0b48af6..889905ea 100644 --- a/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h +++ b/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h @@ -32,8 +32,8 @@ class KernelWelfordFinalize { public: __aicore__ inline KernelWelfordFinalize() {} - __aicore__ inline void Init(GM_ADDR inputMean_gm, GM_ADDR inputVariance_gm, GM_ADDR counts_gm, GM_ADDR outputMean_gm, - GM_ADDR outputVariance_gm, VecTiling tilingData) + 
__aicore__ inline void Init(GM_ADDR inputMeanGm, GM_ADDR inputVarianceGm, GM_ADDR countsGm, GM_ADDR outputMeanGm, + GM_ADDR outputVarianceGm, VecTiling tilingData) { this->rnLength = tilingData.rnLength; this->abLength = tilingData.abLength; @@ -51,11 +51,11 @@ public: this->rRec = 1.0f / rLength; this->outLength = OUT_SIZE; - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputMean_gm), abLength); - inputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputVariance_gm), abLength); - inputcounts_global.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(counts_gm), abLength); - outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputMean_gm), outLength); - outputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputVariance_gm), outLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputMeanGm), abLength); + inputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputVarianceGm), abLength); + inputcounts_global.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(countsGm), abLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputMeanGm), outLength); + outputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputVarianceGm), outLength); pipe.InitBuffer(inQueueMean, 1, abLength * sizeof(dataType)); pipe.InitBuffer(inQueueVariance, 1, abLength * sizeof(dataType)); diff --git a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h index ef4335a3..b23571d0 100644 --- a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h +++ b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h @@ -36,8 +36,8 @@ void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t ab ge::Shape srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesizeT = 2; // half类型 - uint32_t dtypesizeU = 4; // float类型 + uint32_t dtypesizeT = 2; // half type + uint32_t dtypesizeU = 4; // float type tiling.set_inplace(inplace); tiling.set_nLength(nLength); diff --git a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h index e91dee4e..154a9b14 100644 --- a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h +++ b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h @@ -30,8 +30,8 @@ template (inputX_gm), bshLength); - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); - inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputXGm), bshLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMeanGm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVarGm), bshLength); - outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); - outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMeanGm), bshLength); + outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVarGm), bshLength); pipe.InitBuffer(inQueueX, 1, sizeof(T) * bshLength); pipe.InitBuffer(inQueueMean, 1, sizeof(U) * bshLength); 
diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h index ef4335a3..b23571d0 100644 --- a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h @@ -36,8 +36,8 @@ void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t ab ge::Shape srcShape(shapeVec); uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesizeT = 2; // half类型 - uint32_t dtypesizeU = 4; // float类型 + uint32_t dtypesizeT = 2; // half type + uint32_t dtypesizeU = 4; // float type tiling.set_inplace(inplace); tiling.set_nLength(nLength); diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp index c3ec5710..4d7da545 100644 --- a/examples/reduce/sum/main.cpp +++ b/examples/reduce/sum/main.cpp @@ -26,7 +26,7 @@ constexpr uint32_t M = 7; // outter constexpr uint32_t N = 2023; // inner_actual } -extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n); static bool CompareResult(const void *outputData, uint32_t outSize) { void *goldenData; diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp index ebfc06e6..5e3cbb4c 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.cpp +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -16,25 +16,25 @@ namespace { constexpr uint32_t PADDING_BYTE = 32U; } -void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n) { uint32_t minValue = 0; uint32_t maxValue = 0; - AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); + AscendC::GetSumMaxMinTmpSize(n, sizeof(uint32_t), false, maxValue, minValue); SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); - auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { + auto paddingFunc = [](const uint32_t n1, const uint32_t typeSize) -> uint32_t { if (typeSize == 0) { return 0; } - return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + return (n1 * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; }; - tiling->outter = M; - tiling->inner = paddingFunc(N, sizeof(uint32_t)); - tiling->n = N; + tiling->outter = m; + tiling->inner = paddingFunc(n, sizeof(uint32_t)); + tiling->n = n; tiling->tmpBufSize = minValue; - tiling->out_inner = paddingFunc(M, sizeof(uint32_t)); + tiling->out_inner = paddingFunc(m, sizeof(uint32_t)); } \ No newline at end of file diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h index e3947535..2df3f1ed 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.h +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -18,7 +18,7 @@ struct SumCustomTilingData { uint32_t outter; uint32_t n; uint32_t tmpBufSize; - uint32_t out_inner; + uint32_t outInner; }; #endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h index 71f8bef1..0f3a97c5 100644 --- 
a/examples/reduce/sum/op_kernel/sum_custom_impl.h +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -25,7 +25,7 @@ public: outter = tilingData.outter; n = tilingData.n; tmpBufSize = tilingData.tmpBufSize; - out_inner = tilingData.out_inner; + outInner = tilingData.outInner; params.inner = inner; params.outter = outter; @@ -36,7 +36,7 @@ public: pipe = pipeIn; pipe->InitBuffer(inQueue, 1, inner * outter * sizeof(T)); - pipe->InitBuffer(outQueue, 1, out_inner * sizeof(T)); + pipe->InitBuffer(outQueue, 1, outInner * sizeof(T)); pipe->InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); } __aicore__ inline void Process() { @@ -57,7 +57,7 @@ private: AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); T scalar(0); - AscendC::Duplicate(yLocal, scalar, out_inner); + AscendC::Duplicate(yLocal, scalar, outInner); AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); outQueue.EnQue(yLocal); @@ -66,7 +66,7 @@ private: } __aicore__ inline void CopyOut() { AscendC::LocalTensor yLocal = outQueue.DeQue(); - AscendC::DataCopy(yGm, yLocal, out_inner); + AscendC::DataCopy(yGm, yLocal, outInner); outQueue.FreeTensor(yLocal); } @@ -82,7 +82,7 @@ private: uint32_t outter = 0; uint32_t n = 0; uint32_t tmpBufSize = 0; - uint32_t out_inner = 0; + uint32_t outInner = 0; AscendC::SumParams params; }; } diff --git a/examples/sort/topk/kernel_impl/topk_custom.h b/examples/sort/topk/kernel_impl/topk_custom.h index ad24c567..7b0a1883 100644 --- a/examples/sort/topk/kernel_impl/topk_custom.h +++ b/examples/sort/topk/kernel_impl/topk_custom.h @@ -47,17 +47,17 @@ public: tmplocalBytes = tilingData.minsize; topKTilingData = tilingData.topKTilingData; k = tilingData.k; - // 计算k_pad + // calculate kPad if (sizeof(T) == sizeof(float)) { - k_pad = (k + K_FLOAT - 1) / K_FLOAT * K_FLOAT; + kPad = (k + K_FLOAT - 1) / K_FLOAT * K_FLOAT; } else { - k_pad = (k + K_HALF - 1) / K_HALF * K_HALF; + kPad = (k + K_HALF - 1) / K_HALF * K_HALF; } - kpad_index = (k + K_FLOAT) / K_FLOAT * K_FLOAT; + kPadIndex = (k + K_FLOAT) / K_FLOAT * K_FLOAT; isLargest = tilingData.isLargest; inDataSize = inner * outter; - outValueDataSize = k_pad * outter; - outIndexDataSize = kpad_index * outter; + outValueDataSize = kPad * outter; + outIndexDataSize = kPadIndex * outter; inputdexDataSize = inner; if (topkMode == true) { @@ -189,8 +189,8 @@ private: uint32_t outValueDataSize = 0; uint32_t outIndexDataSize = 0; uint32_t k; - uint32_t k_pad; - uint32_t kpad_index; + uint32_t kPad; + uint32_t kPadIndex; bool isLargest = true; TopkTiling topKTilingData; uint32_t outter; diff --git a/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp b/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp index cd5af26c..c2239d97 100644 --- a/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp +++ b/examples/sort/topk/kernel_launch_method_by_direct/topk_custom_tiling.cpp @@ -29,7 +29,7 @@ uint8_t* GenerateTiling(uint32_t k, uint32_t outter, uint32_t inner, uint32_t n, uint32_t maxsize = 0; uint32_t minsize = 0; - uint32_t dtypesize = 4; // float类型 + uint32_t dtypesize = 4; // float type platform_ascendc::PlatformAscendC* ascendcPlatform; if (socVersion != nullptr) { diff --git a/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h b/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h index 21ca183c..bb57df5f 100644 --- a/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h +++
b/examples/utils/init_global_memory/kernel_impl/init_global_memory_custom.h @@ -30,7 +30,7 @@ public: // init zGm value AscendC::InitGlobalMemory(zGm, INIT_SIZE, (float)(AscendC::GetBlockIdx())); - //需要插MTE2等MTE3的同步 + // a sync where MTE2 waits for MTE3 is required AscendC::TEventID eventIdMTE3ToMTE2 = GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2); AscendC::SetFlag(eventIdMTE3ToMTE2); AscendC::WaitFlag(eventIdMTE3ToMTE2); diff --git a/lib/quantization/ascend_antiquant.h b/lib/quantization/ascend_antiquant.h index 0275ae1e..c1c338f9 100644 --- a/lib/quantization/ascend_antiquant.h +++ b/lib/quantization/ascend_antiquant.h @@ -33,12 +33,12 @@ namespace AscendC { template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, const LocalTensor &offset, const LocalTensor &scale, - const LocalTensor &sharedTmpBuffer, const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, K, + AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, k, shapeInfo); } @@ -54,13 +54,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &scale, const LocalTensor &sharedTmpBuffer, const uint32_t K, + const LocalTensor &scale, const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, K, shapeInfo); + AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, k, shapeInfo); } /* ! @@ -75,13 +75,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &offset, const LocalTensor &scale, const uint32_t K, + const LocalTensor &offset, const LocalTensor &scale, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, K, shapeInfo); + AscendAntiQuantImpl(dst, src, offset, scale, k, shapeInfo); } /* ! @@ -98,12 +98,12 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, const OutputDataType offset, const OutputDataType scale, const LocalTensor &sharedTmpBuffer, - const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, K, + AscendAntiQuantImpl(dst, src, offset, scale, sharedTmpBuffer, k, shapeInfo); } @@ -119,13 +119,13 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const OutputDataType scale, const LocalTensor &sharedTmpBuffer, const uint32_t K, + const OutputDataType scale, const LocalTensor &sharedTmpBuffer, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, K, shapeInfo); + AscendAntiQuantImpl(dst, src, scale, sharedTmpBuffer, k, shapeInfo); } /* !
@@ -140,12 +140,12 @@ __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, c */ template __aicore__ inline void AscendAntiQuant(const LocalTensor &dst, const LocalTensor &src, - const OutputDataType offset, const OutputDataType scale, const uint32_t K, const AntiQuantShapeInfo& shapeInfo = {}) + const OutputDataType offset, const OutputDataType scale, const uint32_t k, const AntiQuantShapeInfo& shapeInfo = {}) { if ASCEND_IS_AIC { return; } - AscendAntiQuantImpl(dst, src, offset, scale, K, shapeInfo); + AscendAntiQuantImpl(dst, src, offset, scale, k, shapeInfo); } #pragma end_pipe } // namespace AscendC diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp index 7dd522b9..4fa37322 100644 --- a/tests/normalization/groupnorm/test_operator_groupnorm.cpp +++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp @@ -70,11 +70,12 @@ __aicore__ inline void GetGroupNormNDTillingInfo(const ShapeInfo& inputShapeInfo tiling.tmpBufSize = stackBufferSize / ONE_BLK_SIZE * ONE_BLK_SIZE / B32_BYTE_SIZE; tiling.oneTmpSize = (tiling.tmpBufSize - meanVarTotalSize) / tiling.numberOfTmpBuf; - // 为了使 MeanVarTensor 可以直接使用 Add 而不需使用 GetValue, 需保证每个迭代至少有8的整数倍组 group + // to enable MeanVarTensor to use Add directly without needing GetValue, + // it is necessary to ensure that the number of groups in each iteration is a multiple of 8 tiling.bsCurLength = tiling.oneTmpSize / (GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION * tiling.d * tiling.hwAlignSize) * GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION; - // 判断是否满足 smallShape 计算 + // determine whether the condition for smallShape is met uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE; while ((tiling.dhwAlignSize / (ONE_BLK_SIZE / B32_BYTE_SIZE)) % k != 0) { k--; @@ -82,7 +83,9 @@ __aicore__ inline void GetGroupNormNDTillingInfo(const ShapeInfo& inputShapeInfo tiling.smallShape = (tiling.hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) && (tiling.hwAlignSize * tiling.d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k); - // ReduceSum0级接口带来的约束, 根据DHW计算2次 ReduceSum 的 mask/repeat, 以及 DHW/bsCurLength 取值范围 + // the constraints introduced by the ReduceSum level-0 interface: + // calculate the mask/repeat for the 2 ReduceSum operations based on DHW, + // as well as the value range of DHW/bsCurLength if (tiling.smallShape) { uint32_t mask1{GROUPNORM_MAX_MASK_VAL}; if (tiling.dhwAlignSize > GROUPNORM_MAX_MASK_VAL) { diff --git a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp index c5b08ec1..c937fbfa 100644 --- a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp +++ b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp @@ -210,9 +210,9 @@ protected: {} }; -// 1、有尾块; -// 2、有counts; -// 3、 有buffer约束; +// 1. with tail block; +// 2. there is counts; +// 3.
there is buffer constraint INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_WelfordFinalize, WelfordFinalizeTestSuite, ::testing::Values( WelfordFinalizeTestParams { 4, 32, 4, 32, 4, 0, kernel_WelfordFinalize_test }, // !1 + !2 + !3 diff --git a/tests/reduce/reduce_all/test_operator_reduce_all.cpp b/tests/reduce/reduce_all/test_operator_reduce_all.cpp index 4cf97671..bd570304 100644 --- a/tests/reduce/reduce_all/test_operator_reduce_all.cpp +++ b/tests/reduce/reduce_all/test_operator_reduce_all.cpp @@ -180,7 +180,7 @@ TEST_P(ReduceAllTestsuite, ReduceAllOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_any/test_operator_reduce_any.cpp b/tests/reduce/reduce_any/test_operator_reduce_any.cpp index bf4335fc..9a09028e 100644 --- a/tests/reduce/reduce_any/test_operator_reduce_any.cpp +++ b/tests/reduce/reduce_any/test_operator_reduce_any.cpp @@ -179,7 +179,7 @@ TEST_P(ReduceAnyTestsuite, ReduceAnyOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_max/test_operator_reduce_max.cpp b/tests/reduce/reduce_max/test_operator_reduce_max.cpp index 25891741..8d8ae90c 100644 --- a/tests/reduce/reduce_max/test_operator_reduce_max.cpp +++ b/tests/reduce/reduce_max/test_operator_reduce_max.cpp @@ -172,7 +172,7 @@ TEST_P(MaxTestsuite, MaxOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp b/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp index 5d7ad967..b0dca21c 100644 --- a/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp +++ b/tests/reduce/reduce_mean/test_operator_reduce_mean.cpp @@ -159,7 +159,7 @@ TEST_P(ReduceMeanTestsuite, ReduceMeanOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_min/test_operator_reduce_min.cpp b/tests/reduce/reduce_min/test_operator_reduce_min.cpp index 6d0787df..7137cd17 100644 --- a/tests/reduce/reduce_min/test_operator_reduce_min.cpp +++ b/tests/reduce/reduce_min/test_operator_reduce_min.cpp @@ -172,7 +172,7 @@ TEST_P(MinTestsuite, MinOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp b/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp index d27778c9..80079e0a 100644 --- a/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp +++ b/tests/reduce/reduce_prod/test_operator_reduce_prod.cpp @@ -136,7 +136,7 @@ TEST_P(ProdTestsuite, ProdOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp b/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp index 2b0927cd..e0e418ee 100644 --- a/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp +++ b/tests/reduce/reduce_sum/test_operator_reduce_sum.cpp @@ -163,7 +163,7 @@ TEST_P(ReduceSumTestsuite, ReduceSumOpTestCase) auto last = param.last; constexpr uint32_t BLK_SIZE = 32; auto padLast = (last * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; - uint8_t srcGm[first * padLast] = {0}; // 外部保证inner是32B对齐 + uint8_t srcGm[first * padLast] = {0}; // external guarantee inner is 32B aligned uint32_t dstLen = param.isAr ? 
first : last; auto padDst = (dstLen * param.typeSize + BLK_SIZE - 1) / BLK_SIZE * BLK_SIZE; uint8_t dstGm[padDst] = {0}; diff --git a/tests/reduce/sum/test_operator_sum.cpp b/tests/reduce/sum/test_operator_sum.cpp index 015bcf70..5a9dd78c 100644 --- a/tests/reduce/sum/test_operator_sum.cpp +++ b/tests/reduce/sum/test_operator_sum.cpp @@ -31,10 +31,10 @@ public: src1Global.SetGlobalBuffer((__gm__ T*)src0Gm); dstGlobal.SetGlobalBuffer((__gm__ T*)dstGm); pipe.InitBuffer(inQueueSrc1, 1, 8 * 160 * sizeof(T)); - pipe.InitBuffer(outQueueDst, 1, ONE_BLK_SIZE); // 8个数整体对齐 - int32_t repeatTimes = (160 + elementNumPerRep - 1) / elementNumPerRep; // workSize = repeatTimes向上取整 + pipe.InitBuffer(outQueueDst, 1, ONE_BLK_SIZE); // align the 8 numbers as a whole + int32_t repeatTimes = (160 + elementNumPerRep - 1) / elementNumPerRep; // workSize = repeatTimes rounded up int32_t finalWorkSize = (repeatTimes + elementNumPerBlk - 1) / elementNumPerBlk * elementNumPerBlk * sizeof(T); - pipe.InitBuffer(workQueue, 1, finalWorkSize); // 向上取整 + pipe.InitBuffer(workQueue, 1, finalWorkSize); // round up } __aicore__ inline void Process() { @@ -59,7 +59,7 @@ private: LocalTensor workLocal = workQueue.AllocTensor(); LocalTensor dstLocal = outQueueDst.AllocTensor(); - SumParams params {8, 160, 152}; // n是自己填的 + SumParams params {8, 160, 152}; Sum(dstLocal, srcLocal1, workLocal, params); outQueueDst.EnQue(dstLocal); @@ -75,11 +75,13 @@ private: private: TPipe pipe; - TQue inQueueSrc1; // 用于申请临时tensor + // used for allocating a temporary tensor + TQue inQueueSrc1; TQue workQueue; TQue outQueueDst; - GlobalTensor src1Global, dstGlobal; // 用于关联Gm + // used for associating GM + GlobalTensor src1Global, dstGlobal; }; } // namespace AscendC @@ -117,7 +119,7 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SUM, SumTestsuite, TEST_P(SumTestsuite, SumOpTestCase) { auto param = GetParam(); - uint8_t src0Gm[8 * 160 * param.typeSize]; // 外部保证inner是32B对齐 + uint8_t src0Gm[8 * 160 * param.typeSize]; // external guarantee inner is 32B aligned uint32_t dstLen = (8 * param.typeSize + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE; uint8_t dstGm[dstLen]; param.cal_func(dstGm, src0Gm); diff --git a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp index c0624cc1..fe6b6b58 100644 --- a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp +++ b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp @@ -19,9 +19,9 @@ using namespace std; using namespace AscendC; -// 场景1 +// scene 1 namespace AscendC { -// 场景1、2: srcShape[B, A1, A2, A3] +// scene 1, 2: srcShape[B, A1, A2, A3] __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -32,7 +32,7 @@ __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcSh uint32_t widthTiling = (srcShape.originalShape[3] + BLOCK_CUBE - 1) / BLOCK_CUBE; uint32_t alignA3 = widthTiling * BLOCK_CUBE; - // stackBuffer向 [16,16]对齐 + // stackBuffer is aligned to [16,16] uint32_t newPopSize = (stackBufferSize / CUBE_MAX_SIZE) * CUBE_MAX_SIZE; // element uint32_t newPopH = newPopSize / BLOCK_CUBE; uint32_t needSize = alignA2 * BLOCK_CUBE; @@ -62,7 +62,7 @@ __aicore__ inline void GetConfusionTranspose0213TilingInfo(const ShapeInfo srcSh tiling.param15 = mainOffset; } -// 场景3:srcShape[B, N, S, H/N] +// scene 3:srcShape[B, N, S, H/N] __aicore__ inline void
GetConfusionTranspose2NZ012NTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -103,7 +103,7 @@ __aicore__ inline void GetConfusionTranspose2NZ012NTilingInfo(const ShapeInfo sr tiling.param16 = srcBatchOffset; } -// 场景4:srcShape[B, N, S, H/N] +// scene 4:srcShape[B, N, S, H/N] __aicore__ inline void GetConfusionTranspose2ND012NTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -146,7 +146,7 @@ __aicore__ inline void GetConfusionTranspose2ND012NTilingInfo(const ShapeInfo sr tiling.param17 = blockNum; } -// 场景5、6:srcShape[B, N, S, H/N] +// scene 5, 6:srcShape[B, N, S, H/N] __aicore__ inline void GetConfusionTranspose012TilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -183,7 +183,7 @@ __aicore__ inline void GetConfusionTranspose012TilingInfo(const ShapeInfo srcSha tiling.param14 = blockNum; } -// 场景7:srcShape[height, width] +// scene 7:srcShape[height, width] __aicore__ inline void GetConfusionTransposeOnlyTilingInfo(const ShapeInfo srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, ConfusionTransposeTiling& tiling) { @@ -335,7 +335,7 @@ TEST_P(ConfusionTransposeFirstTestsuite, ConfusionTransposeFirstTestCase) } } -// 场景2 +// scene 2 namespace AscendC { template class KernelConfusionTransposeSecond { @@ -458,7 +458,7 @@ TEST_P(ConfusionTransposeSecondTestsuite, ConfusionTransposeSecondTestCase) } } -// 场景3 +// scene 3 namespace AscendC { template class KernelConfusionTransposeThird { @@ -604,7 +604,7 @@ TEST_P(ConfusionTransposeThirdTestsuite, ConfusionTransposeThirdTestCase) } } -// 场景4 +// scene 4 namespace AscendC { template class KernelConfusionTransposeFourth { @@ -751,7 +751,7 @@ TEST_P(ConfusionTransposeFourthTestsuite, ConfusionTransposeFourthTestCase) } -// 场景5 +// scene 5 namespace AscendC { template class KernelConfusionTransposeFifth { @@ -897,7 +897,7 @@ TEST_P(ConfusionTransposeFifthTestsuite, ConfusionTransposeFifthTestCase) } -// 场景6 +// scene 6 namespace AscendC { template class KernelConfusionTransposeSixth { @@ -1042,7 +1042,7 @@ TEST_P(ConfusionTransposeSixthTestsuite, ConfusionTransposeSixthTestCase) } } -// 场景7 +// scene 7 namespace AscendC { template class KernelConfusionTransposeSeventh { -- Gitee From edab8174dcb94d963c203391093e5810e9d06ad4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:05:24 +0800 Subject: [PATCH 2/7] update var naming --- .../normalize/kernel_impl/normalize_custom.h | 16 ++++++++-------- .../kernel_impl/welford_finalize_custom.h | 14 +++++++------- .../kernel_impl/welford_update_custom.h | 14 +++++++------- examples/reduce/sum/op_host/sum_custom_tiling.h | 2 +- examples/reduce/sum/op_kernel/sum_custom_impl.h | 10 +++++----- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/examples/normalization/normalize/kernel_impl/normalize_custom.h b/examples/normalization/normalize/kernel_impl/normalize_custom.h index 5de70bb8..60ab5f52 100644 --- a/examples/normalization/normalize/kernel_impl/normalize_custom.h +++ b/examples/normalization/normalize/kernel_impl/normalize_custom.h @@ -28,21 +28,21 @@ template class KernelNormalize { public: __aicore__ inline KernelNormalize() {} - __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMeanGm, GM_ADDR inputVarGm, GM_ADDR gammaGm, - GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR 
outputRstdGm, NormalizeTiling tilingData) { + __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gammaGm, + GM_ADDR betaGm, GM_ADDR output_gm, GM_ADDR outputRstd_gm, NormalizeTiling tilingData) { aLength = tilingData.aLength; rLength = tilingData.rLength; rLengthWithPadding = tilingData.rLengthWithPadding; tmpLocalBytes = tilingData.tmpLocalSize; uint32_t totalLength = aLength * rLengthWithPadding; inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputXGm), totalLength); // [A, R] - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputMeanGm), aLength); // [A] - inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputVarGm), aLength); // [A] - inputGamma_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gammaGm), rLengthWithPadding); // [R] - inputBeta_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(betaGm), rLengthWithPadding); // [R] + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputMean_gm), aLength); // [A] + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputVar_gm), aLength); // [A] + inputGamma_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gamma_gm), rLengthWithPadding); // [R] + inputBeta_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(beta_gm), rLengthWithPadding); // [R] - output_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(outputGm), totalLength); - outputRstd_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(outputRstdGm), aLength); + output_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(output_gm), totalLength); + outputRstd_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(outputRstd_gm), aLength); pipe.InitBuffer(inQueueX, 1, sizeof(T) * totalLength); pipe.InitBuffer(inQueueMean, 1, sizeof(float) * aLength); diff --git a/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h b/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h index 889905ea..f0b48af6 100644 --- a/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h +++ b/examples/normalization/welford_finalize/kernel_impl/welford_finalize_custom.h @@ -32,8 +32,8 @@ class KernelWelfordFinalize { public: __aicore__ inline KernelWelfordFinalize() {} - __aicore__ inline void Init(GM_ADDR inputMeanGm, GM_ADDR inputVarianceGm, GM_ADDR countsGm, GM_ADDR outputMeanGm, - GM_ADDR outputVarianceGm, VecTiling tilingData) + __aicore__ inline void Init(GM_ADDR inputMean_gm, GM_ADDR inputVariance_gm, GM_ADDR counts_gm, GM_ADDR outputMean_gm, + GM_ADDR outputVariance_gm, VecTiling tilingData) { this->rnLength = tilingData.rnLength; this->abLength = tilingData.abLength; @@ -51,11 +51,11 @@ public: this->rRec = 1.0f / rLength; this->outLength = OUT_SIZE; - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputMeanGm), abLength); - inputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputVarianceGm), abLength); - inputcounts_global.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(countsGm), abLength); - outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputMeanGm), outLength); - outputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputVarianceGm), outLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputMean_gm), abLength); + inputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputVariance_gm), abLength); + 
inputcounts_global.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(counts_gm), abLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputMean_gm), outLength); + outputVariance_global.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputVariance_gm), outLength); pipe.InitBuffer(inQueueMean, 1, abLength * sizeof(dataType)); pipe.InitBuffer(inQueueVariance, 1, abLength * sizeof(dataType)); diff --git a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h index 154a9b14..e91dee4e 100644 --- a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h +++ b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h @@ -30,8 +30,8 @@ template (inputXGm), bshLength); - inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMeanGm), bshLength); - inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVarGm), bshLength); + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), bshLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); - outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMeanGm), bshLength); - outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVarGm), bshLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); + outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); pipe.InitBuffer(inQueueX, 1, sizeof(T) * bshLength); pipe.InitBuffer(inQueueMean, 1, sizeof(U) * bshLength); diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h index 2df3f1ed..e3947535 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.h +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -18,7 +18,7 @@ struct SumCustomTilingData { uint32_t outter; uint32_t n; uint32_t tmpBufSize; - uint32_t outInner; + uint32_t out_inner; }; #endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h index 0f3a97c5..71f8bef1 100644 --- a/examples/reduce/sum/op_kernel/sum_custom_impl.h +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -25,7 +25,7 @@ public: outter = tilingData.outter; n = tilingData.n; tmpBufSize = tilingData.tmpBufSize; - outInner = tilingData.outInner; + out_inner = tilingData.out_inner; params.inner = inner; params.outter = outter; @@ -36,7 +36,7 @@ public: pipe = pipeIn; pipe->InitBuffer(inQueue, 1, inner * outter * sizeof(T)); - pipe->InitBuffer(outQueue, 1, outInner * sizeof(T)); + pipe->InitBuffer(outQueue, 1, out_inner * sizeof(T)); pipe->InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); } __aicore__ inline void Process() { @@ -57,7 +57,7 @@ private: AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); T scalar(0); - AscendC::Duplicate(yLocal, scalar, outInner); + AscendC::Duplicate(yLocal, scalar, out_inner); AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); outQueue.EnQue(yLocal); @@ -66,7 +66,7 @@ private: } __aicore__ inline void CopyOut() { AscendC::LocalTensor yLocal = outQueue.DeQue(); - AscendC::DataCopy(yGm, yLocal, outInner); + AscendC::DataCopy(yGm, yLocal, out_inner); outQueue.FreeTensor(yLocal); } @@ -82,7 +82,7 @@ private: uint32_t outter = 
0; uint32_t n = 0; uint32_t tmpBufSize = 0; - uint32_t outInner = 0; + uint32_t out_inner = 0; AscendC::SumParams params; }; } -- Gitee From d5e0ed123721223223048cf63e078d663e41bf58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:08:17 +0800 Subject: [PATCH 3/7] fix: compile error --- .../normalization/normalize/kernel_impl/normalize_custom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/normalization/normalize/kernel_impl/normalize_custom.h b/examples/normalization/normalize/kernel_impl/normalize_custom.h index 60ab5f52..6fdc4974 100644 --- a/examples/normalization/normalize/kernel_impl/normalize_custom.h +++ b/examples/normalization/normalize/kernel_impl/normalize_custom.h @@ -28,8 +28,8 @@ template class KernelNormalize { public: __aicore__ inline KernelNormalize() {} - __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gammaGm, - GM_ADDR betaGm, GM_ADDR output_gm, GM_ADDR outputRstd_gm, NormalizeTiling tilingData) { + __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gamma_gm, + GM_ADDR beta_gm, GM_ADDR output_gm, GM_ADDR outputRstd_gm, NormalizeTiling tilingData) { aLength = tilingData.aLength; rLength = tilingData.rLength; rLengthWithPadding = tilingData.rLengthWithPadding; -- Gitee From 8daf6f0a20c8fef719ca4a72743bc2871d1c622a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:10:00 +0800 Subject: [PATCH 4/7] update var naming --- .../normalization/normalize/kernel_impl/normalize_custom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/normalization/normalize/kernel_impl/normalize_custom.h b/examples/normalization/normalize/kernel_impl/normalize_custom.h index 6fdc4974..89b1b960 100644 --- a/examples/normalization/normalize/kernel_impl/normalize_custom.h +++ b/examples/normalization/normalize/kernel_impl/normalize_custom.h @@ -28,14 +28,14 @@ template class KernelNormalize { public: __aicore__ inline KernelNormalize() {} - __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gamma_gm, + __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR gamma_gm, GM_ADDR beta_gm, GM_ADDR output_gm, GM_ADDR outputRstd_gm, NormalizeTiling tilingData) { aLength = tilingData.aLength; rLength = tilingData.rLength; rLengthWithPadding = tilingData.rLengthWithPadding; tmpLocalBytes = tilingData.tmpLocalSize; uint32_t totalLength = aLength * rLengthWithPadding; - inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputXGm), totalLength); // [A, R] + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), totalLength); // [A, R] inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputMean_gm), aLength); // [A] inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(inputVar_gm), aLength); // [A] inputGamma_global.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gamma_gm), rLengthWithPadding); // [R] -- Gitee From c2640022843ca9e43ea550d760749ac0a857efa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:23:54 +0800 Subject: [PATCH 5/7] fix: compile error --- examples/reduce/sum/op_host/sum_custom_tiling.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp 
b/examples/reduce/sum/op_host/sum_custom_tiling.cpp index 5e3cbb4c..549592ef 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.cpp +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -16,7 +16,7 @@ namespace { constexpr uint32_t PADDING_BYTE = 32U; } -void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n) { +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t N) { uint32_t minValue = 0; uint32_t maxValue = 0; @@ -24,16 +24,16 @@ void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n) SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); - auto paddingFunc = [](const uint32_t n1, const uint32_t typeSize) -> uint32_t { + auto paddingFunc = [](const uint32_t N, const uint32_t typeSize) -> uint32_t { if (typeSize == 0) { return 0; } - return (n1 * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; }; tiling->outter = m; - tiling->inner = paddingFunc(n, sizeof(uint32_t)); - tiling->n = n; + tiling->inner = paddingFunc(N, sizeof(uint32_t)); + tiling->n = N; tiling->tmpBufSize = minValue; tiling->out_inner = paddingFunc(m, sizeof(uint32_t)); -- Gitee From 1d67e00c4beecdeb6cb3ffadf78cc7781695e404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:28:47 +0800 Subject: [PATCH 6/7] update var naming --- examples/reduce/sum/op_host/sum_custom_tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp index 549592ef..12c654aa 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.cpp +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -20,11 +20,11 @@ void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t N) uint32_t minValue = 0; uint32_t maxValue = 0; - AscendC::GetSumMaxMinTmpSize(n, sizeof(uint32_t), false, maxValue, minValue); + AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); - auto paddingFunc = [](const uint32_t N, const uint32_t typeSize) -> uint32_t { + auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { if (typeSize == 0) { return 0; } -- Gitee From 15c27ec41fa47c3a37ff35bf216ba94d2cee95b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E6=B5=B7=E5=B9=B3?= Date: Wed, 29 Oct 2025 11:37:17 +0800 Subject: [PATCH 7/7] fix: compile error --- examples/reduce/sum/op_host/sum_custom_tiling.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp index 12c654aa..5e3cbb4c 100644 --- a/examples/reduce/sum/op_host/sum_custom_tiling.cpp +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -16,24 +16,24 @@ namespace { constexpr uint32_t PADDING_BYTE = 32U; } -void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t N) { +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t m, const uint32_t n) { uint32_t minValue = 0; uint32_t maxValue = 0; - AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); + AscendC::GetSumMaxMinTmpSize(n, sizeof(uint32_t), false, maxValue, minValue); SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); - auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { 
+ auto paddingFunc = [](const uint32_t n1, const uint32_t typeSize) -> uint32_t { if (typeSize == 0) { return 0; } - return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + return (n1 * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; }; tiling->outter = m; - tiling->inner = paddingFunc(N, sizeof(uint32_t)); - tiling->n = N; + tiling->inner = paddingFunc(n, sizeof(uint32_t)); + tiling->n = n; tiling->tmpBufSize = minValue; tiling->out_inner = paddingFunc(m, sizeof(uint32_t)); -- Gitee