[ 33%] Building CCE object cmake/npu/CMakeFiles/reduce_sum_custom_npu.dir/__/__/reduce_sum_custom.cpp.o
error: stack frame size (16024) exceeds limit (16000) in function '_ZN7AscendC9ReduceSumIDhEEvRKNS_11LocalTensorIT_EES5_S5_i'
error: stack frame size (16024) exceeds limit (16000) in function '_ZN7AscendC9ReduceSumIDhEEvRKNS_11LocalTensorIT_EES5_S5_i'
2 errors generated.
# File location: cmake/Modules/CMakeDetermineCCECompiler.cmake
#
# Option string stored in _CMAKE_CCE_COMPILE_OPTIONS. Every flag is forwarded
# with -mllvm. Both stack-size flags are set to 0x8000 (32768 decimal).
# NOTE(review): presumably -cce-aicore-function-stack-size is the value that
# was raised to clear the "stack frame size (16024) exceeds limit (16000)"
# build error - confirm against the unmodified toolkit file.
# NOTE(review): the "\ " inside the string looks like a line continuation that
# was flattened when this snippet was extracted - confirm against the real file.
set(_CMAKE_CCE_COMPILE_OPTIONS
  "-mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-record-overflow=false \ -mllvm -cce-aicore-addr-transform -mllvm -cce-aicore-jump-expand=true -mllvm -cce-aicore-stack-size=0x8000" # The modification is made here
)
// 文件位置:${ASCEND_TOOLKIT_HOME}/compiler/tikcpp/tikcfw/impl/kernel_macros.h #ifndef inline //#define inline __inline__ __attribute__((always_inline)) #endif
// 文件位置:${ASCEND_TOOLKIT_HOME}/compiler/tikcpp/tikcfw/interface/kernel_tensor.h // aicore inline const gm T* GetPhyAddr() const // 注释该函数 // { // return address_; // } aicore inline uint64_t GetPhyAddr() const // 添加该函数 { return reinterpret_cast<uint64_t>(address_); } // aicore inline gm T* GetPhyAddr(const uint64_t offset) const // 注释该函数 // { // return address_ + offset; // } aicore inline uint64_t GetPhyAddr(const uint64_t offset) const // 添加该函数 { return reinterpret_cast<uint64_t>(address_ + offset); }
// reduce_sum_custom.cpp __global__ __aicore__ void reduce_sum_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workSpace, GM_ADDR sync) { set_ctrl(sbitset1(sbitset1(sbitset1(sbitset1(get_ctrl(), 2), 3), 4),5)); // 在此插入 KernelReduceSum op; op.Init(x, y, workSpace, sync); op.Process(); }