stack frame size未超过32768

报错如下：

[ 33%] Building CCE object cmake/npu/CMakeFiles/reduce_sum_custom_npu.dir/__/__/reduce_sum_custom.cpp.o  
error: stack frame size (16024) exceeds limit (16000) in function '_ZN7AscendC9ReduceSumIDhEEvRKNS_11LocalTensorIT_EES5_S5_i'
error: stack frame size (16024) exceeds limit (16000) in function '_ZN7AscendC9ReduceSumIDhEEvRKNS_11LocalTensorIT_EES5_S5_i'
2 errors generated.

可通过修改编译选项进行规避。打开cmake/Modules/CMakeDetermineCCECompiler.cmake文件，找到变量_CMAKE_CCE_COMPILE_OPTIONS的赋值语句，修改或新增如下三个字段，将stack size设置为最大值0x8000（即十进制的32768）：

-mllvm -cce-aicore-function-stack-size=0x8000
-mllvm -cce-aicore-stack-size=0x8000
-mllvm -cce-aicore-jump-expand=true

# 文件位置：cmake/Modules/CMakeDetermineCCECompiler.cmake
    # Compile options for the CCE compiler. The function-stack-size and
    # stack-size options are both set to 0x8000 and jump-expand is enabled,
    # which is the workaround for the "stack frame size exceeds limit" error.
    set(_CMAKE_CCE_COMPILE_OPTIONS
	"-mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-record-overflow=false \
    -mllvm -cce-aicore-addr-transform -mllvm -cce-aicore-jump-expand=true -mllvm -cce-aicore-stack-size=0x8000" # modification applied here
    )

stack frame size超过32768

首先可尝试注释算子运行不会跑到的冗余代码，如果非mix融合算子，还可通过减少内联函数的使用来进一步减小栈帧大小，方法如下：

修改CANN包中的头文件kernel_macros.h，找到#define inline __inline__ __attribute__((always_inline))一行并注释掉。

// 文件位置：${ASCEND_TOOLKIT_HOME}/compiler/tikcpp/tikcfw/impl/kernel_macros.h

#ifndef inline
// Deliberately commented out: with this macro disabled, `inline` is no longer
// redefined as a forced always_inline, which reduces inlining and therefore
// the stack frame size of the calling functions.
//#define inline __inline__ __attribute__((always_inline))
#endif

修改CANN软件包中的头文件 kernel_tensor.h，将GlobalTensor::GetPhyAddr相关方法的返回类型强转为uint64_t类型，使编译器支持非inline函数调用，如下所示。

// 文件位置：${ASCEND_TOOLKIT_HOME}/compiler/tikcpp/tikcfw/interface/kernel_tensor.h

// aicore inline const gm T* GetPhyAddr() const // 注释该函数
// {
//     return address_;
// }
aicore inline uint64_t GetPhyAddr() const // 添加该函数
{
    return reinterpret_cast<uint64_t>(address_);
}

// aicore inline gm T* GetPhyAddr(const uint64_t offset) const // 注释该函数
// {
//     return address_ + offset;
// }
aicore inline uint64_t GetPhyAddr(const uint64_t offset) const // 添加该函数
{
    return reinterpret_cast<uint64_t>(address_ + offset);
}

在算子核函数代码首行插入代码 set_ctrl(sbitset1(sbitset1(sbitset1(sbitset1(get_ctrl(), 2), 3), 4),5));，防止函数嵌套层数溢出，如下所示。

// reduce_sum_custom.cpp
// Kernel entry point: constructs a KernelReduceSum, initializes it with the
// four GM_ADDR arguments, and runs Process().
__global__ __aicore__ void reduce_sum_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workSpace, GM_ADDR sync)
{
    // Must be the first statement of the kernel. According to this guide it
    // prevents function nesting-depth overflow. It takes the get_ctrl() value,
    // applies sbitset1 with indices 2, 3, 4 and 5 (presumably setting those
    // bits - confirm against the intrinsic reference), and passes the result
    // to set_ctrl().
    set_ctrl(sbitset1(sbitset1(sbitset1(sbitset1(get_ctrl(), 2), 3), 4),5)); // inserted here
    KernelReduceSum op;
    op.Init(x, y, workSpace, sync);
    op.Process();
}

使用"-O0 -g"编译算子时，stack frame size超出限制