融合范围标定示例

通过调用aclskScopeBegin和aclskScopeEnd接口，用户若传入有效scopeName可以框定算子融合进SuperKernel，若传入nullptr可以框定算子不融合。该功能只在使能SuperKernel优化时有实际效果。如下代码演示了如何在NPU上利用SuperKernel技术实现一个简单的加法算子。

标定算子融合到SuperKernel的关键步骤示例

代码仅展示核心逻辑，不可直接编译运行，仅供参考：

        
         
           
           
             #include "acl/acl.h"
#include "super_kernel/super_kernel.h"
...
// 原始kernel函数（add_custom）
__global__ __vector__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
{
    KernelAdd op;
    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
    op.Process();
}
// 定义参数结构体，用于封装传递给SuperKernel的参数
struct ArgsStruct {
    GM_ADDR x;                     
    GM_ADDR y;                     
    GM_ADDR z;  
    AddCustomTilingData tiling;
};
// 定义一个带模板参数的SuperKernel子函数（add_custom_sk）
template<uint32_t splitNum>
__sk__ __vector__ void add_custom_sk(const ArgsStruct *args, sk::SkSystemArgs *sysArgs)
{
    // 从结构体获取参数
    GM_ADDR x = args->x;
    GM_ADDR y = args->y;
    GM_ADDR z = args->z;
    AddCustomTilingData tiling = args->tiling;
    uint16_t blockNum = sysArgs->skNumBlocks;
    uint16_t blockIdx = sysArgs->skBlockIdx;
    // 逻辑与原kernel的global函数一致
    KernelAdd op;
    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
    op.Process();
}

// 使用SK_BIND绑定，将add_custom kernel与多个SK子函数绑定，通过指定模板参数实例化出4个不同的符号
SK_BIND(add_custom, 4, add_custom_sk<0>, add_custom_sk<1>, add_custom_sk<2>, add_custom_sk<3>);

// 主函数
int main()
{
    // 算子数据准备
    constexpr uint32_t totalLength = 8 * 2048;
    constexpr float valueX = 1.2f;
    constexpr float valueY = 2.3f;
    std::vector<float> x(totalLength, valueX);
    std::vector<float> y(totalLength, valueY);
    constexpr uint32_t numBlocks = 8;
    uint32_t totalLength = x.size();
    size_t totalByteSize = totalLength * sizeof(float);
    int32_t deviceId = 0;
    aclrtStream stream = nullptr;
    AddCustomTilingData tiling = {totalLength, 8};
    uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
    // printf("0-host: x0:%d \n", xHost[0]);
    uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
    uint8_t *zHost = nullptr;
    uint8_t *xDevice = nullptr;
    uint8_t *yDevice = nullptr;
    uint8_t *zDevice = nullptr;
    // 内存分配
    aclrtMallocHost((void **)(&zHost), totalByteSize);
    aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    // 数据传输
    aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    // 初始化
    aclInit(nullptr);
    aclrtSetDevice(deviceId);
    aclrtCreateStream(&stream);
    // 构建图
    aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL);
    aclmdlRI modelRI;
    // 标定SuperKernel融合起始位置
    aclskScopeBegin("sk1", stream);
    add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
    // 标定SuperKernel融合结束位置
    aclskScopeEnd("sk1", stream);
    aclmdlRICaptureEnd(stream, &modelRI);
    // 开启SuperKernel优化
    aclskOptimize(modelRI, nullptr);
    // 执行图
    aclmdlRIExecuteAsync(modelRI, stream);
    // 获取图执行结果
    aclrtSynchronizeStream(stream);
    aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
    std::vector<float> z((float *)zHost, (float *)(zHost + totalLength));
    // 资源释放
    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(zHost);
    aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();
}

            

          

        
       

上述acl接口详细说明请参见《Runtime运行时 API》。

标定算子不融合到SuperKernel的关键步骤示例

以下代码仅展示核心逻辑，不可直接编译运行，仅供参考：

        
         
           
           
             #include "acl/acl.h"
#include "super_kernel/super_kernel.h"
...
int main()
{
    // 算子数据准备
    ...
    // 初始化
    aclInit(nullptr);
    aclrtSetDevice(deviceId);
    aclrtCreateStream(&stream);
    // 构建图
    aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL);
    aclmdlRI modelRI;
    // 标定SuperKernel融合起始位置，传入nullptr表示不融合
    aclskScopeBegin(nullptr, stream);
    add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
    // 标定SuperKernel融合结束位置，传入nullptr表示不融合
    aclskScopeEnd(nullptr, stream);
    aclmdlRICaptureEnd(stream, &modelRI);
    // 开启SuperKernel优化
    aclskOptimize(modelRI, nullptr);
    // 执行图
    aclmdlRIExecuteAsync(modelRI, stream);
    // 获取图执行结果
    ...
    // 资源释放
    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(zHost);
    aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();
}

            

          

        
       

上述acl接口详细说明请参见《Runtime运行时 API》。

父主题： SuperKernel融合范围标定