开发者
资源

融合范围标定示例

通过调用aclskScopeBeginaclskScopeEnd接口,用户若传入有效scopeName可以框定算子融合进SuperKernel,若传入nullptr可以框定算子不融合。该功能只在使能SuperKernel优化时有实际效果。如下代码演示了如何在NPU上利用SuperKernel技术实现一个简单的加法算子。

  • 标定算子融合到SuperKernel的关键步骤示例
    代码仅展示核心逻辑,不可直接编译运行,仅供参考:
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    #include "acl/acl.h"
    #include "super_kernel/super_kernel.h"
    ...
    // 原始kernel函数(add_custom)
    __global__ __vector__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
    {
        KernelAdd op;
        op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
        op.Process();
    }
    // 定义参数结构体,用于封装传递给SuperKernel的参数
    struct ArgsStruct {
        GM_ADDR x;                     
        GM_ADDR y;                     
        GM_ADDR z;  
        AddCustomTilingData tiling;
    };
    // 定义一个带模板参数的SuperKernel子函数(add_custom_sk)
    template<uint32_t splitNum>
    __sk__ __vector__ void add_custom_sk(const ArgsStruct *args, sk::SkSystemArgs *sysArgs)
    {
        // 从结构体获取参数
        GM_ADDR x = args->x;
        GM_ADDR y = args->y;
        GM_ADDR z = args->z;
        AddCustomTilingData tiling = args->tiling;
        uint16_t blockNum = sysArgs->skNumBlocks;
        uint16_t blockIdx = sysArgs->skBlockIdx;
        // 逻辑与原kernel的global函数一致
        KernelAdd op;
        op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
        op.Process();
    }
    
    // 使用SK_BIND绑定,将add_custom kernel与多个SK子函数绑定,通过指定模板参数实例化出4个不同的符号
    SK_BIND(add_custom, 4, add_custom_sk<0>, add_custom_sk<1>, add_custom_sk<2>, add_custom_sk<3>);
    
    // 主函数
    int main()
    {
        // 算子数据准备
        constexpr uint32_t totalLength = 8 * 2048;
        constexpr float valueX = 1.2f;
        constexpr float valueY = 2.3f;
        std::vector<float> x(totalLength, valueX);
        std::vector<float> y(totalLength, valueY);
        constexpr uint32_t numBlocks = 8;
        uint32_t totalLength = x.size();
        size_t totalByteSize = totalLength * sizeof(float);
        int32_t deviceId = 0;
        aclrtStream stream = nullptr;
        AddCustomTilingData tiling = {totalLength, 8};
        uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
        // printf("0-host: x0:%d \n", xHost[0]);
        uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
        uint8_t *zHost = nullptr;
        uint8_t *xDevice = nullptr;
        uint8_t *yDevice = nullptr;
        uint8_t *zDevice = nullptr;
        // 内存分配
        aclrtMallocHost((void **)(&zHost), totalByteSize);
        aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
        aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
        aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
        // 数据传输
        aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
        aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
        // 初始化
        aclInit(nullptr);
        aclrtSetDevice(deviceId);
        aclrtCreateStream(&stream);
        // 构建图
        aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL);
        aclmdlRI modelRI;
        // 标定SuperKernel融合起始位置
        aclskScopeBegin("sk1", stream);
        add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
        // 标定SuperKernel融合结束位置
        aclskScopeEnd("sk1", stream);
        aclmdlRICaptureEnd(stream, &modelRI);
        // 开启SuperKernel优化
        aclskOptimize(modelRI, nullptr);
        // 执行图
        aclmdlRIExecuteAsync(modelRI, stream);
        // 获取图执行结果
        aclrtSynchronizeStream(stream);
        aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
        std::vector<float> z((float *)zHost, (float *)(zHost + totalLength));
        // 资源释放
        aclrtFree(xDevice);
        aclrtFree(yDevice);
        aclrtFree(zDevice);
        aclrtFreeHost(zHost);
        aclrtDestroyStream(stream);
        aclrtResetDevice(deviceId);
        aclFinalize();
    }
    

    上述acl接口详细说明请参见Runtime运行时 API

  • 标定算子不融合到SuperKernel的关键步骤示例
    以下代码仅展示核心逻辑,不可直接编译运行,仅供参考:
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    #include "acl/acl.h"
    #include "super_kernel/super_kernel.h"
    ...
    int main()
    {
        // 算子数据准备
        ...
        // 初始化
        aclInit(nullptr);
        aclrtSetDevice(deviceId);
        aclrtCreateStream(&stream);
        // 构建图
        aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL);
        aclmdlRI modelRI;
        // 标定SuperKernel融合起始位置,传入nullptr表示不融合
        aclskScopeBegin(nullptr, stream);
        add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
        // 标定SuperKernel融合结束位置,传入nullptr表示不融合
        aclskScopeEnd(nullptr, stream);
        aclmdlRICaptureEnd(stream, &modelRI);
        // 开启SuperKernel优化
        aclskOptimize(modelRI, nullptr);
        // 执行图
        aclmdlRIExecuteAsync(modelRI, stream);
        // 获取图执行结果
        ...
        // 资源释放
        aclrtFree(xDevice);
        aclrtFree(yDevice);
        aclrtFree(zDevice);
        aclrtFreeHost(zHost);
        aclrtDestroyStream(stream);
        aclrtResetDevice(deviceId);
        aclFinalize();
    }
    

    上述acl接口详细说明请参见Runtime运行时 API