融合范围标定示例
通过调用aclskScopeBegin和aclskScopeEnd接口,用户若传入有效scopeName可以框定算子融合进SuperKernel,若传入nullptr可以框定算子不融合。该功能只在使能SuperKernel优化时有实际效果。如下代码演示了如何在NPU上利用SuperKernel技术实现一个简单的加法算子。
- 标定算子融合到SuperKernel的关键步骤示例
代码仅展示核心逻辑,不可直接编译运行,仅供参考:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
#include "acl/acl.h" #include "super_kernel/super_kernel.h" ... // 原始kernel函数(add_custom) __global__ __vector__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) { KernelAdd op; op.Init(x, y, z, tiling.totalLength, tiling.tileNum); op.Process(); } // 定义参数结构体,用于封装传递给SuperKernel的参数 struct ArgsStruct { GM_ADDR x; GM_ADDR y; GM_ADDR z; AddCustomTilingData tiling; }; // 定义一个带模板参数的SuperKernel子函数(add_custom_sk) template<uint32_t splitNum> __sk__ __vector__ void add_custom_sk(const ArgsStruct *args, sk::SkSystemArgs *sysArgs) { // 从结构体获取参数 GM_ADDR x = args->x; GM_ADDR y = args->y; GM_ADDR z = args->z; AddCustomTilingData tiling = args->tiling; uint16_t blockNum = sysArgs->skNumBlocks; uint16_t blockIdx = sysArgs->skBlockIdx; // 逻辑与原kernel的global函数一致 KernelAdd op; op.Init(x, y, z, tiling.totalLength, tiling.tileNum); op.Process(); } // 使用SK_BIND绑定,将add_custom kernel与多个SK子函数绑定,通过指定模板参数实例化出4个不同的符号 SK_BIND(add_custom, 4, add_custom_sk<0>, add_custom_sk<1>, add_custom_sk<2>, add_custom_sk<3>); // 主函数 int main() { // 算子数据准备 constexpr uint32_t totalLength = 8 * 2048; constexpr float valueX = 1.2f; constexpr float valueY = 2.3f; std::vector<float> x(totalLength, valueX); std::vector<float> y(totalLength, valueY); constexpr uint32_t numBlocks = 8; uint32_t totalLength = x.size(); size_t totalByteSize = totalLength * sizeof(float); int32_t deviceId = 0; aclrtStream stream = nullptr; AddCustomTilingData tiling = {totalLength, 8}; uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data()); // printf("0-host: x0:%d \n", xHost[0]); uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data()); uint8_t *zHost = nullptr; uint8_t *xDevice = nullptr; uint8_t *yDevice = nullptr; uint8_t *zDevice = nullptr; // 内存分配 aclrtMallocHost((void **)(&zHost), totalByteSize); aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); // 数据传输 aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); // 初始化 aclInit(nullptr); aclrtSetDevice(deviceId); aclrtCreateStream(&stream); // 构建图 aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL); aclmdlRI modelRI; // 标定SuperKernel融合起始位置 aclskScopeBegin("sk1", stream); add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling); // 标定SuperKernel融合结束位置 aclskScopeEnd("sk1", stream); aclmdlRICaptureEnd(stream, &modelRI); // 开启SuperKernel优化 aclskOptimize(modelRI, nullptr); // 执行图 aclmdlRIExecuteAsync(modelRI, stream); // 获取图执行结果 aclrtSynchronizeStream(stream); aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST); std::vector<float> z((float *)zHost, (float *)(zHost + totalLength)); // 资源释放 aclrtFree(xDevice); aclrtFree(yDevice); aclrtFree(zDevice); aclrtFreeHost(zHost); aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); }
- 标定算子不融合到SuperKernel的关键步骤示例
以下代码仅展示核心逻辑,不可直接编译运行,仅供参考:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
#include "acl/acl.h" #include "super_kernel/super_kernel.h" ... int main() { // 算子数据准备 ... // 初始化 aclInit(nullptr); aclrtSetDevice(deviceId); aclrtCreateStream(&stream); // 构建图 aclmdlRICaptureBegin(stream, ACL_MODEL_RI_CAPTURE_MODE_GLOBAL); aclmdlRI modelRI; // 标定SuperKernel融合起始位置,传入nullptr表示不融合 aclskScopeBegin(nullptr, stream); add_custom<<<numBlocks, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling); // 标定SuperKernel融合结束位置,传入nullptr表示不融合 aclskScopeEnd(nullptr, stream); aclmdlRICaptureEnd(stream, &modelRI); // 开启SuperKernel优化 aclskOptimize(modelRI, nullptr); // 执行图 aclmdlRIExecuteAsync(modelRI, stream); // 获取图执行结果 ... // 资源释放 aclrtFree(xDevice); aclrtFree(yDevice); aclrtFree(zDevice); aclrtFreeHost(zHost); aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); }
父主题: SuperKernel融合范围标定