AI Core自定义算子加载
Kernel函数可以采用<<<>>>方式进行任务下发,具有代码简洁,可读性好的优点。
以下是关键步骤的代码示例,不可以直接拷贝编译运行,仅供参考。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | // Device code Template<> extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) { KernelAdd op; op.Init(x, y, z); op.Process(); } int main() { int N = ...; size_t size = N * sizeof(uint64); // Initialize aclrtSetDevice(0); // Create stream aclrtStream stream; aclrtCreateStream(&stream); // Allocate vectors in host memory void *h_x, *h_y, *h_z; aclrtMallocHost(&h_x, size); aclrtMallocHost(&h_y, size); aclrtMallocHost(&h_z, size); // Initialize input vectors ... // Allocate vectors in device memory void *d_x, *d_y, *d_z; aclrtMalloc(&d_x, size, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc(&d_y, size, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc(&d_z, size, ACL_MEM_MALLOC_HUGE_FIRST); // Copy vectors from host memory to device memory aclrtMemcpy(d_x, size, h_x, size, ACL_MEMCPY_HOST_TO_DEVICE); aclrtMemcpy(d_y, size, h_y, size, ACL_MEMCPY_HOST_TO_DEVICE); // Invoke kernel uint32_t numBlocks = 48; add_custom<<<numBlocks, nullptr, stream>>>(d_x, d_y, d_z); ... } |
用户也可以使用Runtime提供的LaunchKernel接口(例如aclrtLaunchKernelWithHostArgs接口)进行kernel函数的下发。使用这种方式需先了解Binary和Function的概念:
- Binary:是一个动态加载的代码容器单元,里面包含编译后的kernel代码、全局变量等。用户可以通过aclrtBinaryLoadFromFile或aclrtBinaryLoadFromData将编译好的算子二进制加载到NPU上,并获得对应的Binary句柄。
- Function:是一个具体可执行的kernel函数,它定义在Binary内部,是主机代码可以调用并在NPU上执行的入口点。用户可以通过aclrtBinaryGetFunction获取kernel函数对应的Function句柄。
以下是使用LaunchKernel接口的关键代码示例,不可以直接拷贝编译运行,仅供参考。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | // Device code extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) { KernelAdd op; op.Init(x, y, z); op.Process(); } int main() { int N = ...; size_t size = N * sizeof(uint64); // Initialize aclrtSetDevice(0); // Create stream aclrtStream stream; aclrtCreateStream(&stream); // Create binary from binary file aclrtBinHandle bin; aclrtBinaryLoadFromFile("add_custom.o", nullptr, &bin); // Get function handle from binary aclrtFuncHandle add_custom; aclrtBinaryGetFunction(bin, "add_custom", &add_custom); // Allocate vectors in host memory void *h_x, *h_y, *h_z; aclrtMallocHost(&h_x, size); aclrtMallocHost(&h_y, size); aclrtMallocHost(&h_z, size); // Initialize input vectors ... // Allocate vectors in device memory void *d_x, *d_y, *d_z; aclrtMalloc(&d_x, size, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc(&d_y, size, ACL_MEM_MALLOC_HUGE_FIRST); aclrtMalloc(&d_z, size, ACL_MEM_MALLOC_HUGE_FIRST); // Copy vectors from host memory to device memory aclrtMemcpy(d_x, size, h_x, size, ACL_MEMCPY_HOST_TO_DEVICE); aclrtMemcpy(d_y, size, h_y, size, ACL_MEMCPY_HOST_TO_DEVICE); // Invoke kernel uint32_t numBlocks = 48; void* args[] = {d_x, d_y, d_z}; size_t argsSize = 3 * sizeof(void*); aclrtLaunchKernelWithHostArgs(add_custom, numBlocks, stream, nullptr, args, argsSize, nullptr, 0); ... } |
父主题: CANN自定义算子调用