RTC
RTC is a runtime compilation library of Ascend C. It dynamically compiles intermediate code into target machine code during program running through the aclrtc API, improving program running performance.
The runtime compilation library provides the following core APIs:
- aclrtcCreateProg: creates an aclrtcProg program instance based on the input parameters (such as the Ascend C source code in string format).
- aclrtcCompileProg: compiles a given program. Compilation options can be customized. For example, you can specify the NPU architecture version number as follows: --npu-arch=dav-2201. For details about the supported compilation options, see Compilation Options.
- aclrtcGetBinDataSize: obtains the size of the binary data on the device after compilation.
- aclrtcGetBinData: obtains the binary data on the device after compilation.
- aclrtcDestroyProg: destroys a given program after the compilation and execution are complete.
After the compilation is complete, call the following APIs to load and execute the kernel (only core APIs are listed). For details about the complete process and API description, see "Kernel Loading and Execution".
- Use the aclrtBinaryLoadFromData API to parse the operator binary data obtained by the aclrtcGetBinData API.
- Obtain the kernel function handle and operate the parameter list based on the kernel function handle. Related APIs include aclrtBinaryGetFunction (for obtaining the kernel function handle), aclrtKernelArgsInit (for initializing the parameter list), and aclrtKernelArgsAppend (for appending the copied user-defined parameter values, such as xDevice, yDevice, and zDevice).
- Call the aclrtLaunchKernelWithConfig API to start the compute task of the corresponding operator.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
#include <iostream> #include <fstream> #include <vector> #include "acl/acl.h" // Header file required for using aclrtc APIs #include "acl/acl_rt_compile.h" #define CHECK_ACL(x) \ do { \ aclError __ret = x; \ if (__ret != ACL_ERROR_NONE) { \ std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ } \ } while (0); int main(int argc, char *argv[]) { // ----------------------------------------------------- aclrtc part ----------------------------------------------------- const char *src = R""""( #include "kernel_operator.h" constexpr int32_t TOTAL_LENGTH = 8 * 1024; // total length of data constexpr int32_t USE_CORE_NUM = 8; // num of core used constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separate to 2 parts, due to double buffer class KernelAdd { public: __aicore__ inline KernelAdd() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) { xGm.SetGlobalBuffer((__gm__ float *)x + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); yGm.SetGlobalBuffer((__gm__ float *)y + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); zGm.SetGlobalBuffer((__gm__ float *)z + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(float)); pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(float)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(float)); } __aicore__ inline void Process() { int32_t loopCount = TILE_NUM * BUFFER_NUM; for (int32_t i = 0; i < loopCount; i++) { CopyIn(i); Compute(i); CopyOut(i); } } private: __aicore__ inline void CopyIn(int32_t progress) { AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>(); AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>(); AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progress) { AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>(); AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>(); AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>(); AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); outQueueZ.EnQue<float>(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } __aicore__ inline void CopyOut(int32_t progress) { AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>(); AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); outQueueZ.FreeTensor(zLocal); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY; AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ; AscendC::GlobalTensor<float> xGm; AscendC::GlobalTensor<float> yGm; AscendC::GlobalTensor<float> zGm; }; extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) { KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); KernelAdd op; op.Init(x, y, z); op.Process(); } )""""; // aclrtc process. src is the source code on the device. aclrtcCreateProg is used to create a compilation program. aclrtcProg prog; CHECK_ACL(aclrtcCreateProg(&prog, src, "add_custom", 0, nullptr, nullptr)); // aclrtc process. Pass the compilation options of the BiSheng Compiler and call aclrtcCompileProg for compilation. const char *options[] = { "--npu-arch=dav-2201", }; int numOptions = sizeof(options) / sizeof(options[0]); CHECK_ACL(aclrtcCompileProg(prog, numOptions, options)); // aclrtc process. Obtain the binary content and its size on the device. size_t binDataSizeRet; CHECK_ACL(aclrtcGetBinDataSize(prog, &binDataSizeRet)); std::vector<char> deviceELF(binDataSizeRet); CHECK_ACL(aclrtcGetBinData(prog, deviceELF.data())); const char *funcName = "add_custom"; // ----------------------------------------------------- aclrt part ----------------------------------------------------- uint32_t blockDim = 8; size_t inputByteSize = 8 * 1024 * sizeof(uint32_t); size_t outputByteSize = 8 * 1024 * sizeof(uint32_t); CHECK_ACL(aclInit(nullptr)); int32_t deviceId = 0; CHECK_ACL(aclrtSetDevice(deviceId)); aclrtStream stream = nullptr; CHECK_ACL(aclrtCreateStream(&stream)); uint8_t *xHost, *yHost, *zHost; uint8_t *xDevice, *yDevice, *zDevice; CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize)); CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); aclrtBinHandle binHandle = nullptr; aclrtBinaryLoadOptions loadOption; loadOption.numOpt = 1; aclrtBinaryLoadOption option; option.type = ACL_RT_BINARY_LOAD_OPT_LAZY_MAGIC; option.value.magic = ACL_RT_BINARY_MAGIC_ELF_VECTOR_CORE; // Set the magic value to indicate that the operator is executed on the Vector Core. loadOption.options = &option; CHECK_ACL(aclrtBinaryLoadFromData(deviceELF.data(), binDataSizeRet, &loadOption, &binHandle)); aclrtFuncHandle funcHandle = nullptr; CHECK_ACL(aclrtBinaryGetFunction(binHandle, funcName, &funcHandle)); aclrtArgsHandle argsHandle = nullptr; aclrtParamHandle paramHandle = nullptr; CHECK_ACL(aclrtKernelArgsInit(funcHandle, &argsHandle)); CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&xDevice, sizeof(uintptr_t), ¶mHandle)); CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&yDevice, sizeof(uintptr_t), ¶mHandle)); CHECK_ACL(aclrtKernelArgsAppend(argsHandle, (void **)&zDevice, sizeof(uintptr_t), ¶mHandle)); CHECK_ACL(aclrtKernelArgsFinalize(argsHandle)); // Kernel function entry CHECK_ACL(aclrtLaunchKernelWithConfig(funcHandle, blockDim, stream, nullptr, argsHandle, nullptr)); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); // Obtain the log size and log string. size_t logSize; CHECK_ACL(aclrtcGetCompileLogSize(prog, &logSize)); char* log = (char*)malloc(logSize); CHECK_ACL(aclrtcGetCompileLog(prog, log)); // Save the log string to a file. /* std::ofstream logFile("compile.log"); if (logFile.is_open()) { logFile << log << std::endl; logFile.close(); std::cout << "already write to compile.log!" << std::endl; } */ CHECK_ACL(aclrtBinaryUnLoad(binHandle)); CHECK_ACL(aclrtFree(xDevice)); CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFree(zDevice)); CHECK_ACL(aclrtFreeHost(xHost)); CHECK_ACL(aclrtFreeHost(yHost)); CHECK_ACL(aclrtFreeHost(zHost)); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); // The compilation and running are complete. Destroy the program. CHECK_ACL(aclrtcDestroyProg(&prog)); return 0; } |
The compilation command is as follows. During compilation, set -I to ${INSTALL_DIR}/include to find the aclrtc header files, and link to the alc_rtc dynamic library.
g++ add_custom.cpp -I${INSTALL_DIR}/include -L${INSTALL_DIR}/lib64 -lascendcl -lacl_rtc -o main
Replace ${INSTALL_DIR} with the CANN component directory. For example, if the installation is performed by the root user, the default file storage path is /usr/local/Ascend/cann.