快速上手
本节介绍异构编程环境配置与编译器使用的简单用例,方便用户快速验证环境信息,熟悉毕昇编译器的使用。
毕昇编译器安装和环境配置
毕昇编译器跟随CANN软件包一起发布。安装完CANN包后,毕昇编译器所在目录为:${INSTALL_DIR}/compiler/ccec_compiler。
${INSTALL_DIR}请替换为CANN软件安装后文件存储路径。例如,若安装的是Ascend-cann-toolkit软件包,则安装后文件存储路径为:$HOME/Ascend/ascend-toolkit/latest。
编程开始前,需要配置毕昇编译器二进制程序相关环境变量,有设置PATH环境变量和配置CANN环境变量两种方式:
- 方式一:设置PATH环境变量。
# 获取CANN包中的毕昇编译器安装目录,举例如下:
$ export PATH=${INSTALL_DIR}/compiler/ccec_compiler/bin/:$PATH
- 方式二:配置CANN环境变量。
示例一:使用AscendCL Runtime接口
本示例简单演示一个异构程序,启动4个block(核)的kernel函数,每个block写一份自己的数据,host侧使用ACL Runtime接口进行运行时管理。
// 文件名QuickStartDemo.cce #include "acl/acl.h" #include <stdio.h> #include <stdlib.h> #ifdef ASCENDC_CPU_DEBUG #define __aicore__ #else #define __aicore__ [aicore] #endif #define BLOCKS 4 #define CACHELINE_SZ 64 // Define a kernel __global__ __aicore__ void foo(__gm__ uint8_t *Out, int Stride) { Out[block_idx * Stride] = block_idx; } int main(int argc, char *argv[]) { aclInit(nullptr); aclrtSetDevice(0); aclrtStream stream; aclrtCreateStream(&stream); uint8_t ExpectedValue[] = {0, 1, 2, 3}; uint8_t *OutputValue = nullptr; aclrtMalloc((void **)&OutputValue, BLOCKS, ACL_MEM_MALLOC_HUGE_FIRST); uint8_t InitValue[BLOCKS] = {0}; aclrtMemcpyAsync((void *)OutputValue, sizeof(InitValue), InitValue, sizeof(InitValue), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtSynchronizeStream(stream); // Invoke a kernel foo<<<BLOCKS, nullptr, stream>>>(OutputValue, CACHELINE_SZ); uint8_t *OutHost = nullptr; aclrtMallocHost((void **)&OutHost, BLOCKS * CACHELINE_SZ); aclrtMemcpyAsync(OutHost, BLOCKS * CACHELINE_SZ, OutputValue, BLOCKS * CACHELINE_SZ, ACL_MEMCPY_DEVICE_TO_HOST, stream); aclrtSynchronizeStream(stream); for (int I = 0; I < sizeof(ExpectedValue) / sizeof(uint8_t); I++) { printf("i%d\t Expect: 0x%04x\t\t\t\tResult: 0x%04x\n", I, ExpectedValue[I], OutHost[I * CACHELINE_SZ]); } aclrtFreeHost(OutHost); aclrtFree(OutputValue); aclrtDestroyStream(stream); aclrtResetDevice(0); aclFinalize(); return 0; }
编译命令如下,编译选项的具体介绍请参考异构编译。
# CANN软件包中的runtime路径
export RT_INC=${INSTALL_DIR}/runtime/include
export RT_LIB=${INSTALL_DIR}/runtime/lib64
# 功能:Host & Device代码混合编译,生成可执行文件,需链接libascendcl.so和libruntime.so
# 编译选项--cce-soc-version和--cce-soc-core-type指的是编译AscendXXXYY上的Vector核程序
$ bisheng -O2 --cce-soc-version=AscendXXXYY --cce-soc-core-type=VecCore -I$RT_INC -L$RT_LIB -lascendcl -lruntime QuickStartDemo.cce -o QuickStartDemo
运行结果如下:
$ ./QuickStartDemo
i0	 Expect: 0x0000				Result: 0x0000
i1	 Expect: 0x0001				Result: 0x0001
i2	 Expect: 0x0002				Result: 0x0002
i3	 Expect: 0x0003				Result: 0x0003
示例二:使用Ascend C开发算子
本示例代码基于Ascend C实现了一个Add矢量算子。
// File: QuickStartDemoVecAdd.cce
// Ascend C example: element-wise float Add operator (z = x + y) over
// TOTAL_LENGTH elements, split evenly across USE_CORE_NUM cores; each core
// processes its slice tile by tile through TQue input/output queues.
#include "acl/acl.h"
#include <stdio.h>
#include <stdlib.h>
#ifdef ASCENDC_CPU_DEBUG
#define __aicore__
#else
#define __aicore__ [aicore]
#endif
constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data
constexpr int32_t USE_CORE_NUM = 8; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core
constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separated into 2 parts, due to double buffering
// ---------- Device side code ------------------------------
#include "kernel_operator.h"
// Kernel: each core adds its BLOCK_LENGTH-sized slice of x and y into z.
// x, y, z are global-memory base pointers shared by all cores; the per-core
// offset is derived from GetBlockIdx().
__global__ __aicore__ void VecAdd(__gm__ float *x, __gm__ float *y, __gm__ float *z) {
    using namespace AscendC;
    TPipe pipe;
    // Input queues for x and y tiles, output queue for z tiles.
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
    GlobalTensor<float> xGm;
    GlobalTensor<float> yGm;
    GlobalTensor<float> zGm;
    // Bind each global tensor to this core's slice of the input/output arrays.
    xGm.SetGlobalBuffer((__gm__ float*)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    yGm.SetGlobalBuffer((__gm__ float*)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    zGm.SetGlobalBuffer((__gm__ float*)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    // Back each queue with BUFFER_NUM buffers of one tile each.
    pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
    LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
    LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
    uint32_t loopCount = TILE_NUM * BUFFER_NUM;
    for (uint32_t i = 0; i < loopCount; i++) {
        // Copy-in: stage one tile of x and y from global to local memory.
        DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH);
        DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH);
        // EnQue/DeQue pairs order the compute stage after the copy-in stage.
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
        xLocal = inQueueX.DeQue<float>();
        yLocal = inQueueY.DeQue<float>();
        // Compute: element-wise add over the tile.
        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
        // Order the copy-out stage after compute, then write the tile back.
        outQueueZ.EnQue<float>(zLocal);
        zLocal = outQueueZ.DeQue<float>();
        DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH);
    }
    // Return all local tensors to their queues' buffer pools.
    inQueueX.FreeTensor(xLocal);
    inQueueY.FreeTensor(yLocal);
    outQueueZ.FreeTensor(zLocal);
}
// Host side: allocates buffers, launches the kernel, verifies z == x + y.
int main(int argc, char *argv[]) {
    size_t inputByteSize = TOTAL_LENGTH * sizeof(float);
    size_t outputByteSize = TOTAL_LENGTH * sizeof(float);
    // NOTE(review): blockDim is unused — the launch below uses USE_CORE_NUM.
    uint32_t blockDim = 8;
    // AscendCL initialization
    aclInit(nullptr);
    // Acquire runtime management resources (device, context, stream)
    aclrtContext context;
    int32_t deviceId = 0;
    aclrtSetDevice(deviceId);
    aclrtCreateContext(&context, deviceId);
    aclrtStream stream = nullptr;
    aclrtCreateStream(&stream);
    // Allocate host memory
    float *xHost, *yHost, *zHost;
    float *xDevice, *yDevice, *zDevice;
    aclrtMallocHost((void**)(&xHost), inputByteSize);
    aclrtMallocHost((void**)(&yHost), inputByteSize);
    aclrtMallocHost((void**)(&zHost), outputByteSize);
    // Allocate device memory
    aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void**)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    // Initialize host memory: x = 1.0, y = 2.0, so every z element should be 3.0
    for (int i = 0; i < TOTAL_LENGTH; ++i) { xHost[i] = 1.0f; yHost[i] = 2.0f; }
    aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    // Launch the kernel with the <<<>>> kernel-call operator
    VecAdd<<<USE_CORE_NUM, nullptr, stream>>>(xDevice, yDevice, zDevice);
    aclrtSynchronizeStream(stream);
    // Copy the computation result from device back to host
    aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
// presumably kernel_operator.h defines a printf macro; restore the host printf — TODO confirm
#undef printf
    for (int i = 0; i < TOTAL_LENGTH; i++) {
        printf("i%d\t Expect: %f\t\t\t\tResult: %f\n", i, 3.0f, zHost[i]);
    }
    // Release the acquired resources
    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(xHost);
    aclrtFreeHost(yHost);
    aclrtFreeHost(zHost);
    // AscendCL de-initialization
    aclrtDestroyStream(stream);
    aclrtDestroyContext(context);
    aclrtResetDevice(deviceId);
    aclFinalize();
    return 0;
}
编译命令如下,编译选项的具体介绍请参考异构编译。
export RT_INC=${INSTALL_DIR}/runtime/include
export RT_LIB=${INSTALL_DIR}/runtime/lib64
# 功能:Host & Device代码混合编译,生成可执行文件,需链接libascendcl.so和libruntime.so
# 编译选项--cce-soc-version和--cce-soc-core-type指的是编译AscendXXXYY上的Vector核程序
$ bisheng -O2 --cce-soc-version=AscendXXXYY --cce-soc-core-type=VecCore -I$RT_INC -L$RT_LIB -lascendcl -lruntime QuickStartDemoVecAdd.cce -o QuickStartDemoVecAdd -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/ -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/impl -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/interface --std=c++17
运行结果如下:
$ export LD_LIBRARY_PATH=$RT_LIB:$LD_LIBRARY_PATH
$ ./QuickStartDemoVecAdd
i0	 Expect: 3.000000				Result: 3.000000
i1	 Expect: 3.000000				Result: 3.000000
i2	 Expect: 3.000000				Result: 3.000000
i3	 Expect: 3.000000				Result: 3.000000
i4	 Expect: 3.000000				Result: 3.000000
i5	 Expect: 3.000000				Result: 3.000000
i6	 Expect: 3.000000				Result: 3.000000
i7	 Expect: 3.000000				Result: 3.000000
i8	 Expect: 3.000000				Result: 3.000000
i9	 Expect: 3.000000				Result: 3.000000
i10	 Expect: 3.000000				Result: 3.000000
...
i16383	 Expect: 3.000000				Result: 3.000000