快速上手
本节介绍异构编程环境配置与编译器使用的简单用例,方便用户快速验证环境信息,熟悉毕昇编译器的使用。
毕昇编译器安装和环境配置
毕昇编译器跟随CANN软件包一起发布。安装完CANN包后,毕昇编译器所在目录为:${INSTALL_DIR}/compiler/ccec_compiler。
${INSTALL_DIR}请替换为CANN软件安装后文件存储路径。例如,若安装的是Ascend-cann-toolkit软件包,则安装后文件存储路径为:$HOME/Ascend/ascend-toolkit/latest。
编程开始前,需要配置毕昇编译器二进制程序相关环境变量,有设置PATH环境变量和配置CANN环境变量两种方式:
- 方式一:设置PATH环境变量。
# 获取CANN包中的毕昇编译器安装目录,举例如下:
$ export PATH=${INSTALL_DIR}/compiler/ccec_compiler/bin/:$PATH
- 方式二:配置CANN环境变量。
示例一:使用AscendCL Runtime接口
本示例简单演示一个异构程序,启动4个block(核)的kernel函数,每个block写一份自己的数据,host侧使用ACL Runtime接口进行运行时管理。
// 文件名QuickStartDemo.cce #include "acl/acl.h" #include <stdio.h> #include <stdlib.h> #ifdef ASCENDC_CPU_DEBUG #define __aicore__ #else #define __aicore__ [aicore] #endif #define BLOCKS 4 #define CACHELINE_SZ 64 // Define a kernel __global__ __aicore__ void foo(__gm__ uint8_t *Out, int Stride) { Out[block_idx * Stride] = block_idx; } int main(int argc, char *argv[]) { aclInit(nullptr); aclrtSetDevice(0); aclrtStream stream; aclrtCreateStream(&stream); uint8_t ExpectedValue[] = {0, 1, 2, 3}; uint8_t *OutputValue = nullptr; aclrtMalloc((void **)&OutputValue, BLOCKS, ACL_MEM_MALLOC_HUGE_FIRST); uint8_t InitValue[BLOCKS] = {0}; aclrtMemcpyAsync((void *)OutputValue, sizeof(InitValue), InitValue, sizeof(InitValue), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtSynchronizeStream(stream); // Invoke a kernel foo<<<BLOCKS, nullptr, stream>>>(OutputValue, CACHELINE_SZ); uint8_t *OutHost = nullptr; aclrtMallocHost((void **)&OutHost, BLOCKS * CACHELINE_SZ); aclrtMemcpyAsync(OutHost, BLOCKS * CACHELINE_SZ, OutputValue, BLOCKS * CACHELINE_SZ, ACL_MEMCPY_DEVICE_TO_HOST, stream); aclrtSynchronizeStream(stream); for (int I = 0; I < sizeof(ExpectedValue) / sizeof(uint8_t); I++) { printf("i%d\t Expect: 0x%04x\t\t\t\tResult: 0x%04x\n", I, ExpectedValue[I], OutHost[I * CACHELINE_SZ]); } aclrtFreeHost(OutHost); aclrtFree(OutputValue); aclrtDestroyStream(stream); aclrtResetDevice(0); aclFinalize(); return 0; }
编译命令如下,编译选项的具体介绍请参考异构编译。
# CANN软件包中的runtime路径
export RT_INC=${INSTALL_DIR}/runtime/include
export RT_LIB=${INSTALL_DIR}/runtime/lib64
# 功能:Host & Device代码混合编译,生成可执行文件,需链接libascendcl.so和libruntime.so
# 编译选项--cce-soc-version和--cce-soc-core-type指的是编译AscendXXXYY上的Vector核程序
$ bisheng -O2 --cce-soc-version=AscendXXXYY --cce-soc-core-type=VecCore -I$RT_INC -L$RT_LIB -lascendcl -lruntime QuickStartDemo.cce -o QuickStartDemo
运行结果如下:
$ ./QuickStartDemo
i0	 Expect: 0x0000				Result: 0x0000
i1	 Expect: 0x0001				Result: 0x0001
i2	 Expect: 0x0002				Result: 0x0002
i3	 Expect: 0x0003				Result: 0x0003
示例二:使用Ascend C开发算子
本示例代码基于Ascend C实现了一个Add矢量算子。
// File: QuickStartDemoVecAdd.cce
// Ascend C example: element-wise float Add operator (z = x + y) over
// TOTAL_LENGTH elements, split evenly across USE_CORE_NUM cores; each core
// processes its slice tile by tile through TQue input/output queues.
#include "acl/acl.h"
#include <stdio.h>
#include <stdlib.h>
#ifdef ASCENDC_CPU_DEBUG
#define __aicore__
#else
#define __aicore__ [aicore]
#endif
constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data
constexpr int32_t USE_CORE_NUM = 8; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core
constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separated into 2 parts, due to double buffering
// ---------- Device side code ------------------------------
#include "kernel_operator.h"
// Kernel: each core adds its BLOCK_LENGTH-sized slice of x and y into z.
// x, y, z are global-memory base pointers shared by all cores; the per-core
// offset is derived from GetBlockIdx().
__global__ __aicore__ void VecAdd(__gm__ float *x, __gm__ float *y, __gm__ float *z) {
    using namespace AscendC;
    TPipe pipe;
    // Input queues for x and y tiles, output queue for z tiles.
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
    GlobalTensor<float> xGm;
    GlobalTensor<float> yGm;
    GlobalTensor<float> zGm;
    // Bind each global tensor to this core's slice of the input/output arrays.
    xGm.SetGlobalBuffer((__gm__ float*)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    yGm.SetGlobalBuffer((__gm__ float*)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    zGm.SetGlobalBuffer((__gm__ float*)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
    // Back each queue with BUFFER_NUM buffers of one tile each.
    pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(float));
    LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
    LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
    LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
    uint32_t loopCount = TILE_NUM * BUFFER_NUM;
    for (uint32_t i = 0; i < loopCount; i++) {
        // Copy-in: stage one tile of x and y from global to local memory.
        DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH);
        DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH);
        // EnQue/DeQue pairs order the compute stage after the copy-in stage.
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
        xLocal = inQueueX.DeQue<float>();
        yLocal = inQueueY.DeQue<float>();
        // Compute: element-wise add over the tile.
        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
        // Order the copy-out stage after compute, then write the tile back.
        outQueueZ.EnQue<float>(zLocal);
        zLocal = outQueueZ.DeQue<float>();
        DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH);
    }
    // Return all local tensors to their queues' buffer pools.
    inQueueX.FreeTensor(xLocal);
    inQueueY.FreeTensor(yLocal);
    outQueueZ.FreeTensor(zLocal);
}
// Host side: allocates buffers, launches the kernel, verifies z == x + y.
int main(int argc, char *argv[]) {
    size_t inputByteSize = TOTAL_LENGTH * sizeof(float);
    size_t outputByteSize = TOTAL_LENGTH * sizeof(float);
    // NOTE(review): blockDim is unused — the launch below uses USE_CORE_NUM.
    uint32_t blockDim = 8;
    // AscendCL initialization
    aclInit(nullptr);
    // Acquire runtime management resources (device, context, stream)
    aclrtContext context;
    int32_t deviceId = 0;
    aclrtSetDevice(deviceId);
    aclrtCreateContext(&context, deviceId);
    aclrtStream stream = nullptr;
    aclrtCreateStream(&stream);
    // Allocate host memory
    float *xHost, *yHost, *zHost;
    float *xDevice, *yDevice, *zDevice;
    aclrtMallocHost((void**)(&xHost), inputByteSize);
    aclrtMallocHost((void**)(&yHost), inputByteSize);
    aclrtMallocHost((void**)(&zHost), outputByteSize);
    // Allocate device memory
    aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void**)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    // Initialize host memory: x = 1.0, y = 2.0, so every z element should be 3.0
    for (int i = 0; i < TOTAL_LENGTH; ++i) { xHost[i] = 1.0f; yHost[i] = 2.0f; }
    aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    // Launch the kernel with the <<<>>> kernel-call operator
    VecAdd<<<USE_CORE_NUM, nullptr, stream>>>(xDevice, yDevice, zDevice);
    aclrtSynchronizeStream(stream);
    // Copy the computation result from device back to host
    aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
// presumably kernel_operator.h defines a printf macro; restore the host printf — TODO confirm
#undef printf
    for (int i = 0; i < TOTAL_LENGTH; i++) {
        printf("i%d\t Expect: %f\t\t\t\tResult: %f\n", i, 3.0f, zHost[i]);
    }
    // Release the acquired resources
    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(xHost);
    aclrtFreeHost(yHost);
    aclrtFreeHost(zHost);
    // AscendCL de-initialization
    aclrtDestroyStream(stream);
    aclrtDestroyContext(context);
    aclrtResetDevice(deviceId);
    aclFinalize();
    return 0;
}
编译命令如下,编译选项的具体介绍请参考异构编译。
export RT_INC=${INSTALL_DIR}/runtime/include
export RT_LIB=${INSTALL_DIR}/runtime/lib64
# 功能:Host & Device代码混合编译,生成可执行文件,需链接libascendcl.so和libruntime.so
# 编译选项--cce-soc-version和--cce-soc-core-type指的是编译AscendXXXYY上的Vector核程序
$ bisheng -O2 --cce-soc-version=AscendXXXYY --cce-soc-core-type=VecCore -I$RT_INC -L$RT_LIB -lascendcl -lruntime QuickStartDemoVecAdd.cce -o QuickStartDemoVecAdd -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/ -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/impl -I${INSTALL_DIR}/compiler/tikcpp/tikcfw/interface --std=c++17
运行结果如下:
$ export LD_LIBRARY_PATH=$RT_LIB:$LD_LIBRARY_PATH
$ ./QuickStartDemoVecAdd
i0	 Expect: 3.000000				Result: 3.000000
i1	 Expect: 3.000000				Result: 3.000000
i2	 Expect: 3.000000				Result: 3.000000
i3	 Expect: 3.000000				Result: 3.000000
i4	 Expect: 3.000000				Result: 3.000000
i5	 Expect: 3.000000				Result: 3.000000
i6	 Expect: 3.000000				Result: 3.000000
i7	 Expect: 3.000000				Result: 3.000000
i8	 Expect: 3.000000				Result: 3.000000
i9	 Expect: 3.000000				Result: 3.000000
i10	 Expect: 3.000000				Result: 3.000000
...
i16383	 Expect: 3.000000				Result: 3.000000