核函数运行验证

核函数开发完成后,即可编写host侧的核函数调用程序,实现从host侧的APP程序调用算子,进行运行验证。本节将会介绍两种运行验证方法:

在进行算子调用前,请确保已经完成了Ascend C算子实现文件KERNEL_NAME.cpp的编写。除此之外,还需要准备以下文件:

您可以单击LINK,获取核函数开发和运行验证的完整样例。

  1. 输入数据和真值数据生成脚本文件:KERNEL_NAME.py。

    根据算子的输入输出编写生成输入数据和真值数据的脚本。

    此处以固定shape的add_custom算子为例:

    #!/usr/bin/python3
    # -*- coding:utf-8 -*-
    # Copyright 2022-2023 Huawei Technologies Co., Ltd
    import numpy as np
    
    
    def gen_golden_data_simple():
        input_x = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
        input_y = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
        golden = (input_x + input_y).astype(np.float16)
        input_x.tofile("./input/input_x.bin")
        input_y.tofile("./input/input_y.bin")
        golden.tofile("./output/golden.bin")
    
    
    if __name__ == "__main__":
        gen_golden_data_simple()
  2. 编译工程文件:CMakeLists.txt。

    用于编译cpu侧或npu侧运行的Ascend C算子。主要关注CMakeLists.txt中源文件是否全部列全。

  3. 调用算子的应用程序:main.cpp。
    下面的代码以固定shape的add_custom算子为例。您在实现自己的应用程序时,需要关注由于算子核函数不同带来的修改,包括算子核函数名,入参出参的不同等,合理安排相应的内存分配、内存拷贝和文件读写等,相关API的调用方式直接复用即可。
    /*
     * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
     * This file constains code of cpu debug and npu code.We read data from bin file
     * and write result to file.
     */
    #include "data_utils.h"
    #ifndef __CCE_KT_TEST__
    #include "acl/acl.h"
    extern void add_custom_do(uint32_t coreDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z);
    #else
    #include "tikicpulib.h"
    extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z);
    #endif
    
    int32_t main(int32_t argc, char* argv[])
    {
        size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);  // uint16_t represent half
        size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);  // uint16_t represent half
        uint32_t blockDim = 8;
    
    #ifdef __CCE_KT_TEST__
        uint8_t* x = (uint8_t*)AscendC::GmAlloc(inputByteSize);
        uint8_t* y = (uint8_t*)AscendC::GmAlloc(inputByteSize);
        uint8_t* z = (uint8_t*)AscendC::GmAlloc(outputByteSize);
    
        ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize);
        ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize);
    
        AscendC::SetKernelMode(KernelMode::AIV_MODE);
        ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug
    
        WriteFile("./output/output_z.bin", z, outputByteSize);
    
        AscendC::GmFree((void *)x);
        AscendC::GmFree((void *)y);
        AscendC::GmFree((void *)z);
    #else
        CHECK_ACL(aclInit(nullptr));
        aclrtContext context;
        int32_t deviceId = 0;
        CHECK_ACL(aclrtSetDevice(deviceId));
        CHECK_ACL(aclrtCreateContext(&context, deviceId));
        aclrtStream stream = nullptr;
        CHECK_ACL(aclrtCreateStream(&stream));
    
        uint8_t *xHost, *yHost, *zHost;
        uint8_t *xDevice, *yDevice, *zDevice;
        CHECK_ACL(aclrtMallocHost((void**)(&xHost), inputByteSize));
        CHECK_ACL(aclrtMallocHost((void**)(&yHost), inputByteSize));
        CHECK_ACL(aclrtMallocHost((void**)(&zHost), outputByteSize));
        CHECK_ACL(aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
        CHECK_ACL(aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
        CHECK_ACL(aclrtMalloc((void**)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
    
        ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize);
        ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize);
        CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
        CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
    
        add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice);
        CHECK_ACL(aclrtSynchronizeStream(stream));
    
        CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
        WriteFile("./output/output_z.bin", zHost, outputByteSize);
    
        CHECK_ACL(aclrtFree(xDevice));
        CHECK_ACL(aclrtFree(yDevice));
        CHECK_ACL(aclrtFree(zDevice));
        CHECK_ACL(aclrtFreeHost(xHost));
        CHECK_ACL(aclrtFreeHost(yHost));
        CHECK_ACL(aclrtFreeHost(zHost));
    
        CHECK_ACL(aclrtDestroyStream(stream));
        CHECK_ACL(aclrtDestroyContext(context));
        CHECK_ACL(aclrtResetDevice(deviceId));
        CHECK_ACL(aclFinalize());
    #endif
        return 0;
    }
  4. 一键式编译运行脚本run.sh,编译和运行应用程序。脚本执行方式如下:
    bash run.sh <kernel_name> <soc_version> <core_type> <run_mode>
    • <kernel_name>表示需要运行的算子。
    • <soc_version>表示算子运行的AI处理器型号。
    • <core_type>表示在AiCore上或者VectorCore上运行,参数取值为AiCore/VectorCore。
    • <run_mode>表示算子以cpu模式或npu模式运行,参数取值为cpu/npu。

用户可以基于此章节提供的文件进行快速编译,并在cpu侧和npu侧执行Ascend C算子。使用时请根据实际情况进行修改。

注意点如下: