核函数开发完成后,即可编写host侧的核函数调用程序,实现从host侧的APP程序调用算子,进行运行验证。本节将会介绍两种运行验证方法:
在进行算子调用前,请确保已经完成了Ascend C算子实现文件KERNEL_NAME.cpp的编写。除此之外,还需要准备以下文件:
您可以单击LINK,获取核函数开发和运行验证的完整样例。
根据算子的输入输出编写生成输入数据和真值数据的脚本。
此处以固定shape的add_custom算子为例:
#!/usr/bin/python3 # -*- coding:utf-8 -*- # Copyright 2022-2023 Huawei Technologies Co., Ltd import numpy as np def gen_golden_data_simple(): input_x = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16) input_y = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16) golden = (input_x + input_y).astype(np.float16) input_x.tofile("./input/input_x.bin") input_y.tofile("./input/input_y.bin") golden.tofile("./output/golden.bin") if __name__ == "__main__": gen_golden_data_simple()
/* * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. * This file constains code of cpu debug and npu code.We read data from bin file * and write result to file. */ #include "data_utils.h" #ifndef __CCE_KT_TEST__ #include "acl/acl.h" extern void add_custom_do(uint32_t coreDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z); #else #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z); #endif int32_t main(int32_t argc, char* argv[]) { size_t inputByteSize = 8 * 2048 * sizeof(uint16_t); // uint16_t represent half size_t outputByteSize = 8 * 2048 * sizeof(uint16_t); // uint16_t represent half uint32_t blockDim = 8; #ifdef __CCE_KT_TEST__ uint8_t* x = (uint8_t*)AscendC::GmAlloc(inputByteSize); uint8_t* y = (uint8_t*)AscendC::GmAlloc(inputByteSize); uint8_t* z = (uint8_t*)AscendC::GmAlloc(outputByteSize); ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); AscendC::SetKernelMode(KernelMode::AIV_MODE); ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug WriteFile("./output/output_z.bin", z, outputByteSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); AscendC::GmFree((void *)z); #else CHECK_ACL(aclInit(nullptr)); aclrtContext context; int32_t deviceId = 0; CHECK_ACL(aclrtSetDevice(deviceId)); CHECK_ACL(aclrtCreateContext(&context, deviceId)); aclrtStream stream = nullptr; CHECK_ACL(aclrtCreateStream(&stream)); uint8_t *xHost, *yHost, *zHost; uint8_t *xDevice, *yDevice, *zDevice; CHECK_ACL(aclrtMallocHost((void**)(&xHost), inputByteSize)); CHECK_ACL(aclrtMallocHost((void**)(&yHost), inputByteSize)); CHECK_ACL(aclrtMallocHost((void**)(&zHost), outputByteSize)); CHECK_ACL(aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void**)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); WriteFile("./output/output_z.bin", zHost, outputByteSize); CHECK_ACL(aclrtFree(xDevice)); CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFree(zDevice)); CHECK_ACL(aclrtFreeHost(xHost)); CHECK_ACL(aclrtFreeHost(yHost)); CHECK_ACL(aclrtFreeHost(zHost)); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtDestroyContext(context)); CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); #endif return 0; }
bash run.sh <kernel_name> <soc_version> <core_type> <run_mode>
用户可以基于此章节提供的文件进行快速编译,并在cpu侧和npu侧执行Ascend C算子。使用时请根据实际情况进行修改。
注意点如下: