核函数开发完成后，即可编写host侧的核函数调用程序，实现从host侧的APP程序调用算子，进行运行验证。本节将会介绍两种运行验证方法：

使用ICPU_RUN_KF CPU调测宏完成算子核函数CPU侧运行验证；
使用<<<>>>内核调用符完成算子核函数NPU侧运行验证。

在进行算子调用前，请确保已经完成了Ascend C算子实现文件KERNEL_NAME.cpp的编写。除此之外，还需要准备以下文件：

调用算子的应用程序：main.cpp。
输入数据和真值数据生成脚本文件：add_custom.py。
编译cpu侧或npu侧运行的算子的编译工程文件：CMakeLists.txt。
编译运行算子的脚本：run.sh。

您可以单击LINK，获取核函数开发和运行验证的完整样例。

输入数据和真值数据生成脚本文件：KERNEL_NAME.py。

根据算子的输入输出编写生成输入数据和真值数据的脚本。

此处以固定shape的add_custom算子为例：

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Copyright 2022-2023 Huawei Technologies Co., Ltd
import numpy as np


def gen_golden_data_simple():
    input_x = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
    input_y = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
    golden = (input_x + input_y).astype(np.float16)
    input_x.tofile("./input/input_x.bin")
    input_y.tofile("./input/input_y.bin")
    golden.tofile("./output/golden.bin")


if __name__ == "__main__":
    gen_golden_data_simple()

编译工程文件：CMakeLists.txt。
用于编译cpu侧或npu侧运行的Ascend C算子。主要关注CMakeLists.txt中源文件是否全部列全。

调用算子的应用程序：main.cpp。

下面的代码以固定shape的add_custom算子为例。您在实现自己的应用程序时，需要关注由于算子核函数不同带来的修改，包括算子核函数名，入参出参的不同等，合理安排相应的内存分配、内存拷贝和文件读写等，相关API的调用方式直接复用即可。

/*
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
 * This file constains code of cpu debug and npu code.We read data from bin file
 * and write result to file.
 */
#include "data_utils.h"
#ifndef __CCE_KT_TEST__
#include "acl/acl.h"
extern void add_custom_do(uint32_t coreDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z);
#else
#include "tikicpulib.h"
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z);
#endif

int32_t main(int32_t argc, char* argv[])
{
    size_t inputByteSize = 8 * 2048 * sizeof(uint16_t);  // uint16_t represent half
    size_t outputByteSize = 8 * 2048 * sizeof(uint16_t);  // uint16_t represent half
    uint32_t blockDim = 8;

#ifdef __CCE_KT_TEST__
    uint8_t* x = (uint8_t*)AscendC::GmAlloc(inputByteSize);
    uint8_t* y = (uint8_t*)AscendC::GmAlloc(inputByteSize);
    uint8_t* z = (uint8_t*)AscendC::GmAlloc(outputByteSize);

    ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize);
    ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize);

    AscendC::SetKernelMode(KernelMode::AIV_MODE);
    ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug

    WriteFile("./output/output_z.bin", z, outputByteSize);

    AscendC::GmFree((void *)x);
    AscendC::GmFree((void *)y);
    AscendC::GmFree((void *)z);
#else
    CHECK_ACL(aclInit(nullptr));
    aclrtContext context;
    int32_t deviceId = 0;
    CHECK_ACL(aclrtSetDevice(deviceId));
    CHECK_ACL(aclrtCreateContext(&context, deviceId));
    aclrtStream stream = nullptr;
    CHECK_ACL(aclrtCreateStream(&stream));

    uint8_t *xHost, *yHost, *zHost;
    uint8_t *xDevice, *yDevice, *zDevice;
    CHECK_ACL(aclrtMallocHost((void**)(&xHost), inputByteSize));
    CHECK_ACL(aclrtMallocHost((void**)(&yHost), inputByteSize));
    CHECK_ACL(aclrtMallocHost((void**)(&zHost), outputByteSize));
    CHECK_ACL(aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void**)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));

    ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize);
    ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize);
    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
    CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));

    add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice);
    CHECK_ACL(aclrtSynchronizeStream(stream));

    CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
    WriteFile("./output/output_z.bin", zHost, outputByteSize);

    CHECK_ACL(aclrtFree(xDevice));
    CHECK_ACL(aclrtFree(yDevice));
    CHECK_ACL(aclrtFree(zDevice));
    CHECK_ACL(aclrtFreeHost(xHost));
    CHECK_ACL(aclrtFreeHost(yHost));
    CHECK_ACL(aclrtFreeHost(zHost));

    CHECK_ACL(aclrtDestroyStream(stream));
    CHECK_ACL(aclrtDestroyContext(context));
    CHECK_ACL(aclrtResetDevice(deviceId));
    CHECK_ACL(aclFinalize());
#endif
    return 0;
}

一键式编译运行脚本run.sh，编译和运行应用程序。脚本执行方式如下：
```
bash run.sh <kernel_name> <soc_version> <core_type> <run_mode>
```
- <kernel_name>表示需要运行的算子。
- <soc_version>表示算子运行的AI处理器型号。
- <core_type>表示在AiCore上或者VectorCore上运行，参数取值为AiCore/VectorCore。
- <run_mode>表示算子以cpu模式或npu模式运行，参数取值为cpu/npu。

用户可以基于此章节提供的文件进行快速编译，并在cpu侧和npu侧执行Ascend C算子。使用时请根据实际情况进行修改。

注意点如下：

为方便run.sh调用，将Ascend C算子实现文件和数据生成脚本文件都命名为FILE_NAME，将可执行入口文件统一命名为main.cpp，用户需自行查看并确保CMakeLists.txt和run.sh中涉及到的文件名正确。

根据Ascend C算子的算法原理的不同，自行实现输入和真值数据的生成脚本；由于每个算子的输入输出个数各不相同，需修改run.sh中数据对比部分，当前使用md5sum对比了所有输出bin文件。此处使用md5sum进行结果比对的方式仅作为示例，用户可自己针对使用场景选择合适的精度比对方式，比如使用numpy中的接口自行实现。
run.sh中CANN软件包的安装位置ASCEND_HOME_DIR，请替换为实际安装路径。

核函数运行验证