昇腾社区首页
中文
注册

Host侧代码与说明

以Add算子为例,计算时,需要将两个类型为float的tensor逐元素相加,每个tensor包含64个元素。

首先在Host侧为两个入参以及结果创建空间并赋初始值;其次在Device侧创建参数指针并分配空间;再将初始值拷贝到Device侧,即可拉起Device侧的核函数。核函数的具体实现请参见“Device侧代码与说明”。

Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比,最后释放空间与数据流即可。
// host侧文件:main.cce
// 直接通过<<<>>>异构调用语法调用device侧kernel
#include "acl/acl.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Number of float elements in each input tensor and in the output tensor.
#define NUM 64
// Device-side kernel, declared here and implemented in the device-side source.
// It receives two input buffers and one output buffer, all in __gm__ memory;
// per the accompanying description it adds the two inputs element by element.
extern "C" __global__ [aicore] void add_kernel(__gm__ float* __restrict__ input_1, __gm__ float* __restrict__ input_2, __gm__ float* __restrict__ output);
int main()
{
    aclrtStream stream;
    uint64_t i = 0;
    void * input_1 = NULL;
    void * input_2 = NULL;
    void * output  = NULL;
    aclrtSetDevice(0);
    // 创建数据流
    aclrtCreateStream(&stream);
    // 参数初始化
    float data_1[NUM] = {0};
    float data_2[NUM] = {0};
    
    for (i = 0; i < NUM; i++) {
        data_1[i] = 1;
        data_2[i] = 2;
    }
    float data_res[NUM] = {0};
    // 在Device上分配参数空间
    aclrtMalloc((void **)&input_1, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
    aclrtMalloc((void **)&input_2, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
    aclrtMalloc((void **)&output, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
    // 将Host侧数据拷贝到Device侧
    aclrtMemcpyAsync((void *)input_1, sizeof(data_1), data_1, sizeof(data_1), ACL_MEMCPY_HOST_TO_DEVICE, stream);
    aclrtMemcpyAsync((void *)input_2, sizeof(data_2), data_2, sizeof(data_2), ACL_MEMCPY_HOST_TO_DEVICE, stream);
    aclrtMemcpyAsync((void *)output, sizeof(data_res), data_res, sizeof(data_res), ACL_MEMCPY_HOST_TO_DEVICE, stream);
    // 启动Device侧核函数
    add_kernel<<<1, nullptr, stream>>>((float*)input_1, (float*)input_2, (float*)output);
    // 获取Device执行结果,并拷贝到Host
    float *hostMemOut;
    aclrtMallocHost((void**)&hostMemOut, NUM);
    aclrtMemcpyAsync(hostMemOut, NUM*sizeof(float), output, NUM*sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST, stream);
    aclrtSynchronizeStream(stream);
    //计算golden输出
    float golden[NUM] = {0};
    for (i = 0; i < NUM; i++) {
        golden[i] += data_1[i] + data_2[i];
    }
    // 对比结果
    for (i = 0; i < NUM; i++) {
        printf("i%ld\t Expect: %f\t\t\t\tResult: %f\n", 
            i,  golden[i], *((float *)hostMemOut + i));
    }
    // 释放数据空间以及数据流
    aclrtFreeHost(hostMemOut);
    aclrtDestroyStream(stream);
    aclrtResetDevice(0);
}