Host侧代码与说明
以Add算子为例,计算时,需要将两个类型为float的tensor逐元素相加,每个tensor包含64个元素。
首先在Host侧为两个入参以及结果创建空间并赋初始值;其次在Device侧创建参数指针并分配空间,再将初始值拷贝到Device侧,之后即可拉起Device侧的核函数。核函数的具体实现参见“Device侧代码与说明”。
Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比,最后释放空间与数据流即可。
// host侧文件:main.cce
// 直接通过<<<>>>异构调用语法调用device侧kernel
#include "acl/acl.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define NUM 64
extern "C" __global__ [aicore] void add_kernel(__gm__ float* __restrict__ input_1, __gm__ float* __restrict__ input_2, __gm__ float* __restrict__ output);
int main()
{
aclrtStream stream;
uint64_t i = 0;
void * input_1 = NULL;
void * input_2 = NULL;
void * output = NULL;
aclrtSetDevice(0);
// 创建数据流
aclrtCreateStream(&stream);
// 参数初始化
float data_1[NUM] = {0};
float data_2[NUM] = {0};
for (i = 0; i < NUM; i++) {
data_1[i] = 1;
data_2[i] = 2;
}
float data_res[NUM] = {0};
// 在Device上分配参数空间
aclrtMalloc((void **)&input_1, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
aclrtMalloc((void **)&input_2, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
aclrtMalloc((void **)&output, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
// 将Host侧数据拷贝到Device侧
aclrtMemcpyAsync((void *)input_1, sizeof(data_1), data_1, sizeof(data_1), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)input_2, sizeof(data_2), data_2, sizeof(data_2), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)output, sizeof(data_res), data_res, sizeof(data_res), ACL_MEMCPY_HOST_TO_DEVICE, stream);
// 启动Device侧核函数
add_kernel<<<1, nullptr, stream>>>((float*)input_1, (float*)input_2, (float*)output);
// 获取Device执行结果,并拷贝到Host
float *hostMemOut;
aclrtMallocHost((void**)&hostMemOut, NUM);
aclrtMemcpyAsync(hostMemOut, NUM*sizeof(float), output, NUM*sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST, stream);
aclrtSynchronizeStream(stream);
//计算golden输出
float golden[NUM] = {0};
for (i = 0; i < NUM; i++) {
golden[i] += data_1[i] + data_2[i];
}
// 对比结果
for (i = 0; i < NUM; i++) {
printf("i%ld\t Expect: %f\t\t\t\tResult: %f\n",
i, golden[i], *((float *)hostMemOut + i));
}
// 释放数据空间以及数据流
aclrtFreeHost(hostMemOut);
aclrtDestroyStream(stream);
aclrtResetDevice(0);
}
父主题: Vector算子示例