Host侧代码与说明
以Add算子为例,计算时,需要将两个类型为float的tensor逐元素相加,每个tensor包含64个元素。
首先在Host侧为两个入参以及结果创建空间并赋初始值;其次在Device侧创建参数指针并分配空间,再将初始值拷贝到Device侧,之后即可拉起Device侧的核函数。核函数的具体实现参见“Device侧代码与说明”。
Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比,最后释放空间与数据流即可。
// host侧文件:main.cce // 直接通过<<<>>>异构调用语法调用device侧kernel #include "acl/acl.h" #include <stdio.h> #include <string.h> #include <stdlib.h> #define NUM 64 extern "C" __global__ [aicore] void add_kernel(__gm__ float* __restrict__ input_1, __gm__ float* __restrict__ input_2, __gm__ float* __restrict__ output); int main() { aclrtStream stream; uint64_t i = 0; void * input_1 = NULL; void * input_2 = NULL; void * output = NULL; aclrtSetDevice(0); // 创建数据流 aclrtCreateStream(&stream); // 参数初始化 float data_1[NUM] = {0}; float data_2[NUM] = {0}; for (i = 0; i < NUM; i++) { data_1[i] = 1; data_2[i] = 2; } float data_res[NUM] = {0}; // 在Device上分配参数空间 aclrtMalloc((void **)&input_1, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); aclrtMalloc((void **)&input_2, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); aclrtMalloc((void **)&output, NUM*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); // 将Host侧数据拷贝到Device侧 aclrtMemcpyAsync((void *)input_1, sizeof(data_1), data_1, sizeof(data_1), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtMemcpyAsync((void *)input_2, sizeof(data_2), data_2, sizeof(data_2), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtMemcpyAsync((void *)output, sizeof(data_res), data_res, sizeof(data_res), ACL_MEMCPY_HOST_TO_DEVICE, stream); // 启动Device侧核函数 add_kernel<<<1, nullptr, stream>>>((float*)input_1, (float*)input_2, (float*)output); // 获取Device执行结果,并拷贝到Host float *hostMemOut; aclrtMallocHost((void**)&hostMemOut, NUM); aclrtMemcpyAsync(hostMemOut, NUM*sizeof(float), output, NUM*sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST, stream); aclrtSynchronizeStream(stream); //计算golden输出 float golden[NUM] = {0}; for (i = 0; i < NUM; i++) { golden[i] += data_1[i] + data_2[i]; } // 对比结果 for (i = 0; i < NUM; i++) { printf("i%ld\t Expect: %f\t\t\t\tResult: %f\n", i, golden[i], *((float *)hostMemOut + i)); } // 释放数据空间以及数据流 aclrtFreeHost(hostMemOut); aclrtDestroyStream(stream); aclrtResetDevice(0); }
父主题: Vector算子示例