Host侧代码与说明
以vadd算子为例:计算时将两个float类型的tensor逐元素相加,每个tensor包含64个元素。
首先在Host侧给两个入参以及结果创建空间并赋初始值,其次在Device侧创建参数指针并分配空间,再将初始值拷贝到Device中就可以拉起Device侧的核函数,Device侧代码与说明包含了函数的具体实现。
Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比,最后释放空间与数据流即可。
// Host侧文件:main.cce
// 直接通过<<<>>>异构调用语法调用Device侧kernel
#include "acl/acl.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define NUM 64
// 向量初始化函数
// Fill `vector` with `len` pseudo-random floats drawn uniformly from
// [min_value, max_value]. Uses rand(); seed with srand() beforehand for a
// reproducible sequence.
void InitVector(float *vector, uint32_t len, float min_value, float max_value) {
const float span = max_value - min_value;
for (uint32_t idx = 0; idx < len; ++idx) {
const float unit = (float)rand() / (float)RAND_MAX; // in [0, 1]
vector[idx] = min_value + unit * span;
}
}
// 向量相加函数,用于在Host侧计算结果,后续和Device侧对比
// Element-wise sum: result[i] = a[i] + b[i] for i in [0, len).
// Computed on the Host so the Device output can be checked against it later.
void vector_add(float *a, float *b, float *result, uint32_t len) {
float *out_end = result + len;
while (result != out_end) {
*result++ = *a++ + *b++;
}
}
// 判断两个向量是否相等函数,用于验证Device侧结果的正确性
// Return true when every pair of elements differs by at most 1 in absolute
// value. NOTE: the tolerance of 1 is deliberately loose — this sample prints
// results truncated to int, so sub-unit float differences are ignored.
bool are_vectors_equal(float *vec1, float *vec2, uint32_t len) {
uint32_t idx = 0;
while (idx < len) {
if (fabs(vec1[idx] - vec2[idx]) > 1) {
return false;
}
++idx;
}
return true;
}
// Forward declaration of the Device-side kernel (implemented in the Device file).
// NOTE(review): [aicore] appears to mark an AI Core kernel and __gm__ to qualify
// Device global-memory pointers — CCE-specific; confirm against the CCE toolchain docs.
extern "C" __global__ [aicore] void add_kernel(__gm__ uint8_t * __restrict tensor_a, __gm__ uint8_t * __restrict tensor_b, __gm__ uint8_t * __restrict tensor_c);
// Host entry point: prepare inputs, launch add_kernel on the Device, copy the
// result back, and compare it with a CPU-computed reference ("golden") output.
// NOTE(review): this sample never checks the return codes of the acl* calls;
// production code should verify each returns success before continuing.
int main() {
aclrtSetDevice(0);
aclrtStream stream;
// All async operations below are serialized on this single stream.
aclrtCreateStream(&stream);
// Host-side data preparation
float tensor_a[NUM] = {1.0};
float tensor_b[NUM] = {1.0};
float tensor_c[NUM] = {0.0};
InitVector((float *)tensor_a, NUM, -5, 5);
InitVector((float *)tensor_b, NUM, -5, 5);
// tensor_c is randomized and copied to the Device too, even though the kernel
// overwrites it — this only ensures the output buffer starts with defined bytes.
InitVector((float *)tensor_c, NUM, -5, 5);
// Compute the Host-side golden (reference) output
float golden[NUM] = {0.0};
vector_add(tensor_a, tensor_b, golden, NUM);
// Allocate Device memory. aclrtMalloc returns raw, untyped bytes; the Device-side
// kernel must reinterpret these regions with the correct element type (float here).
__gm__ uint8_t *tensor_aGm = nullptr;
__gm__ uint8_t *tensor_bGm = nullptr;
__gm__ void *tensor_aGm_void;
aclrtMalloc(&tensor_aGm_void, sizeof(tensor_a), ACL_MEM_MALLOC_NORMAL_ONLY);
tensor_aGm = reinterpret_cast<__gm__ unsigned char *>(tensor_aGm_void);
__gm__ void *tensor_bGm_void;
aclrtMalloc(&tensor_bGm_void, sizeof(tensor_b), ACL_MEM_MALLOC_NORMAL_ONLY);
tensor_bGm = reinterpret_cast<__gm__ unsigned char *>(tensor_bGm_void);
__gm__ uint8_t *tensor_cGm = nullptr;
__gm__ void *tensor_cGm_void;
aclrtMalloc(&tensor_cGm_void, sizeof(tensor_c), ACL_MEM_MALLOC_NORMAL_ONLY);
tensor_cGm = reinterpret_cast<__gm__ unsigned char *>(tensor_cGm_void);
// Copy Host data to the Device, asynchronously on `stream`.
// NOTE(review): the sources are pageable stack buffers; CANN documentation
// recommends aclrtMallocHost-pinned memory for aclrtMemcpyAsync — confirm this
// usage is safe on the target runtime version.
aclrtMemcpyAsync((void *)tensor_aGm, sizeof(tensor_a), (float *)tensor_a, sizeof(tensor_a), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)tensor_bGm, sizeof(tensor_b), (float *)tensor_b, sizeof(tensor_b), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)tensor_cGm, sizeof(tensor_c), (void *)tensor_c, sizeof(tensor_c), ACL_MEMCPY_HOST_TO_DEVICE, stream);
// Launch the Device-side kernel; enqueued after the copies on the same stream.
// NOTE(review): the <<<...>>> arguments look like (block count = 1, l2ctrl =
// nullptr, stream) per the CCE launch convention — confirm against the docs.
add_kernel<<<1, nullptr, stream>>>(tensor_aGm, tensor_bGm, tensor_cGm);
// Enqueue the Device-to-Host copy of the result.
aclrtMemcpyAsync((void *)tensor_c, sizeof(tensor_c), tensor_cGm,
sizeof(tensor_c), ACL_MEMCPY_DEVICE_TO_HOST, stream);
// Block until every operation queued on the stream (copies + kernel) finishes,
// so tensor_c is safe to read below.
aclrtSynchronizeStream(stream);
// Print the Host-computed golden output and the Device-computed output,
// truncated to int for easier visual comparison.
for (int i = 0; i < NUM; i++) {
printf("%4d", (int)golden[i]);
}
printf("\n");
for (int i = 0; i < NUM; i++) {
printf("%4d", (int)tensor_c[i]);
}
printf("\n");
// Verify the Device result against the golden output (tolerance of 1; see
// are_vectors_equal).
bool equal1 = are_vectors_equal(golden, tensor_c, NUM);
printf("Matrix golden and Matrix tensor_c are %s\n", equal1 ? "equal" : "not equal");
// Release Device memory, the stream, and the device binding.
aclrtFree(tensor_aGm);
aclrtFree(tensor_bGm);
aclrtFree(tensor_cGm);
aclrtDestroyStream(stream);
aclrtResetDevice(0);
}
父主题: Vector算子示例