Host侧代码与说明
考虑以下算子计算场景:A矩阵大小为32(M) * 16(K), B矩阵大小为16(K) * 32(N),将A矩阵分在两个核上做矩阵乘运算,每个核上的A的切分大小为16 * 16,用同样的B矩阵运算。
整体流程与CUDA类似,首先在Host侧给参数创建空间并赋初始值,其次在Device侧创建参数指针并分配空间,再将初始值拷贝到Device中就可以拉起Device侧的核函数,Device侧代码与说明包含了函数的具体实现。
Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比,最后释放空间与数据流即可。
// host侧文件:main.cce
// 直接通过<<<>>>异构调用语法调用device侧kernel
#include "acl/acl.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define M 32
#define K 16
#define N 32
#define BLOCKDIM 2
extern "C" __global__ [aicore] void mat_mul_kernel(__gm__ float* __restrict__ tensor_a, __gm__ float* __restrict__ tensor_b, __gm__ float* __restrict__ tensor_c_gm);
int main()
{
aclrtStream stream;
uint64_t i, j, k;
void * input_a = NULL;
void * input_b = NULL;
void * output_c = NULL;
aclrtSetDevice(0);
// 创建数据流
aclrtCreateStream(&stream);
// 参数初始化
float a_data[M][K] = {0};
float b_data[K][N] = {0};
for (i = 0; i < M; i++) {
for (k = 0; k < K; k++) {
a_data[i][k] = 1;
}
}
for (j = 0; j < N; j++) {
for (k = 0; k < K; k++) {
b_data[k][j] = 2;
}
}
float c_data[M][N] = {0};
// 在Device上分配参数空间
aclrtMalloc((void **)&input_a , M*K*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
aclrtMalloc((void **)&input_b , K*N*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
aclrtMalloc((void **)&output_c , M*N*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY);
// 将Host侧数据拷贝到Device侧
aclrtMemcpyAsync((void *)input_a, sizeof(a_data), a_data, sizeof(a_data), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)input_b, sizeof(b_data), b_data, sizeof(b_data), ACL_MEMCPY_HOST_TO_DEVICE, stream);
aclrtMemcpyAsync((void *)output_c, sizeof(c_data), c_data, sizeof(c_data), ACL_MEMCPY_HOST_TO_DEVICE, stream);
// 启动Device侧核函数
mat_mul_kernel<<<BLOCKDIM, nullptr, stream>>>((float*)input_a, (float*)input_b, (float*)output_c);
// 获取Device执行结果,并拷贝到Host
float *hostMemOut;
aclrtMallocHost((void**)&hostMemOut, M*N);
aclrtMemcpyAsync(hostMemOut, M*N*sizeof(float), output_c, M*N*sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST, stream);
aclrtSynchronizeStream(stream);
//计算golden输出
float golden[M][N] = {0};
for (i = 0; i < M; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < K; k++) {
golden[i][j] += a_data[i][k] * b_data[k][j];
}
}
}
// 对比结果
for ( i = 0; i < M; i++) {
for (j = 0; j < N; j++) {
printf("i%ld\t Expect: %f\t\t\t\tResult: %f\n",
i*N + j, golden[i][j], *((float *)hostMemOut + i*N + j));
}
}
// 释放数据空间以及数据流
aclrtFreeHost(hostMemOut);
aclrtDestroyStream(stream);
aclrtResetDevice(0);
}
父主题: CUBE算子示例