Host侧代码与说明
本章使用的MIX算子,在Cube侧的实现与CUBE算子示例中的例子完全相同,在Vector侧会对矩阵乘的结果进行ReLU操作。
// host侧文件:main.cce // 直接通过<<<>>>异构调用语法调用device侧kernel #include <stdio.h> #include <random> #include "acl/acl.h" #include "runtime/rt.h" #define M 32 #define K 16 #define N 32 #define BLOCKDIM 2 extern "C" __global__ [aicore] void mat_mul_relu_mix_kernel(__gm__ uint16_t* __restrict__ ffts_addr, __gm__ float* __restrict__ tensor_a, __gm__ float* __restrict__ tensor_b, __gm__ float* __restrict__ tensor_c_gm); int main() { aclrtStream stream; uint64_t i, j, k; void * input_a = NULL; void * input_b = NULL; void * output_c = NULL; aclrtSetDevice(0); // 创建数据流 aclrtCreateStream(&stream); // 参数初始化 float a_data[M][K] = {0}; float b_data[K][N] = {0}; float rangeMin = -50.0f; // 范围最小值 float rangeMax = 50.0f; // 范围最大值 for (i = 0; i < M; i++) { for (k = 0; k < K; k++) { a_data[i][k] = rangeMin + static_cast<float>(std::rand()) / RAND_MAX * (rangeMax - rangeMin); } } for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { b_data[k][j] = 2; } } float c_data[M][N] = {0}; // 在Device上分配参数空间 aclrtMalloc((void **)&input_a , M*K*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); aclrtMalloc((void **)&input_b , K*N*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); aclrtMalloc((void **)&output_c , M*N*sizeof(float), ACL_MEM_MALLOC_NORMAL_ONLY); uint64_t ffts{0}; uint32_t fftsLen{0}; rtGetC2cCtrlAddr(&ffts, &fftsLen); // 将Host侧数据拷贝到Device侧 aclrtMemcpyAsync((void *)input_a, sizeof(a_data), a_data, sizeof(a_data), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtMemcpyAsync((void *)input_b, sizeof(b_data), b_data, sizeof(b_data), ACL_MEMCPY_HOST_TO_DEVICE, stream); aclrtMemcpyAsync((void *)output_c, sizeof(c_data), c_data, sizeof(c_data), ACL_MEMCPY_HOST_TO_DEVICE, stream); // 启动Device侧核函数 mat_mul_relu_mix_kernel<<<BLOCKDIM, nullptr, stream>>>((uint16_t*)ffts, (float*)input_a, (float*)input_b, (float*)output_c); // 获取Device执行结果,并拷贝到Host float *hostMemOut; aclrtMallocHost((void**)&hostMemOut, M*N); aclrtMemcpyAsync(hostMemOut, M*N*sizeof(float), output_c, M*N*sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST, stream); 
aclrtSynchronizeStream(stream); // 计算golden输出 float golden[M][N] = {0}; for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { golden[i][j] += a_data[i][k] * b_data[k][j]; } } } for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { golden[i][j] = golden[i][j] < 0 ? 0 : golden[i][j]; } } // 对比结果 for ( i = 0; i < M; i++) { for (j = 0; j < N; j++) { printf("i%ld\t Expect: %f\t\t\t\tResult: %f\n", i*N + j, golden[i][j], *((float *)hostMemOut + i*N + j)); } } // 释放数据空间以及数据流 aclrtFreeHost(hostMemOut); aclrtDestroyStream(stream); aclrtResetDevice(0); }
父主题: MIX算子示例