Host侧代码与说明

考虑以下算子计算场景：A矩阵大小为32*32， B矩阵大小为32*32，数据类型为float。
整体流程如下：首先在Host侧给参数创建空间并赋初始值，其次在Device侧创建参数指针并分配空间，再将初始值拷贝到Device中就可以调用Device侧的核函数，Device侧代码与说明包含了函数的具体实现。
Device执行结束后可以将结果从Device侧拷贝到Host侧并与CPU执行结果相对比，最后释放空间与数据流即可。
// host侧文件：main.cce
// 直接通过<<<>>>异构调用语法调用device侧kernel
#include "acl/acl.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define M 32
#define K 32
#define N 32 

// 矩阵初始化函数
void InitMatrix(float *matrix, uint32_t m, uint32_t n, float min_value, float max_value) {
    float range = max_value - min_value;
    
    for (uint32_t i = 0; i < m; i++) {
        for (uint32_t j = 0; j < n; j++) {
            float random_value = (float)rand() / (float)RAND_MAX;
            matrix[i * n + j] = min_value + random_value * range;
        }
    }
}

// 矩阵相乘函数，用于在Host侧计算结果，后续和Device侧对比
void matrix_multiply(float a[M][K], float b[K][N], float result[M][N]) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            result[i][j] = 0;
            for (int k = 0; k < K; k++) {
                result[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

// 判断两个矩阵是否相等函数，用于验证Device侧结果的正确性
bool are_matrices_equal(float mat1[M][N], float mat2[M][N]) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            if (fabs(mat1[i][j] - mat2[i][j]) > 1.0f) {
                return false; 
            }
        }
    }
    return true; 
}

// 分形转换函数，用于将ND格式矩阵转为ZZ格式的矩阵
void ConvertRowMajorToZZ(float *input, float *output, int m, int n) {
    const int BLOCK_HEIGHT = 16;
    const int BLOCK_WIDTH = 8;
    
    int block_rows = (m + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT;
    int block_cols = (n + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
    int out_idx = 0;
    for (int block_i = 0; block_i < block_rows; block_i++) {
        for (int block_j = 0; block_j < block_cols; block_j++) {
            int start_row = block_i * BLOCK_HEIGHT;
            int start_col = block_j * BLOCK_WIDTH;
            
            for (int r = 0; r < BLOCK_HEIGHT; r++) {
                int actual_row = start_row + r;
                if (actual_row >= m) {
                    continue;
                }
                
                for (int c = 0; c < BLOCK_WIDTH; c++) {
                    int actual_col = start_col + c;
                    if (actual_col >= n) {
                        continue;
                    }
                    
                    int in_idx = actual_row * n + actual_col;
                    output[out_idx++] = input[in_idx];
                }
            }
        }
    }
}

// 分形转换函数，用于将ND格式矩阵转为ZN格式的矩阵
void ConvertRowMajorToZN(float *input, float *output, int m, int n) {
    const int BLOCK_HEIGHT = 8;
    const int BLOCK_WIDTH = 16;
                                                                      
    int block_rows = (m + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT;
    int block_cols = (n + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
    int out_idx = 0;
    for (int block_i = 0; block_i < block_rows; block_i++) {
        for (int block_j = 0; block_j < block_cols; block_j++) {
            int start_row = block_i * BLOCK_HEIGHT;
            int start_col = block_j * BLOCK_WIDTH;
            
            for (int c = 0; c < BLOCK_WIDTH; c++) {
                int actual_col = start_col + c;
                if (actual_col >= n) break;
                
                for (int r = 0; r < BLOCK_HEIGHT; r++) {
                    int actual_row = start_row + r;
                    if (actual_row >= m) break;
                    
                    int in_idx = actual_row * n + actual_col;
                    output[out_idx++] = input[in_idx];
                }
            }
        }
    }
}

extern "C" __global__ [aicore] void mat_mul_kernel(__gm__ uint8_t * __restrict tensor_a, __gm__ uint8_t * __restrict tensor_b, __gm__ uint8_t * __restrict tensor_c, int m, int k, int n);

int main() {
    aclrtSetDevice(0);
    aclrtStream stream;
    aclrtCreateStream(&stream);

    // Host侧数据处理
    float tensor_a[M][K] = {1.0};
    float tensor_a_convert[M][K] = {1.0};
    float tensor_b[K][N] = {1.0};
    float tensor_b_convert[K][N] = {1.0};
    float tensor_c[M][N] = {0.0};
    InitMatrix((float *)tensor_a, M, K, -5, 5);
    InitMatrix((float *)tensor_b, M, K, -5, 5);
    InitMatrix((float *)tensor_c, M, N, -5, 5);

    // 计算Host侧的golden输出
    float golden[M][N] = {0.0};
    matrix_multiply(tensor_a, tensor_b, golden);

    // 分形转换，将ND数据分别转换为ZZ和ZN格式，以分别适配L0A和L0B的要求
    ConvertRowMajorToZZ((float *)tensor_a, (float *)tensor_a_convert, M, K);
    ConvertRowMajorToZN((float *)tensor_b, (float *)tensor_b_convert, K, N);

    // 在Device上分配内存，aclrtMalloc分配的是原始字节内存，不包含任何类型信息，在Device侧核函数中需要以正确类型解释这些内存区域
    __gm__ uint8_t *tensor_aGm = nullptr;
    __gm__ uint8_t *tensor_bGm = nullptr;
    __gm__ void *tensor_aGm_void;
    aclrtMalloc(&tensor_aGm_void, sizeof(tensor_a), ACL_MEM_MALLOC_NORMAL_ONLY);
    tensor_aGm = reinterpret_cast<__gm__ unsigned char *>(tensor_aGm_void);
    __gm__ void *tensor_bGm_void;
    aclrtMalloc(&tensor_bGm_void, sizeof(tensor_b), ACL_MEM_MALLOC_NORMAL_ONLY);
    tensor_bGm = reinterpret_cast<__gm__ unsigned char *>(tensor_bGm_void);
    __gm__ uint8_t *tensor_cGm = nullptr;
    __gm__ void *tensor_cGm_void;
    aclrtMalloc(&tensor_cGm_void, sizeof(tensor_c), ACL_MEM_MALLOC_NORMAL_ONLY);
    tensor_cGm = reinterpret_cast<__gm__ unsigned char *>(tensor_cGm_void);

    // 将Host侧数据拷贝到Device侧
    aclrtMemcpyAsync((void *)tensor_aGm, sizeof(tensor_a_convert), (float *)tensor_a_convert, sizeof(tensor_a_convert), ACL_MEMCPY_HOST_TO_DEVICE, stream);
    aclrtMemcpyAsync((void *)tensor_bGm, sizeof(tensor_b_convert), (float *)tensor_b_convert, sizeof(tensor_b_convert), ACL_MEMCPY_HOST_TO_DEVICE, stream);
    aclrtMemcpyAsync((void *)tensor_cGm, sizeof(tensor_c), (void *)tensor_c, sizeof(tensor_c), ACL_MEMCPY_HOST_TO_DEVICE, stream);

    // 启动Device侧核函数
    mat_mul_kernel<<<1, nullptr, stream>>>(tensor_aGm, tensor_bGm, tensor_cGm, M, K, N);

    // 获取Device执行结果，并拷贝到Host
    aclrtMemcpyAsync((void *)tensor_c, sizeof(tensor_c), tensor_cGm,
                     sizeof(tensor_c), ACL_MEMCPY_DEVICE_TO_HOST, stream);
    aclrtSynchronizeStream(stream);

    // 打印Host侧计算的golden输出和Device侧计算的输出，将float转换为int类型便于观察
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            printf("%6d", (int)golden[i][j]); 
            if (j % N == N-1) {
                printf("\n");
            }
        }
    }
    printf("\n");
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            printf("%6d", (int)tensor_c[i][j]); 
            if (j % N == N-1) {
                printf("\n");
            }
        }
    }
    printf("\n");

    // 判断Device侧的计算结果是否正确
    bool equal1 = are_matrices_equal(golden, tensor_c);
    printf("Matrix golden and Matrix tensor_c are %s\n", equal1 ? "equal" : "not equal");

    // 释放数据空间以及数据流
    aclrtFree(tensor_aGm);
    aclrtFree(tensor_bGm);
    aclrtFree(tensor_cGm);
    aclrtDestroyStream(stream);
    aclrtResetDevice(0);
}
父主题： 示例1