Load2D

Product Support

Product	Supported
Atlas A3 training products/Atlas A3 inference products	√
Atlas A2 training products/Atlas A2 inference products	√
Atlas 200I/500 A2 inference products	√
Atlas inference product's AI Core	√
Atlas inference product's Vector Core	x
Atlas training products	√

Function

Supports data transfer over the following paths:

GM->A1; GM->B1; GM->A2; GM->B2;

A1 -> A2; B1 -> B2.

Prototype

Load2D API

template <typename T>
__aicore__ inline void LoadData(const LocalTensor<T>& dst, const LocalTensor<T>& src, const LoadData2DParams& loadDataParams)
template <typename T> 
__aicore__ inline void LoadData(const LocalTensor<T>& dst, const GlobalTensor<T>& src, const LoadData2DParams& loadDataParams)

Parameters

**Table 1** Template parameters
Parameter	Description
T	Data types of the source and destination operands. Load2D API: For Atlas training products, the supported data types are uint8_t, int8_t, uint16_t, int16_t, and half. For the Atlas inference product's AI Core, the supported data types are uint8_t, int8_t, uint16_t, int16_t, and half. For Atlas A2 training products/Atlas A2 inference products, the supported data types are uint8_t, int8_t, uint16_t, int16_t, half, bfloat16_t, uint32_t, int32_t, and float. For Atlas A3 training products/Atlas A3 inference products, the supported data types are uint8_t, int8_t, uint16_t, int16_t, half, bfloat16_t, uint32_t, int32_t, and float. For the Atlas 200I/500 A2 inference products, the supported data types are uint8_t, int8_t, uint16_t, int16_t, half, bfloat16_t, uint32_t, int32_t, and float.

**Table 2** Common parameters
Parameter	Input/Output	Description
dst	Output	Destination operand, which is of the LocalTensor type. The sequential arrangement of data is determined by TPosition of the destination operand. The constraints are as follows: A2: ZZ format, with a tile size of 16 × (32B/sizeof(T)). B2: ZN format, with a tile size of (32B/sizeof(T)) × 16. A1/B1: No format restriction. Generally, the format is NZ with a tile size of 16 × (32B/sizeof(T)).
src	Input	Source operand, which is of the LocalTensor or GlobalTensor type. Its data type must be the same as that of dst.
loadDataParams	Input	LoadData parameter structure. Supported types are as follows: LoadData2DParams. For details, see Table 3. For details about the definition of the preceding structure parameters, see ${INSTALL_DIR}/include/ascendc/basic_api/interface/kernel_struct_mm.h. Replace *${INSTALL_DIR}* with the CANN installation path.

**Table 3** Parameters in the LoadData2DParams structure
Parameter	Description
startIndex	Tile matrix ID. It indicates the starting tile of the source operand for data transfer, where 0 represents the first tile matrix. Value range: startIndex ∈ [0, 65535]. Unit: 512 bytes. Default value: 0.
repeatTimes	Number of iterations. 512-byte data can be processed in each iteration. Value range: repeatTimes ∈ [1, 255].
srcStride	Interval between the start addresses of consecutive tiles of the source operand across adjacent iterations (unit: 512 bytes). Value range: srcStride ∈ [0, 65535]. Default value: 0.
sid	Reserved parameter. Set it to 0.
dstGap	Interval between the end address of the previous tile and the start address of the next tile of the destination operand across adjacent iterations (unit: 512 bytes). Value range: dstGap ∈ [0, 65535]. Default value: 0. Note: This parameter is disabled for Atlas training products.
ifTranspose	Whether to enable the transpose function for each tile matrix. The default value is false. true: enabled false: disabled Note: The transpose function can be enabled only for the A1->A2 and B1->B2 data paths. When the transpose function is enabled, the source and destination operands support only the uint16_t, int16_t, and half types.
addrMode	Reserved parameter. Set it to 0.

Restrictions

For details about the operand address alignment requirements, see General Address Alignment Restrictions.

Returns

None

Example

The example supports the Atlas inference product's AI Core platform.

#include "kernel_operator.h"

class KernelLoadData {
public:
    __aicore__ inline KernelLoadData()
    {
        coutBlocks = (Cout + 16 - 1) / 16;
        ho = (H + padTop + padBottom - dilationH * (Kh - 1) - 1) / strideH + 1;
        wo = (W + padLeft + padRight - dilationW * (Kw - 1) - 1) / strideW + 1;
        howo = ho * wo;
        howoRound = ((howo + 16 - 1) / 16) * 16;
        featureMapA1Size = C1 * H * W * C0;      // shape: [C1, H, W, C0]
        weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0]
        featureMapA2Size = howoRound * (C1 * Kh * Kw * C0);
        weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16;
        m = howo;
        k = C1 * Kh * Kw * C0;
        n = Cout;
        dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16]
        dstCO1Size = coutBlocks * howoRound * 16;
        fmRepeat = featureMapA2Size / (16 * C0);
        weRepeat = weightB2Size / (16 * C0);
    }
    __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* dstGm)
    {
        fmGlobal.SetGlobalBuffer((__gm__ half*)fmGm);
        weGlobal.SetGlobalBuffer((__gm__ half*)weGm);
        dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
        pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(half));
        pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(half));
        pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(half));
        pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(half));
        pipe.InitBuffer(outQueue
, 1, dstCO1Size * sizeof(float));
        pipe.InitBuffer(outQueueUB, 1, dstSize * sizeof(half));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Split();
        Compute();
        CopyUB();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<half> featureMapA1 = inQueueFmA1.AllocTensor<half>();
        AscendC::LocalTensor<half> weightB1 = inQueueWeB1.AllocTensor<half>();
        AscendC::DataCopy(featureMapA1, fmGlobal, { 1, static_cast<uint16_t>(featureMapA1Size * sizeof(half) / 32), 0, 0 });
        AscendC::DataCopy(weightB1, weGlobal, { 1, static_cast<uint16_t>(weightA1Size * sizeof(half) / 32), 0, 0 });
        inQueueFmA1.EnQue(featureMapA1);
        inQueueWeB1.EnQue(weightB1);
    }
    __aicore__ inline void Split()
    {
        AscendC::LocalTensor<half> featureMapA1 = inQueueFmA1.DeQue<half>();
        AscendC::LocalTensor<half> weightB1 = inQueueWeB1.DeQue<half>();
        AscendC::LocalTensor<half> featureMapA2 = inQueueFmA2.AllocTensor<half>();
        AscendC::LocalTensor<half> weightB2 = inQueueWeB2.AllocTensor<half>();
        uint8_t padList[4] = {padLeft, padRight, padTop, padBottom};
        AscendC::LoadData(featureMapA2, featureMapA1,
            { padList, H, W, 0, 0, 0, -1, -1, strideW, strideH, Kw, Kh, dilationW, dilationH, 1, 0, fmRepeat, 0, (half)(0)});
        AscendC::LoadData(weightB2, weightB1, { 0, weRepeat, 1, 0, 0, false, 0 });
        inQueueFmA2.EnQue<half>(featureMapA2);
        inQueueWeB2.EnQue<half>(weightB2);
        inQueueFmA1.FreeTensor(featureMapA1);
        inQueueWeB1.FreeTensor(weightB1);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<half> featureMapA2 = inQueueFmA2.DeQue<half>();
        AscendC::LocalTensor<half> weightB2 = inQueueWeB2.DeQue<half>();
        AscendC::LocalTensor<float> dstCO1 = outQueueCO1.AllocTensor<float>();
        AscendC::Mmad(dstCO1, featureMapA2, weightB2, { m, n, k, 0, false, true });
        outQueueCO1.EnQue<float>(dstCO1);
        inQueueFmA2.FreeTensor(featureMapA2);
        inQueueWeB2.FreeTensor(weightB2);
    }
    __aicore__ inline void CopyUB()
    {
        AscendC::LocalTensor<float> dstCO1 = outQueueCO1.DeQue<float>();
        AscendC::LocalTensor<half> dstUB = outQueueUB.AllocTensor<half>();
        AscendC::DataCopyParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = m * n * sizeof(float) / 1024;
        AscendC::DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX;
        AscendC::DataCopy(dstUB, dstCO1, dataCopyParams, enhancedParams);
        outQueueUB.EnQue<half>(dstUB);
        outQueueCO1.FreeTensor(dstCO1);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<half> dstUB = outQueueUB.DeQue<half>();
        AscendC::DataCopy(dstGlobal, dstUB, m * n);
        outQueueUB.FreeTensor(dstUB);
    }

private:
    AscendC::TPipe pipe;
    // feature map queue
    AscendC::TQue<AscendC::TPosition::A1, 1> inQueueFmA1;
    AscendC::TQue<AscendC::TPosition::A2, 1> inQueueFmA2;
    // weight queue
    AscendC::TQue<AscendC::TPosition::B1, 1> inQueueWeB1;
    AscendC::TQue<AscendC::TPosition::B2, 1> inQueueWeB2;
    // dst queue
    AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1;
    AscendC::TQue<AscendC::TPosition::CO2, 1> outQueueUB;
    AscendC::GlobalTensor<half> fmGlobal, weGlobal, dstGlobal;
    uint16_t C1 = 2;
    uint16_t H = 4, W = 4;
    uint8_t Kh = 2, Kw = 2;
    uint16_t Cout = 16;
    uint16_t C0 = 16;
    uint8_t dilationH = 2, dilationW = 2;
    uint8_t padTop = 1, padBottom = 1, padLeft = 1, padRight = 1;
    uint8_t strideH = 1, strideW = 1;
    uint16_t coutBlocks, ho, wo, howo, howoRound;
    uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, dstSize, dstCO1Size;
    uint16_t m, k, n;
    uint8_t fmRepeat, weRepeat;
};

extern "C" __global__ __aicore__ void load_data_simple_kernel(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm,
    __gm__ uint8_t* dstGm)
{
    KernelLoadData op;
    op.Init(fmGm, weGm, dstGm);
    op.Process();
}

Parent topic: LoadData