More Samples

  • Use the contiguous mask mode of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.
    1
    2
    uint64_t mask = 64;  //  Only the first 64 bits are calculated in each iteration.
    AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
    

    Result example:

    Input (src0Local): [1 2 3 ... 512]
    Input (src1Local): [513 514 515 ... 1024]
    Output (dstLocal):
    [514 516 518 ... 640 undefined ... undefined
     770 772 774 ... 896 undefined ... undefined
     1026 1028 1030 ... 1152 undefined ... undefined
     1282 1284 1286 ... 1408 undefined ... undefined]
  • Use the bitwise mask mode of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.
    1
    2
    uint64_t mask[2] = { UINT64_MAX, 0 }; // mask[0] is set to max, mask[1] is set to empty, and only the first 64 bits are calculated each time.
    AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
    
    Result example:
    Input (src0Local): [1 2 3 ... 512]
    Input (src1Local): [513 514 515 ... 1024]
    Output (dstLocal):
    [514 516 518 ... 640 undefined ... undefined
     770 772 774 ... 896 undefined ... undefined
     1026 1028 1030 ... 1152 undefined ... undefined
     1282 1284 1286 ... 1408 undefined ... undefined]
  • Set the repeatStride parameter of a high-dimensional tensor sharding computation API to implement discontinuous data computation.
    1
    2
    3
    4
    5
    uint64_t mask = 128;
    // If repeatTime is set to 2, two iterations are required.
    // Set src0BlkStride and src1BlkStride to 1, indicating that the interval between src0 data addresses involved in calculation in each iteration is one data block.
    // Set src0RepStride to 16, indicating that the interval of the src0 start addresses between adjacent iterations is 16 data blocks.
    AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 1, 1, 8, 16, 8 });
    

    Result example:

    Input (src0Local): [1 2 3 ... 512]
    Input (src1Local): [513 514 515 ... 1024]
    Output (dstLocal):
    [514 516 518 ...768 898 900 902 ... 1150 1152 undefined ... undefined]
  • Set the dataBlockStride and repeatStride parameters of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.
    1
    2
    3
    4
    5
    uint64_t mask = 128;
    // repeatTime is set to 2, indicating that two iterations are required.
    // Set src0BlkStride to 2, indicating that the interval between src0 data addresses involved in calculation in each iteration is two data blocks.
    // Set src0RepStride to 16, indicating that the interval of the src0 start addresses between adjacent iterations is 16 data blocks.
    AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 2, 1, 8, 16, 8 });
    
    Result example:
    Input (src0Local): [1 2 3 ... 512]
    Input (src1Local): [513 514 515 ... 1024]
    Output (dstLocal):
    [514 516 518 ... 544  562 564 566 ... 592  610 612 614 ... 640  658 660 662 ... 688
     706 708 710 ... 736  754 756 758 ... 784  802 804 806 ... 832  850 852 854 ... 880 
     898 900 902 ... 928  946 948 950 ... 976  994 996 998 ... 1024  1042 1044 1046 ... 1072
    1090 1092 1094 ... 1120  1138 1140 1142 ... 1168  1186 1188 1190 ... 1216 1234 1236 1238 … 1264
    undefined ... undefined]
  • The following is an example of using an API that requires scalar parameters.
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    #include "kernel_operator.h"
    constexpr int32_t BUFFER_NUM = 2;
    class KernelBinaryScalar {
    public:
        __aicore__ inline KernelBinaryScalar() {}
        __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, float scalar, uint32_t totalLength, uint32_t tileNum)
        {
            ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
            this->blockLength = totalLength / AscendC::GetBlockNum();
            this->scalar = scalar;
            this->tileNum = tileNum;
            ASSERT(tileNum != 0 && "tile num can not be zero!");
            this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
            xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
            zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
            pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
            pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
        }
        __aicore__ inline void Process()
        {
            int32_t loopCount = this->tileNum * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress)
        {
            AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
            AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
            inQueueX.EnQue(xLocal);
        }
        __aicore__ inline void Compute(int32_t progress)
        {
            AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
            AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
            AscendC::Adds(zLocal, xLocal, (DTYPE_X)scalar, this->tileLength);
            outQueueZ.EnQue<DTYPE_Z>(zLocal);
            inQueueX.FreeTensor(xLocal);
        }
        __aicore__ inline void CopyOut(int32_t progress)
        {
            AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
            AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
            outQueueZ.FreeTensor(zLocal);
        }
    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
        AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
        AscendC::GlobalTensor<DTYPE_X> xGm;
        AscendC::GlobalTensor<DTYPE_Z> zGm;
        float scalar;
        uint32_t blockLength;
        uint32_t tileNum;
        uint32_t tileLength;
    };
    extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
    {
        GET_TILING_DATA(tilingData, tiling);
        KernelBinaryScalar op;
        op.Init(x, z, tilingData.scalar, tilingData.totalLength, tilingData.tileNum);
        if (TILING_KEY_IS(1)) {
            op.Process();
        }
    }