More Samples

Template Sample

This section provides a template sample of two-operand scalar instructions to help you quickly run reference samples in specific instructions.

You can use the following template sample as the code framework and only need to copy the sample snippet in specific instructions to replace the content in bold.

#include "kernel_operator.h"
class KernelBinaryScalar {
public:
    __aicore__ inline KernelBinaryScalar() {}
    __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src);
        dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm);
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<int16_t> srcLocal = inQueueSrc.AllocTensor<int16_t>();
        AscendC::DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<int16_t> srcLocal = inQueueSrc.DeQue<int16_t>();
        AscendC::LocalTensor<int16_t> dstLocal = outQueueDst.AllocTensor<int16_t>();

        uint64_t mask = 128;
        int16_t scalar = 2;
        // repeatTimes = 4, 128 elements one repeat, 512 elements total
        // dstBlkStride, srcBlkStride = 1, no gap between blocks in one repeat
        // dstRepStride, srcRepStride =8, no gap between repeats
        AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8});
        
        outQueueDst.EnQue<int16_t>(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<int16_t> dstLocal = outQueueDst.DeQue<int16_t>();
        AscendC::DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueSrc;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueDst;
    AscendC::GlobalTensor<int16_t> srcGlobal, dstGlobal;
};
extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
{
    KernelBinaryScalar op;
    op.Init(src, dstGm);
    op.Process();
}

More Samples

You can refer to the following examples to learn how to use the high-dimensional tensor sharding computation APIs of the two-operand scalar instructions to perform more flexible operations and implement more advanced functions.

If you need to run the sample code, copy the code snippet and replace the code in bold of the Compute function in the template.

Use the contiguous mask mode of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.

uint64_t mask = 64;  // Only the first 64 bits are calculated in each iteration.
uint16_t scalar = 2;
AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, { 1, 1, 8, 8 });

Result example:

Input (srcLocal): [1 2 3 ... 512]
Input (scalar): 2
Output (dstLocal):
[3 4 5 ... 66 undefined ... undefined
 131 132 133 ... 194 undefined ... undefined
 259 260 261 ... 322 undefined ... undefined
 387 388 389 ... 450 undefined ... undefined]

Use the bitwise mask mode of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.

uint64_t mask[2] = { UINT64_MAX, 0 }; // mask[0] is set to max, mask[1] is set to empty, and only the first 64 bits are calculated each time.
int16_t scalar = 2;
AscendC::Adds(dstLocal, src0Local, scalar, mask, 4, { 1, 1, 8, 8 });

Result example:

Input (srcLocal): [1 2 3 ... 512]
Input (scalar): 2
Output (dstLocal):
[3 4 5 ... 66 undefined ... undefined
 131 132 133 ... 194 undefined ... undefined
 259 260 261 ... 322 undefined ... undefined
 387 388 389 ... 450 undefined ... undefined]

Set the repeatStride parameter of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.

uint64_t mask = 128;
int16_t scalar = 2;
// Set repeatTimes to 2, indicating that two iterations are required.
// Set dstBlkStride/srcBlkStride to 1, indicating that the interval between the dst/src data addresses involved in computation in each iteration is one data block.
// Set dstRepStride to 8, indicating that the interval between the dst start addresses of adjacent iterations is eight data blocks.
// Set srcRepStride to 16, indicating that the interval between the src start addresses of adjacent iterations is 16 blocks.
AscendC::Adds(dstLocal, srcLocal, scalar, mask, 2, { 1, 1, 8, 16 });

Result example:

Input (srcLocal): [1 2 3 ... 512]
Input (scalar): 2
Output (dstLocal):
[3 4 5 ...130 259 260 261 ... 386  undefined ... undefined]

Set the dataBlockStride and repeatStride parameters of a high-dimensional tensor sharding computation API to implement discontinuous data calculation.

uint64_t mask = 128;
int16_t scalar = 2;
// Set repeatTimes to 2, indicating that two iterations are required.
// Set dstBlkStride to 2, indicating that the interval between the dst data addresses involved in computation in each iteration is two data blocks.
// Set srcBlkStride to 1, indicating that the interval between the src data addresses involved in computation in each iteration is one data block.
// Set dstRepStride to 16, indicating that the interval between the dst start addresses of adjacent iterations is 16 data blocks.
// Set srcRepStride to 8, indicating that the interval between the src start addresses of adjacent iterations is eight data blocks.
AscendC::Adds(dstLocal, srcLocal, scalar, mask, 2, { 2, 1, 16, 8 });

Result example:

Input (srcLocal): [1 2 3 ... 512]
Input (scalar): 2
Output (dstLocal):
[3 4 5 ... 18  undefined ... undefined 19 20 21 ... 34  undefined ... undefined 
35 36 37 ... 50 undefined ... undefined 51 52 53 ... 66 undefined ... undefined 
67 68 69 ... 82 undefined ... undefined 83 84 85 ... 98 undefined ... undefined  
99 100 101 ... 114 undefined ... undefined 115 116 117 ... 130 undefined ... undefined 
131 132 133 ... 146 undefined ... undefined 147 148 149 ... 162 undefined ... undefined 
163 164 165 ... 178 undefined ... undefined 179 180 181 ... 194 undefined ... undefined
195 196 197 ... 210 undefined ... undefined 211 212 213 ... 226 undefined ... undefined
227 228 229 ... 242 undefined ... undefined 243 244 245 ... 258 undefined ... undefined] (undefined contains 16 elements each time.)

When the scalar parameter used by the two-operand scalar instruction needs to be input from outside the kernel function, the kernel function can be modified as follows:

#include "kernel_operator.h"
constexpr int32_t BUFFER_NUM = 2;
class KernelBinaryScalar {
public:
    __aicore__ inline KernelBinaryScalar() {}
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, float scalar, uint32_t totalLength, uint32_t tileNum)
    {
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->scalar = scalar;
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
    }
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }
private:
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
        AscendC::Adds(zLocal, xLocal, (DTYPE_X)scalar, this->tileLength);
        outQueueZ.EnQue<DTYPE_Z>(zLocal);
        inQueueX.FreeTensor(xLocal);
    }
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
    AscendC::GlobalTensor<DTYPE_X> xGm;
    AscendC::GlobalTensor<DTYPE_Z> zGm;
    float scalar;
    uint32_t blockLength;
    uint32_t tileNum;
    uint32_t tileLength;
};
extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelBinaryScalar op;
    op.Init(x, z, tilingData.scalar, tilingData.totalLength, tilingData.tileNum);
    if (TILING_KEY_IS(1)) {
        op.Process();
    }
}

Parent topic: Two-Operand Scalar Instructions