Brcb

Function Usage

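In each iteration, reads eight consecutive elements from the input tensor and broadcasts each of them across its own data block (32 bytes per block) in the result tensor, so that eight data blocks are filled per iteration. Each source element corresponds to one data block.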

Prototype

template <typename T>
__aicore__ inline void Brcb(const LocalTensor<T>& dstLocal, const LocalTensor<T>& src0Local, const uint8_t repeatTimes, const BrcbRepeatParams& repeatParams)

Parameters

**Table 1** Template parameters
Parameter	Description
T	Operand data type.

**Table 2** Parameters
Parameter	Input/Output	Meaning
dstLocal	Output	Destination operand. The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT. The start address of the LocalTensor must be 32-byte aligned.
src0Local	Input	Source operand, which stores the source elements consecutively. The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT. The start address of the LocalTensor must be 32-byte aligned. The data type is the same as that of dstLocal.
repeatTimes	Input	Number of instruction iterations. Eight data blocks are collected in each iteration. Data range: repeatTimes ∈ [0, 255]
repeatParams	Input	Instruction iteration parameter of the BrcbRepeatParams type. For details about the parameter description, see Table 3.

**Table 3** Parameters in the **BrcbRepeatParams** structure
Parameter	Input/Output	Meaning
dstBlkStride	Input	Address stride of the vector destination operand between different data blocks in a single iteration. Note: When dstBlkStride is set to 0, the value 1 is used by default.
dstRepStride	Input	Address stride of the vector destination operand for the same data block between adjacent iterations

Availability

Example

Example of calling Brcb with the uint16_t data type

#include "kernel_operator.h"
// Sample kernel class demonstrating Brcb on uint16_t data: 16 source elements
// are read from global memory, each one is broadcast across a 32-byte data
// block (16 uint16_t values), and the resulting 256 elements are written back.
class VbrcbCase {
public:
    __aicore__ inline VbrcbCase()
    {}

    // Binds the global-memory input (x) and output (y) and reserves one
    // buffer for each queue.
    __aicore__ inline void Init(__gm__ uint8_t *x, __gm__ uint8_t *y)
    {
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint16_t *>(x));
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint16_t *>(y));
        tpipe.InitBuffer(vecIn, 1, SRC_ELEMENTS * sizeof(uint16_t));
        tpipe.InitBuffer(vecOut, 1, DST_ELEMENTS * sizeof(uint16_t));
    }

    // Runs the three stages in order: load, broadcast, store.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

    // Stage 1: copies the source elements from global memory into a local
    // tensor and hands it to the input queue.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<uint16_t> srcLocal = vecIn.AllocTensor<uint16_t>();
        AscendC::DataCopy(srcLocal, srcGlobal, SRC_ELEMENTS);
        vecIn.EnQue(srcLocal);
    }

    // Stage 2: broadcasts every source element across one data block.
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<uint16_t> srcLocal = vecIn.DeQue<uint16_t>();
        AscendC::LocalTensor<uint16_t> dstLocal = vecOut.AllocTensor<uint16_t>();
        // Each iteration consumes 8 source elements and fills 8 data blocks,
        // so 2 iterations cover all 16 inputs. The repeat parameters are
        // {dstBlkStride, dstRepStride}.
        AscendC::Brcb(dstLocal, srcLocal, REPEAT_TIMES, {DST_BLK_STRIDE, DST_REP_STRIDE});
        vecOut.EnQue(dstLocal);
        vecIn.FreeTensor(srcLocal);
    }

    // Stage 3: copies the broadcast result from the output queue back to
    // global memory.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<uint16_t> dstLocal = vecOut.DeQue<uint16_t>();
        AscendC::DataCopy(dstGlobal, dstLocal, DST_ELEMENTS);
        vecOut.FreeTensor(dstLocal);
    }

private:
    static constexpr uint32_t SRC_ELEMENTS = 16;    // number of source elements
    static constexpr uint32_t DST_ELEMENTS = 256;   // 16 data blocks * 16 uint16_t per block
    static constexpr uint8_t REPEAT_TIMES = 2;      // 8 source elements per iteration
    static constexpr uint16_t DST_BLK_STRIDE = 1;   // dstBlkStride passed to Brcb
    static constexpr uint16_t DST_REP_STRIDE = 8;   // dstRepStride passed to Brcb

    AscendC::GlobalTensor<uint16_t> srcGlobal;
    AscendC::GlobalTensor<uint16_t> dstGlobal;
    AscendC::TPipe tpipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> vecIn;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> vecOut;
};
// Kernel entry point: x is the global-memory input and y the global-memory
// output; both are reinterpreted as uint16_t data by VbrcbCase::Init.
extern "C" __global__ __aicore__ void vbrcb_uint16_t_16(__gm__ uint8_t *x, __gm__ uint8_t *y)
{
    VbrcbCase brcbCase;
    brcbCase.Init(x, y);
    brcbCase.Process();
}

Result example:

Input (srcGlobal): [1 2 3... 16]
Output (dstGlobal): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ... 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16]

Parent topic: Data Padding