TensorTrait
Function Usage
GlobalTensor and LocalTensor use a member variable of type ShapeInfo to store shape information, which can be set and queried with SetShapeInfo and GetShapeInfo. These members are generally used to store and pass shape information inside an operator. If an operator never calls these ShapeInfo interfaces, the shape information is unnecessary; in that case, you can use TensorTrait to define GlobalTensor and LocalTensor objects without ShapeInfo, reducing memory usage and improving runtime performance.
Prototype
```cpp
template <typename T>
struct TensorTrait {
    using LiteType = T;
};
```
Parameters
| Parameter | Description |
|---|---|
| T | Only the following basic data types are supported: int4b_t, uint8_t, int8_t, int16_t, uint16_t, bfloat16_t, int32_t, uint32_t, int64_t, uint64_t, float, and half. Inside the TensorTrait struct, the `using` keyword defines a type alias LiteType that is identical to the template parameter T. A LocalTensor/GlobalTensor defined with TensorTrait carries no ShapeInfo. For example, the ShapeInfo-free counterpart of LocalTensor<float> is LocalTensor<TensorTrait<float>>. |
Constraints
- A single API call cannot mix GlobalTensor/LocalTensor arguments of the TensorTrait and non-TensorTrait types.
- Copy construction and assignment between GlobalTensor/LocalTensor objects of the non-TensorTrait and TensorTrait types are not supported.
- Currently, the TensorTrait feature supports only the following APIs.
Table 2 APIs supported by TensorTrait

| API Category | API | Remarks |
|---|---|---|
| Basic APIs > Memory Management and Synchronization Control > TQue/TQueBind | AllocTensor, FreeTensor, EnQue, DeQue | - |
| Basic APIs > Vector computing > One-operand instructions | Exp, Ln, Abs, Reciprocal, Sqrt, Rsqrt, Not, Relu | - |
| Basic APIs > Vector computing > Two-operand instructions | Add, Sub, Mul, Div, Max, Min, And, Or, AddRelu, AddReluCast, AddDeqRelu, SubRelu, SubReluCast, MulAddDst, FusedMulAdd, FusedMulAddRelu | - |
| Basic APIs > Vector computing > Two-operand scalar instructions | Adds, Muls, Maxs, Mins, ShiftLeft, ShiftRight, LeakyRelu | - |
| Basic APIs > Data movement | DataCopy, Copy | The APIs for moving data slices require ShapeInfo, so GlobalTensor and LocalTensor of the TensorTrait type cannot be passed to them. |
| Basic instructions > ISASI (architecture-related) > Matrix computing | InitConstValue, LoadData, LoadDataWithTranspose, SetAippFunctions, LoadImageToLocal, LoadUnzipIndex, LoadDataUnzip, LoadDataWithSparse, SetFmatrix, SetLoadDataBoundary, SetLoadDataRepeat, SetLoadDataPaddingValue, Mmad, MmadWithSparse, Fixpipe, SetFixPipeConfig, SetFixpipeNz2ndFlag, SetFixpipePreQuantFlag, BroadCastVecToMM, SetHF32Mode, SetHF32TransMode, SetMMLayoutTransform, CheckLocalMemoryIA, Conv2D, Gemm | - |
Example
- TensorTrait example for two-operand instructions
```cpp
// Use TensorTrait to define the tensor data type.
AscendC::LocalTensor<AscendC::TensorTrait<half>> tensor1 = que1.DeQue<AscendC::TensorTrait<half>>();
AscendC::LocalTensor<AscendC::TensorTrait<half>> tensor2 = que2.DeQue<AscendC::TensorTrait<half>>();
AscendC::LocalTensor<AscendC::TensorTrait<half>> tensor3 = que3.AllocTensor<AscendC::TensorTrait<half>>();
Add(tensor3, tensor1, tensor2, tensor3.GetSize());
```
- TensorTrait example for two-operand scalar instructions
```cpp
#include "kernel_operator.h"

class KernelBinaryScalarTrait {
public:
    __aicore__ inline KernelBinaryScalarTrait() {}
    __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src);
        dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm);
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal =
            inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>();
        AscendC::DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal =
            inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>();
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal =
            outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>();
        uint64_t mask = 128;
        int16_t scalar = 2;
        // repeatTimes = 4: 128 elements per repeat, 512 elements in total
        // dstBlkStride, srcBlkStride = 1: no gap between blocks in one repeat
        // dstRepStride, srcRepStride = 8: no gap between repeats
        AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8});
        outQueueDst.EnQue(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal =
            outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>();
        AscendC::DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueSrc;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueDst;
    AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal;
};

extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
{
    KernelBinaryScalarTrait op;
    op.Init(src, dstGm);
    op.Process();
}
```
- TensorTrait example for matrix computing basic APIs
```cpp
#include "kernel_operator.h"

template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T>
class KernelMatmul {
public:
    __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In)
    {
        m = mIn;
        k = kIn;
        n = nIn;
        aSize = m * k;
        bSize = k * n;
        cSize = m * n;
        initl0 = initl0In;
        initl1 = initl1In;
    }
    __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
    {
        aGM.SetGlobalBuffer((__gm__ fmap_T *)a);
        bGM.SetGlobalBuffer((__gm__ weight_T *)b);
        cGM.SetGlobalBuffer((__gm__ dstCO1_T *)c);
        pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T));
        pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T));
        pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T));
        pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T));
        pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        SplitA();
        SplitB();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local =
            inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local =
            inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>();
        if (initl1 == true) {
            AscendC::InitConstValue(a1Local,
                {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1});
            AscendC::InitConstValue(b1Local,
                {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1});
        } else {
            AscendC::DataCopy(a1Local, aGM, aSize);
            AscendC::DataCopy(b1Local, bGM, bSize);
        }
        inQueueA1.EnQue(a1Local);
        inQueueB1.EnQue(b1Local);
    }
    __aicore__ inline void SplitA()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local =
            inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local =
            inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>();
        // 1. load2d L1 -> L0A
        AscendC::LoadData2dParams loadL0AParams;
        loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512;
        loadL0AParams.srcStride = 1;
        loadL0AParams.dstGap = 0;
        if (initl0 == true) {
            AscendC::InitConstValue(a2Local,
                {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1});
        } else {
            AscendC::LoadData(a2Local, a1Local, loadL0AParams);
        }
        inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local);
        inQueueA1.FreeTensor(a1Local);
    }
    __aicore__ inline void SplitB()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local =
            inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local =
            inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>();
        // 2. load2d L1 -> L0B
        AscendC::LoadData2dParams loadL0BParams;
        loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512;
        loadL0BParams.srcStride = 1;
        loadL0BParams.dstGap = 0;
        if (initl0 == true) {
            AscendC::InitConstValue(b2Local,
                {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1});
        } else {
            AscendC::LoadData(b2Local, b1Local, loadL0BParams);
        }
        inQueueB1.FreeTensor(b1Local);
        inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local =
            inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local =
            inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
            outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>();
        mmadParams.isBias = false;
        mmadParams.m = m;
        mmadParams.n = n;
        mmadParams.k = k;
        AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n
        outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local);
        inQueueA2.FreeTensor(a2Local);
        inQueueB2.FreeTensor(b2Local);
    }
#if __CCE_AICORE__ <= 200
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
            outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();
        uint16_t M_ = Ceil(m, 16) * 16;
        AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal;
        AscendC::TBuffAddr tbufublocal;
        tbufublocal.logicPos = (uint8_t)AscendC::QuePosition::C1;
        ublocal.SetAddr(tbufublocal);
        ublocal.InitBuffer(0, M_ * n);
        AscendC::DataCopyParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024);
        AscendC::DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX;
        AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams);
        AscendC::PipeBarrier<PIPE_ALL>();
        outQueueCO1.FreeTensor(c1Local);
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = m * n * sizeof(dstCO1_T) / ONE_BLK_SIZE;
        dataCopyParams.srcStride = 0;
        dataCopyParams.dstStride = 0;
        AscendC::DataCopy(cGM, ublocal, dataCopyParams);
    }
#else
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
            outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();
        AscendC::FixpipeParamsV220 fixpipeParams;
        fixpipeParams.nSize = n;
        fixpipeParams.mSize = m;
        fixpipeParams.srcStride = m;
        fixpipeParams.dstStride = n;
        fixpipeParams.ndNum = 1;
        fixpipeParams.srcNdStride = 0;
        fixpipeParams.dstNdStride = 0;
        AscendC::Fixpipe(cGM, c1Local, fixpipeParams);
        outQueueCO1.FreeTensor(c1Local);
    }
#endif

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::A1, 1> inQueueA1;
    AscendC::TQue<AscendC::QuePosition::A2, 1> inQueueA2;
    AscendC::TQue<AscendC::QuePosition::B1, 1> inQueueB1;
    AscendC::TQue<AscendC::QuePosition::B2, 1> inQueueB2;
    // dst queue
    AscendC::TQue<AscendC::QuePosition::CO1, 1> outQueueCO1;
    AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM;
    AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM;
    AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM;
    uint16_t m, k, n;
    bool initl0, initl1;
    uint16_t aSize, bSize, cSize, b2Size;
    AscendC::MmadParams mmadParams;
};

extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false(
    __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
{
    if ASCEND_IS_AIV {
        return;
    }
    KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false);
    op.Init(a, b, c);
    op.Process();
}
```