CumSum

Function Description

  • Processing along the first axis: The first row of elements remains unchanged, and elements in subsequent rows accumulate.

    Example: if the input tensor is ([[0, 1, 2], [3, 4, 5]]), the final output tensor is ([[0, 1, 2], [3, 5, 7]]).

  • Processing on the last axis: The first column of elements remains unchanged, and elements in subsequent columns accumulate.

    Example: if the input tensor is ([[0, 1, 2], [3, 4, 5]]), the final output tensor is ([[0, 1, 3], [3, 7, 12]]).
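The two accumulation rules can be sketched in plain, host-side C++ (illustrative only, not AscendC device code; the 2 x 3 input matches the examples above):

```cpp
#include <array>

using Mat = std::array<std::array<int, 3>, 2>;  // 2 x 3 tensor

// First axis: the first row is unchanged; each later row adds the
// running total of the rows above it.
Mat cumsumFirstAxis(const Mat& t) {
    Mat out = t;
    for (int j = 0; j < 3; ++j)
        out[1][j] = out[0][j] + t[1][j];
    return out;
}

// Last axis: the first column is unchanged; each later column adds the
// running total of the columns to its left.
Mat cumsumLastAxis(const Mat& t) {
    Mat out = t;
    for (int i = 0; i < 2; ++i)
        for (int j = 1; j < 3; ++j)
            out[i][j] = out[i][j - 1] + t[i][j];
    return out;
}
```

For the input [[0, 1, 2], [3, 4, 5]], `cumsumFirstAxis` yields [[0, 1, 2], [3, 5, 7]] and `cumsumLastAxis` yields [[0, 1, 3], [3, 7, 12]], matching the examples above.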

Prototype

  • Pass the temporary space through the sharedTmpBuffer input parameter.
    template <typename T, const CumSumConfig &config = defaultCumSumConfig>
    __aicore__ inline void CumSum(LocalTensor<T> &dstTensor, LocalTensor<T> &lastRowTensor, const LocalTensor<T> &srcTensor, LocalTensor<uint8_t> &sharedTmpBuffer, const CumSumInfo &cumSumInfo)
    
  • Allocate the temporary space through the API framework.
    template <typename T, const CumSumConfig &config = defaultCumSumConfig>
    __aicore__ inline void CumSum(LocalTensor<T> &dstTensor, LocalTensor<T> &lastRowTensor, const LocalTensor<T> &srcTensor, const CumSumInfo &cumSumInfo)
    

Parameters

Table 1 Parameters in the template

Parameter

Description

T

Data type of the operand.

config

Parameters for compiling the CumSum API.

struct CumSumConfig {
    bool isLastAxis{true};
    bool isReuseSource{false};
    bool outputLastRow{false};
};
  • isLastAxis: If the value is true, the last axis is used for computation. If the value is false, the first axis is used for computation.
  • isReuseSource: Whether the memory space of the source operand srcTensor can be reused.
  • outputLastRow: Whether to output the last row of data.
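For example, a configuration that computes along the last axis, does not reuse the source buffer, and also outputs the last row can be declared as follows (the initializers map positionally to isLastAxis, isReuseSource, and outputLastRow):

```cpp
// Compile-time configuration: last-axis accumulation, no source reuse,
// and the last row is written to lastRowTensor.
static constexpr AscendC::CumSumConfig cumSumConfig{true, false, true};
```

The configuration is then passed as the second template argument, e.g. AscendC::CumSum<T, cumSumConfig>(...).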
Table 2 API parameters

Parameter

Input/Output

Description

dstTensor

Output

Destination operand. The input elements are processed along the first axis or the last axis and their cumulative sum is calculated.

The type is LocalTensor.

lastRowTensor

Output

Destination operand. If outputLastRow in config is set to true, the last row of data is output.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

srcTensor

Input

Source operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

The source operand must have the same data type as the destination operand.

cumSumInfo

Input

Shape of srcTensor, of type CumSumInfo. The definition is as follows:

struct CumSumInfo
{
    uint32_t outter{0};    // length of the outer axis of the input data
    uint32_t inner{0};     // length of the inner axis of the input data
};

The value of CumSumInfo.inner*sizeof(T) must be an integer multiple of 32 bytes.

sharedTmpBuffer

Input

Temporary buffer.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

This parameter stores intermediate variables during the CumSum computation and is provided by the developer.

For details about how to obtain the temporary space size (BufferSize), see GetCumSumMaxMinTmpSize.

Returns

None

Availability

Constraints

  • For details about the alignment requirements of the operand address offset, see General Restrictions.
  • Only two-dimensional input is supported.
  • The value of inner * sizeof(T) must be an integer multiple of 32 bytes.
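The 32-byte constraint can be verified on the host side before launching the kernel. A minimal sketch in standard C++ (IsInnerAligned is an illustrative helper, not part of the AscendC API):

```cpp
#include <cstddef>

// Returns true when an inner axis of `inner` elements, each `elemSize`
// bytes, satisfies the 32-byte alignment requirement.
bool IsInnerAligned(std::size_t inner, std::size_t elemSize) {
    return (inner * elemSize) % 32 == 0;
}
```

For example, with 4-byte (float) elements, inner must be a multiple of 8; with 2-byte (half) elements, a multiple of 16.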

Example

#include "kernel_operator.h"

template <typename T>
class KernelCumSum
{
public:
    __aicore__ inline KernelCumSum(){}
    __aicore__ inline void Init(
        GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR lastRowGm, const AscendC::CumSumInfo& cumSumParams)
    {
        outer = cumSumParams.outter;
        inner = cumSumParams.inner;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(srcGm), outer * inner);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(dstGm), outer * inner);
        lastRowGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(lastRowGm), inner);
        pipe.InitBuffer(inQueueX, 1, outer * inner * sizeof(T));
        pipe.InitBuffer(outQueue, 1, outer * inner * sizeof(T));
        pipe.InitBuffer(lastRowQueue, 1, inner * sizeof(T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<T> srcLocal = inQueueX.AllocTensor<T>();
        AscendC::DataCopy(srcLocal, srcGlobal, outer * inner);
        inQueueX.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<T> dstLocal = outQueue.AllocTensor<T>();
        AscendC::LocalTensor<T> lastRowLocal = lastRowQueue.AllocTensor<T>();
        AscendC::LocalTensor<T> srcLocal = inQueueX.DeQue<T>();
        // isLastAxis = true, isReuseSource = false, outputLastRow = true
        static constexpr AscendC::CumSumConfig cumSumConfig{true, false, true};
        const AscendC::CumSumInfo cumSumInfo{outer, inner};
        AscendC::CumSum<T, cumSumConfig>(dstLocal, lastRowLocal, srcLocal, cumSumInfo);
        outQueue.EnQue<T>(dstLocal);
        lastRowQueue.EnQue<T>(lastRowLocal);
        inQueueX.FreeTensor(srcLocal);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<T> dstLocal = outQueue.DeQue<T>();
        AscendC::DataCopy(dstGlobal, dstLocal, outer * inner);
        outQueue.FreeTensor(dstLocal);
        AscendC::LocalTensor<T> lastRowLocal = lastRowQueue.DeQue<T>();
        AscendC::DataCopy(lastRowGlobal, lastRowLocal, inner);
        lastRowQueue.FreeTensor(lastRowLocal);
    }

private:
    AscendC::GlobalTensor<T> srcGlobal;
    AscendC::GlobalTensor<T> dstGlobal;
    AscendC::GlobalTensor<T> lastRowGlobal;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> lastRowQueue;
    uint32_t outer{1};
    uint32_t inner{1};
};

template <typename T>
__aicore__ inline void kernel_cumsum_operator(
    GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR lastRowGm, const AscendC::CumSumInfo &cumSumParams)
{
    KernelCumSum<T> op;
    op.Init(srcGm, dstGm, lastRowGm, cumSumParams);
    op.Process();
}