BatchNorm

Function Usage

BatchNorm normalizes the input of each layer so that the distribution seen by that layer stays stable across training iterations, which speeds up training and improves the model's generalization capability (it also mitigates vanishing and exploding gradients). The basic idea is to normalize each input feature along the batch dimension within each mini-batch. Specifically, for input feature x, the BatchNorm calculation can be expressed as:

  1. For input feature x, calculate the mean μ and variance σ² along the batch dimension:

     μ = (1/B) · Σᵢ xᵢ,  σ² = (1/B) · Σᵢ (xᵢ − μ)²

  2. For each sample i, normalize input feature x using the mean and variance (ε prevents division by zero):

     x̂ᵢ = (xᵢ − μ) / √(σ² + ε)

  3. Scale and shift the normalized features with the learnable parameters γ and β:

     yᵢ = γ · x̂ᵢ + β

Prototype

  • Allocate the temporary space through the API framework.

    template <typename T, bool isReuseSource = false, bool isBasicBlock = false>
    __aicore__ inline void BatchNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamm, const LocalTensor<T>& beta, const T epsilon, BatchNormTiling& tiling)

  • Pass the temporary space through the sharedTmpBuffer input parameter.

    template <typename T, bool isReuseSource = false, bool isBasicBlock = false>
    __aicore__ inline void BatchNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamm, const LocalTensor<T>& beta, const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, BatchNormTiling& tiling)

Parameters

Table 1 Parameters in the template

  • T: Data type of the operand.
  • isReuseSource: Whether the source operand can be modified. This parameter is reserved; pass the default value false.
  • isBasicBlock: If the shape information and the tiling policy of inputX and output meet the basic block requirements, this parameter can be enabled to improve performance. It is disabled by default. The basic block requirements are as follows:
      • The value of originB is a multiple of 8.
      • The value of S × H is a multiple of 64 and less than 2048.
Table 2 API parameters

  • output (Output): Destination operand, of type LocalTensor with shape [B, S, H].
  • outputMean (Output): Mean; destination operand, of type LocalTensor with shape [S, H].
  • outputVariance (Output): Variance; destination operand, of type LocalTensor with shape [S, H].
  • inputX (Input): Source operand, of type LocalTensor with shape [B, S, H]. The data type of inputX must be the same as that of the destination operand, and the value of S*H must be 32-byte aligned.
  • gamm (Input): Source operand, of type LocalTensor with shape [B]. The data type of gamm must be the same as that of the destination operand, and the length must be 32-byte aligned.
  • beta (Input): Source operand, of type LocalTensor with shape [B]. The data type of beta must be the same as that of the destination operand, and the length must be 32-byte aligned.
  • sharedTmpBuffer (Input): Temporary buffer, provided by the developer, that stores intermediate variables during internal API computation. The type is LocalTensor, and the supported TPosition values are VECIN, VECCALC, and VECOUT. For details about how to obtain the temporary space size (BufferSize), see BatchNorm Tiling.
  • epsilon (Input): Small constant added to the variance to prevent division by zero. Its data type must be the same as that of inputX and output.
  • tiling (Input): Tiling information of the input data. For details about how to obtain the tiling information, see BatchNorm Tiling.

Returns

None

Availability

Precautions

  • For details about the alignment requirements of the operand address offset, see General Restrictions.
  • Currently, only the ND format is supported.
  • The S*H of input data must be 32-byte aligned.
  • The source operand address can overlap the destination operand address.

Example

#include "kernel_operator.h"

template <typename dataType, bool isReuseSource = false, bool isBasicBlock = false>
class KernelBatchnorm {
public:
    __aicore__ inline KernelBatchnorm()
    {}
    __aicore__ inline void Init(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
        GM_ADDR outputMeanGm, GM_ADDR outputVariance_gm, const TilingData &tiling)
    {
        bLength = tiling.bLength;
        sLength = tiling.sLength;
        hLength = tiling.hLength;
        batchNormTilling = tiling.batchNormTilingData;
        originalBLength = tiling.originalBLength;
        bshLength = originalBLength * sLength * hLength;
        shLength = sLength * hLength;
        inputXGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(inputXGm), bshLength);
        gammGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(gammGm), bLength);
        betaGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(betaGm), bLength);
        outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputGm), bshLength);
        outputMeanGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputMeanGm), shLength);
        outputVarianceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType *>(outputVariance_gm), shLength);
        pipe.InitBuffer(inQueueX, 1, sizeof(dataType) * bshLength);
        pipe.InitBuffer(inQueueGamma, 1, sizeof(dataType) * bLength);
        pipe.InitBuffer(inQueueBeta, 1, sizeof(dataType) * bLength);
        pipe.InitBuffer(outQueue, 1, sizeof(dataType) * bshLength);
        pipe.InitBuffer(outQueueMean, 1, sizeof(dataType) * shLength);
        pipe.InitBuffer(outQueueVariance, 1, sizeof(dataType) * shLength);
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    // Stage the inputs from global memory into the VECIN queues.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<dataType> inputXLocal = inQueueX.AllocTensor<dataType>();
        AscendC::LocalTensor<dataType> gammaLocal = inQueueGamma.AllocTensor<dataType>();
        AscendC::LocalTensor<dataType> betaLocal = inQueueBeta.AllocTensor<dataType>();
        AscendC::DataCopy(inputXLocal, inputXGlobal, bshLength);
        AscendC::DataCopy(gammaLocal, gammGlobal, bLength);
        AscendC::DataCopy(betaLocal, betaGlobal, bLength);
        inQueueX.EnQue(inputXLocal);
        inQueueGamma.EnQue(gammaLocal);
        inQueueBeta.EnQue(betaLocal);
    }
    // Dequeue the inputs, run the BatchNorm high-level API, and enqueue the results.
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<dataType> inputXLocal = inQueueX.DeQue<dataType>();
        AscendC::LocalTensor<dataType> gammaLocal = inQueueGamma.DeQue<dataType>();
        AscendC::LocalTensor<dataType> betaLocal = inQueueBeta.DeQue<dataType>();
        AscendC::LocalTensor<dataType> outputLocal = outQueue.AllocTensor<dataType>();
        AscendC::LocalTensor<dataType> meanLocal = outQueueMean.AllocTensor<dataType>();
        AscendC::LocalTensor<dataType> varianceLocal = outQueueVariance.AllocTensor<dataType>();
        AscendC::BatchNorm<dataType, isReuseSource, isBasicBlock>(outputLocal,
            meanLocal,
            varianceLocal,
            inputXLocal,
            gammaLocal,
            betaLocal,
            (dataType)epsilon,
            batchNormTilling);
        outQueue.EnQue<dataType>(outputLocal);
        outQueueMean.EnQue<dataType>(meanLocal);
        outQueueVariance.EnQue<dataType>(varianceLocal);
        inQueueX.FreeTensor(inputXLocal);
        inQueueGamma.FreeTensor(gammaLocal);
        inQueueBeta.FreeTensor(betaLocal);
    }
    // Copy the results from the VECOUT queues back to global memory.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<dataType> outputLocal = outQueue.DeQue<dataType>();
        AscendC::LocalTensor<dataType> meanLocal = outQueueMean.DeQue<dataType>();
        AscendC::LocalTensor<dataType> varianceLocal = outQueueVariance.DeQue<dataType>();

        AscendC::DataCopy(outputGlobal, outputLocal, bshLength);
        AscendC::DataCopy(outputMeanGlobal, meanLocal, shLength);
        AscendC::DataCopy(outputVarianceGlobal, varianceLocal, shLength);

        outQueue.FreeTensor(outputLocal);
        outQueueMean.FreeTensor(meanLocal);
        outQueueVariance.FreeTensor(varianceLocal);
    }

private:
    AscendC::GlobalTensor<dataType> inputXGlobal;
    AscendC::GlobalTensor<dataType> gammGlobal;
    AscendC::GlobalTensor<dataType> betaGlobal;
    AscendC::GlobalTensor<dataType> outputGlobal;
    AscendC::GlobalTensor<dataType> outputMeanGlobal;
    AscendC::GlobalTensor<dataType> outputVarianceGlobal;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueGamma;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueBeta;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueMean;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueVariance;
    uint32_t bLength;
    uint32_t sLength;
    uint32_t hLength;
    uint32_t originalBLength;
    dataType epsilon = 0.001;
    uint32_t bshLength;
    uint32_t shLength;
    BatchNormTiling batchNormTilling;
};

extern "C" __global__ __aicore__ void kernel_batchnorm_operator(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm,
    GM_ADDR outputGm, GM_ADDR outputMeanGm, GM_ADDR outputVariance_gm, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelBatchnorm<half, false, false> op;
    op.Init(inputXGm, gammGm, betaGm, outputGm, outputMeanGm, outputVariance_gm, tilingData);
    op.Process();
}