下面的样例展示了数学库kernel侧API和Tiling API的配套使用方法,具体流程如下:
Host侧调用Tiling接口,获取所需临时空间的大小,并将其写入tiling data中;kernel侧再读取tiling data,获取相应的临时空间大小,并据此分配临时空间。
Host侧Tiling API 使用样例:
#include <vector> #include "register/op_def_registry.h" #include "register/tilingdata_base.h" #include "tiling/tiling_api.h" namespace optiling { BEGIN_TILING_DATA_DEF(AsinCustomTilingData) TILING_DATA_FIELD_DEF(uint32_t, srcSize); TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize); END_TILING_DATA_DEF; static ge::graphStatus TilingFunc(gert::TilingContext* context) { // Input source shapes. std::vector<int64_t> srcDims = {16, 128}; uint32_t srcSize = 1; for (auto dim : srcDims) { srcSize *= dim; } uint32_t typeSize = 2; ge::Shape shape(srcDims); uint32_t tmpSize = AscendC::GetAsinMinTmpSize(shape, typeSize, false); AsinCustomTilingData tiling; tiling.set_srcSize(srcSize); tiling.set_tmpBufferSize(tmpSize); context->SetBlockDim(1); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); context->SetTilingKey(1); return ge::GRAPH_SUCCESS; } } // namespace optiling
#include "kernel_operator.h"

// Kernel-side sample: computes Asin over a whole source tensor in a single
// CopyIn -> Compute -> CopyOut pass, using a temporary buffer whose size is
// supplied through the tiling data.
//
// NOTE(review): LocalTensor, GlobalTensor, TPipe, TQue, QuePosition, DataCopy
// and Asin are used unqualified - confirm they are in scope in your toolchain
// version.
template <typename srcType>
class KernelAsin {
public:
    __aicore__ inline KernelAsin() {}

    // Binds the global input/output buffers and reserves the local buffers.
    //   src_gm / dst_gm : global-memory addresses of the source and destination.
    //   srcSize         : number of srcType elements to process.
    //   tmpBufferSize   : length passed as-is to InitBuffer for the temporary
    //                     queue (the other queues use srcSize * sizeof(srcType)).
    __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR dst_gm, uint32_t srcSize, uint32_t tmpBufferSize)
    {
        srcGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(src_gm), srcSize);
        dstGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dst_gm), srcSize);

        // Input and output queues each hold one buffer covering the whole tensor.
        const auto ioBytes = srcSize * sizeof(srcType);
        pipe_.InitBuffer(inQueue_, 1, ioBytes);
        pipe_.InitBuffer(outQueue_, 1, ioBytes);
        pipe_.InitBuffer(tmpQueue_, 1, tmpBufferSize);

        elementCount_ = srcSize;
    }

    // Runs the three pipeline stages once.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    // Copies the source data from global memory into a local tensor and
    // enqueues it for the compute stage.
    __aicore__ inline void CopyIn()
    {
        LocalTensor<srcType> input = inQueue_.AllocTensor<srcType>();
        DataCopy(input, srcGlobal_, elementCount_);
        inQueue_.EnQue(input);
    }

    // Applies Asin to the dequeued input, writing into a freshly allocated
    // output tensor, then releases the input and temporary tensors.
    __aicore__ inline void Compute()
    {
        LocalTensor<srcType> output = outQueue_.AllocTensor<srcType>();
        LocalTensor<srcType> input = inQueue_.DeQue<srcType>();
        // NOTE(review): the temporary tensor is typed as srcType and Asin is
        // given three template arguments - confirm both against the Asin API
        // reference for the CANN version in use.
        LocalTensor<srcType> scratch = tmpQueue_.AllocTensor<srcType>();

        Asin<srcType, false, true>(output, input, scratch);

        outQueue_.EnQue<srcType>(output);
        inQueue_.FreeTensor(input);
        tmpQueue_.FreeTensor(scratch);
    }

    // Copies the computed result back to global memory and frees the tensor.
    __aicore__ inline void CopyOut()
    {
        LocalTensor<srcType> output = outQueue_.DeQue<srcType>();
        DataCopy(dstGlobal_, output, elementCount_);
        outQueue_.FreeTensor(output);
    }

private:
    GlobalTensor<srcType> srcGlobal_;
    GlobalTensor<srcType> dstGlobal_;
    TPipe pipe_;
    TQue<QuePosition::VECIN, 1> inQueue_;
    TQue<QuePosition::VECIN, 1> tmpQueue_;
    TQue<QuePosition::VECOUT, 1> outQueue_;
    uint32_t elementCount_ = 0;  // number of elements copied in and out
};

// Kernel entry point: reads srcSize and tmpBufferSize from the tiling data,
// initializes a half-precision KernelAsin, and runs it when the tiling key is 1.
extern "C" __global__ __aicore__ void kernel_asin_operator(GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);

    KernelAsin<half> asinOp;
    asinOp.Init(src_gm, dst_gm, tilingData.srcSize, tilingData.tmpBufferSize);

    if (TILING_KEY_IS(1)) {
        asinOp.Process();
    }
}