load_cbuf_to_ca_transpose
功能说明
实现数据从L1到L0A的搬运,并在搬运的同时完成简单的分形矩阵转置。与load_cbuf_to_ca相比,其区别在于:本类接口始终带有转置操作,而load_cbuf_to_ca的转置操作由参数transpose控制;此外,本类接口支持的数据类型范围更广(支持{b4, b8, b16, b32}),而load_cbuf_to_ca在实现分形转置时仅支持b16数据类型。
接口原型
// 相同接口的不同原型区别在于源地址和目的地址的数据类型不同
void load_cbuf_to_ca_transpose(__ca__ bfloat16_t *dst, __cbuf__ bfloat16_t *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ half *dst, __cbuf__ half *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ float *dst, __cbuf__ float *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ int32_t *dst, __cbuf__ int32_t *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ int8_t *dst, __cbuf__ int8_t *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ uint32_t *dst, __cbuf__ uint32_t *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
void load_cbuf_to_ca_transpose(__ca__ uint8_t *dst, __cbuf__ uint8_t *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstGap, bool addrmode, uint16_t dstFracGap);
参数说明
参数含义见表1 具备转置的矩阵输入搬运参数说明。
流水类型
PIPE_MTE1
父主题: 具备转置的矩阵输入搬运