copy_matrix_cc_to_cbuf/copy_matrix_cc_to_gm
功能说明
把CUBE运算结果C矩阵从L0C搬运至L1。
接口原型
void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
参数说明
参数名  | 
说明  | 
取值范围  | 
单位  | 
|---|---|---|---|
dst  | 
目的地址  | 
/  | 
/  | 
src  | 
源地址  | 
/  | 
/  | 
sid  | 
预留参数,设置为0即可  | 
/  | 
/  | 
NSize  | 
L0C源矩阵在N方向的大小  | 
[0, 2^12-1]  | 
elem  | 
MSize  | 
L0C源矩阵在M方向的大小。当M不是16的倍数时,硬件会读取额外的填充数据,并在写入目标时丢弃这些填充数据  | 
[0, 2^16-1]  | 
elem  | 
dstStride_dst_D  | 
使能NZ2ND格式转换时,表示目的ND矩阵每行中元素个数;未使能NZ2ND格式转换时,表示目的不同数据块间距离  | 
[1, 2^32-1]  | 
32B  | 
srcStride  | 
L0C源矩阵中不同数据块间距离,必须为16的倍数  | 
[0, 2^16-1]  | 
C0_size  | 
UnitFlagMode  | 
预留参数,设置为0即可  | 
/  | 
/  | 
QuantPRE  | 
量化模式 
  | 
/  | 
/  | 
ReLUPRE  | 
Relu模式 
  | 
/  | 
/  | 
channelSplit  | 
是否使能通道拆分  | 
[0, 1]  | 
/  | 
NZ2ND_EN  | 
是否使能NZ2ND格式转换  | 
[0, 1]  | 
/  | 
NZ2ND_EN使能后,需要通过以下指令设置相关参数:
void set_nd_para(uint64_t config);
- config[0:15]位表示nd块数量;
 - config[16:31]位表示源数据nd块步长,其单位为分形大小,取值范围为[1,512];
 - config[32:47]位表示目的数据nd块步长,其单位为元素,当(V)REQ4、(V)QF322S4_PRE、(V)QF162S4_PRE、QF162S4_POST使能时,必须是2的倍数。且其值不能为0。
 
流水类型
PIPE_FIX
类似指令
// copy_matrix_cc_to_cbuf void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf(__cbuf__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); // copy_matrix_cc_to_gm void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ float *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ int32_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); // copy_matrix_cc_to_cbuf_b4 void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); // copy_matrix_cc_to_gm_b4 void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
父主题: 矩阵输出搬运(Fixpipe)