copy_gm_to_cbuf_multi_nd2nz
功能说明
从OUT到L1数据加载期间的数据格式转换指令,支持ND->NZ、NHWC->NC1HWC0和NHWC->C1HWNC0的格式转换。如果移动数据的最低维度未对齐到32字节(针对C0_Size为32的情况),则会对其在L1中填充零值到32B。
存储在OUT中的数据为ND格式,存储在L1中的数据为NZ格式。该指令能够处理多个ND到NZ的转换。
搬运过程中源地址与目的地址的计算方式可用如下循环表示:
for (i = 0; i < nd_num; i++) {  // every loop: n*d*32
    src_nd_addr = Xn + SRC_nd_matrix_stride * i * sizeof(data_type);
    dst_nd_addr = Xd + DST_nz_matrix_stride * i * sizeof(data_type);
    for (j = 0; j < n; j++) {  // every loop d*32
        src_n_addr = src_nd_addr + j * SRC_D * sizeof(data_type);
        dst_n_addr = dst_nd_addr + j * DST_nz_n_stride * C0_Size;
        for (k = 0; k < ceil(d * sizeof(data_type) / 32); k++) {  // every loop: 32Byte
            src_block_addr = src_n_addr + k * 32;
            dst_block_addr = dst_n_addr + k * DST_nz_C0_stride * C0_Size;
        }
    }
}
参数名 |
说明 |
取值范围 |
单位 |
---|---|---|---|
dst |
目的地址 |
/ |
/ |
src |
源数据地址 |
/ |
/ |
sid |
用于SMMU TLB预取提示, 一般为0 |
/ |
/ |
ndNum |
搬运数据nd块数量 |
[0, 2^12-1] |
elem |
nValue |
数据块n方向长度 |
[0, 16384] |
elem |
dValue |
数据块d方向长度 |
[0, 65535] |
elem |
srcNdMatrixStride |
源数据块nd块之间的距离 |
[0, 65535] |
elem |
srcDValue |
SRC_D值,即源数据中相邻两行起始地址之间的间隔(以元素为单位);SRC_D * sizeof(data_type) 是实际的字节大小。 |
[1, 65535] |
elem |
dstNzC0Stride |
NZ格式数据在L1中,同一行沿d方向相邻两个C0块(32B)起始地址之间的步长 |
[1, 16384] |
C0_size |
dstNzNStride |
NZ格式数据在L1中,n方向相邻两行起始地址之间的步长,用于DeConv和Weight |
[1, 16384] |
C0_size |
dstNzMatrixStride |
DST_nz_matrix_stride,即L1中相邻两个NZ矩阵(块)起始地址之间的步长 |
[1, 65535] |
elem |
约束:
- 当type={B8,B16,B32S},C0_size为32字节时,目标数据不得存在重叠。如果存在重叠,写入L1时硬件不会报告任何警告或错误,并且无法保证重叠数据的写入顺序。
- ndNum=0或nValue=0或dValue=0表示不执行,此指令将被视为 NOP(无操作指令),并会报告警告。
接口原型
void copy_gm_to_cbuf_multi_nd2nz_b8(__cbuf__ int8_t *dst, __gm__ int8_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ int16_t *dst, __gm__ int16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ uint32_t *dst, __gm__ uint32_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ int32_t *dst, __gm__ int32_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ uint16_t *dst, __gm__ uint16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ float *dst, __gm__ float *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b8(__cbuf__ uint8_t *dst, __gm__ uint8_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ bfloat16_t *dst, __gm__ bfloat16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ half *dst, __gm__ half *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);
流水类型
PIPE_MTE2
父主题: 矩阵搬运格式转换