昇腾社区首页
中文
注册

copy_gm_to_cbuf_multi_nd2nz

功能说明

从OUT到L1数据加载期间的数据格式转换指令,支持ND->NZ、NHWC->NC1HWC0和NHWC->C1HWNC0的格式转换。如果搬运数据的最低维度未对齐到32字节(针对C0_Size为32的情况),则在写入L1时会以零值将其填充至32字节。

存储在OUT中的数据为ND格式,存储在L1中的数据为NZ格式。该指令能够处理多个ND到NZ的转换。

每个数据块的源地址和目的地址可按以下循环计算:
for(i=0;i<nd_num;i++) {  // every loop: n*d*32
	src_nd_addr = Xn + SRC_nd_matrix_stride * i * sizeof(data_type);
	dst_nd_addr = Xd + DST_nz_matrix_stride * i * sizeof(data_type);
	for(j=0;j<n;j++) {   // every loop d*32
                src_n_addr = src_nd_addr + j * SRC_D * sizeof(data_type);
                dst_n_addr = dst_nd_addr + j * DST_nz_n_stride * C0_Size;
		for(k=0;k<ceil(d * sizeof(data_type) / 32);k++) {  // every loop: 32Byte
			src_block_addr = src_n_addr + k * 32;
			dst_block_addr = dst_n_addr + k * DST_nz_C0_stride * C0_Size;
		}
	}
}
表1 multi_nd2nz参数说明

参数名

说明

取值范围

单位

dst

目的地址

/

/

src

源数据地址

/

/

sid

用于SMMU TLB预取提示, 一般为0

/

/

ndNum

搬运数据nd块数量

[0, 2^12-1]

elem

nValue

数据块n方向长度

[0, 16384]

elem

dValue

数据块d方向长度

[0, 65535]

elem

srcNdMatrixStride

源数据中相邻两个nd块起始位置之间的距离

[0, 65535]

elem

srcDValue

SRC_D值,即源数据中相邻两行起始位置之间的距离(以元素为单位);SRC_D * sizeof(data_type) 为其对应的实际字节数。

[1, 65535]

elem

dstNzC0Stride

nz在L1中的两个C0之间的步长

[1, 16384]

C0_size

dstNzNStride

在L1中n方向相邻两行之间的步长(以C0_size为单位),用于DeConv和Weight

[1, 16384]

C0_size

dstNzMatrixStride

DST_nz_matrix_stride,即L1中相邻两个NZ矩阵块起始位置之间的步长

[1, 65535]

elem

约束:

  • 当type={B8,B16,B32S},C0_size为32字节时,目标数据不得存在重叠。如果存在重叠,写入L1时硬件不会报告任何警告或错误,并且无法保证重叠数据的写入顺序。
  • ndNum=0或nValue=0或dValue=0表示不执行,此指令将被视为 NOP(无操作指令),并会报告警告。

接口原型

void copy_gm_to_cbuf_multi_nd2nz_b8(__cbuf__ int8_t *dst, __gm__ int8_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ int16_t *dst, __gm__ int16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ uint32_t *dst, __gm__ uint32_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ int32_t *dst, __gm__ int32_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ uint16_t *dst, __gm__ uint16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b32s(__cbuf__ float *dst, __gm__ float *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b8(__cbuf__ uint8_t *dst, __gm__ uint8_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ bfloat16_t *dst, __gm__ bfloat16_t *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

void copy_gm_to_cbuf_multi_nd2nz_b16(__cbuf__ half *dst, __gm__ half *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride);

流水类型

PIPE_MTE2