copy_matrix_cc_to_cbuf/copy_matrix_cc_to_gm
功能说明
把CUBE运算结果C矩阵从L0C搬运至L1或GM。
接口原型
```c
// 相同接口的不同原型区别在于源地址和目的地址的数据类型不同

// copy_matrix_cc_to_cbuf
void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// void *dst为b4
void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// copy_matrix_cc_to_gm
void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ float *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm(__gm__ int32_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// void *dst为b4
void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);
```
参数说明
| 参数名 | 说明 | 取值范围 | 单位 |
|---|---|---|---|
| dst | 目的地址,L1/GM。 若目标地址是L1,则是32字节对齐的;若目标地址是GM,则是单字节对齐的。 | / | / |
| src | 源地址,源地址数据类型为{f32, s32}。对于type={f32, s32},其是64字节对齐的。 | / | / |
| sid | 预留参数,设置为0即可。 | / | / |
| NSize | L0C源矩阵在n方向的大小。 如果启用了fp32通道拆分,它是8的倍数。 如果启用了int4通道合并,它是64的倍数。 如果NZ2ND_EN使能,则范围为[1,8192]。 | [0, 2^12-1] | elem |
| MSize | L0C源矩阵在m方向的大小。当m不是16的倍数时,硬件会读取额外的填充数据,并在写入目标时丢弃这些填充数据。 | [0, 2^16-1] | elem |
| dstStride_dst_D | 如果NZ2ND_EN不使能,则它是不同数据块的dst stride(首地址到首地址),单位为32B。 如果NZ2ND_EN使能,它是以元素为单位的dst_D值,dst_D含义见图1 参数含义示意图。 | [1, 2^32-1] | 32B/elem |
| srcStride | L0C源矩阵中不同数据块间距离,必须为16的倍数,srcStride含义见图1 参数含义示意图中的src_stride。 | [0, 2^16-1] | C0_size |
| UnitFlagMode | 预留参数,设置为0即可。 | / | / |
| QuantPRE | 预量化模式,预留参数,设为5'b00000即可,代表不做量化。 | / | / |
| ReLUPRE | ReLU模式。 | / | / |
| channelSplit | 是否使能通道切分。 | [0, 1] | / |
| NZ2ND_EN | 是否使能NZ2ND_EN格式转换。 | [0, 1] | / |

ReLUPRE使用Leaky ReLU模式时,需要通过以下接口设置相关参数:
void set_lrelu_alpha(float config);
config[31:0]:ReLU_PRE阶段Leaky ReLU中的alpha。
NZ2ND_EN使能后,需要通过以下接口设置相关参数:
void set_nd_para(uint64_t config);
- config[0:15]位表示nd块数量;
- config[16:31]位表示源数据nd块步长,其含义见图1 参数含义示意图中的src_nd_stride,其单位为分形大小,取值范围为[1,512],对于type={f32, s32},分形大小是1024B;
- config[32:47]位表示目的数据nd块步长,其含义见图1 参数含义示意图中的dst_nd_stride,其单位为元素。
注意:
目标数据不能有重叠。如果对L1或GM(即OUT)有重叠写入,硬件不会报告任何警告和错误,也不保证重叠数据的写入顺序。
当NSize=0、MSize=0或nd块数量为0时,不执行搬运,该接口将被视为NOP并报告警告。
流水类型
PIPE_FIX
