昇腾社区首页
中文
注册

copy_matrix_cc_to_cbuf/copy_matrix_cc_to_gm

功能说明

把CUBE运算结果C矩阵从L0C搬运至L1(copy_matrix_cc_to_cbuf)或GM(copy_matrix_cc_to_gm)。

接口原型

void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

参数说明

表1 L0C搬至L1参数说明

参数名

说明

取值范围

单位

dst

目的地址

/

/

src

源地址

/

/

sid

预留参数,设置为0即可

/

/

NSize

L0C源矩阵在N方向的大小

[0, 2^12-1]

elem

MSize

L0C源矩阵在M方向的大小。当M不是16的倍数时,硬件会读取额外的填充数据,并在写入目标时丢弃这些填充数据

[0, 2^16-1]

elem

dstStride_dst_D

使能NZ2ND格式转换时,表示目的ND矩阵每行中元素个数;未使能NZ2ND格式转换时,表示目的不同数据块间距离

[1, 2^32-1]

32B

srcStride

L0C源矩阵中不同数据块间距离,必须为16的倍数

[0, 2^16-1]

C0_size

UnitFlagMode

预留参数,设置为0即可

/

/

QuantPRE

量化模式

  • 5'b00000 无转换
  • 5'b00001 F32->F16。Xm[31:16]被配置为源突发长度,目的端的突发长度是源突发长度的一半。舍入模式为就近舍入到偶数(round to nearest even)。
  • 5'b00110 在L0C16的输出上附加F16乘法,即data_in_L0C * QUANT_PRE[15:0],其中QUANT_PRE[15:0]被视为标准FP16。
  • 5'b01000 VREQ8模式。
  • 5'b01001 REQ8模式。
  • 5'b01010 VDEQF16模式。
  • 5'b01011 DEQF16模式。
  • 5'b01100 VSHIFTS322S16模式。
  • 5'b01101 SHIFTS322S16模式。
  • 5'b10000 F32->BF16
  • 5'b10001 VQF162B8_PRE f16->s8/u8
  • 5'b10010 QF162B8_PRE f16->s8/u8
  • 5'b10011 VQF162S4_PRE f16->s4
  • 5'b10100 QF162S4_PRE f16->s4
  • 5'b10101 VREQ4 s32->s4
  • 5'b10110 REQ4 s32->s4
  • 5'b10111 VQF322B8_PRE f32->s8/u8
  • 5'b11000 QF322B8_PRE f32->s8/u8
  • 5'b11001 VQF322S4_PRE f32->s4
  • 5'b11010 QF322S4_PRE f32->s4

/

/

ReLUPRE

Relu模式

  • 3'b000: no ReLU
  • 3'b001: normal ReLU
  • 3'b010: leaky ReLU
  • 3'b011: pReLU

/

/

channelSplit

是否使能通道拆分

[0, 1]

/

NZ2ND_EN

是否使能NZ2ND格式转换

[0, 1]

/

NZ2ND_EN使能后,需要通过以下指令设置相关参数:

void set_nd_para(uint64_t config);
  • config[0:15]位表示nd块数量;
  • config[16:31]位表示源数据nd块步长,其单位为分形大小,取值范围为[1,512];
  • config[32:47]位表示目的数据nd块步长,其单位为元素,当(V)REQ4、(V)QF322S4_PRE、(V)QF162S4_PRE、QF162S4_POST使能时,必须是2的倍数,且其值不能为0。

流水类型

PIPE_FIX

类似指令

// copy_matrix_cc_to_cbuf 
void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
void copy_matrix_cc_to_cbuf(__cbuf__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, QuantMode_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN); 
 
// copy_matrix_cc_to_gm 
void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ float *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int32_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// copy_matrix_cc_to_cbuf_b4
void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// copy_matrix_cc_to_gm_b4
void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);