昇腾社区首页
中文
注册
开发者
下载

copy_matrix_cc_to_cbuf/copy_matrix_cc_to_gm

功能说明

把CUBE运算结果C矩阵从L0C搬运至L1或GM。

接口原型

// 相同接口的不同原型区别在于源地址和目的地址的数据类型不同
// copy_matrix_cc_to_cbuf
void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf(__cbuf__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// void *dst为b4
void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_cbuf_b4(__cbuf__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// copy_matrix_cc_to_gm
void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ bfloat16_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ float *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ half *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int16_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ uint8_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm(__gm__ int32_t *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

// void *dst为b4
void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ float *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

void copy_matrix_cc_to_gm_b4(__gm__ void *dst, __cc__ int32_t *src, uint8_t sid, uint16_t NSize, uint16_t MSize, uint32_t dstStride_dst_D, uint16_t srcStride, uint8_t UnitFlagMode, uint64_t QuantPRE, uint8_t ReLUPRE, bool channelSplit, bool NZ2ND_EN);

参数说明

表1 L0C搬至L1参数说明

参数名

说明

取值范围

单位

dst

目的地址,L1/GM。

若目标地址是L1,则是32字节对齐的;若目标地址是GM,则是单字节对齐的。

/

/

src

源地址,源地址数据类型为{f32, s32}。对于type={f32, s32},其是64字节对齐的。

/

/

sid

预留参数,设置为0即可。

/

/

NSize

L0C源矩阵在n方向的大小。

如果启用了fp32通道拆分,它是8的倍数。

如果启用了int4通道合并,它是64的倍数。

如果NZ2ND_EN使能,则范围为[1,8192]。

[0, 2^12-1]

elem

MSize

L0C源矩阵在m方向的大小。当m不是16的倍数时,硬件会读取额外的填充数据,并在写入目标时丢弃这些填充数据。

[0, 2^16-1]

elem

dstStride_dst_D

如果NZ2ND_EN不使能,则它是不同数据块的dst stride(首地址到首地址),单位为32B。

如果NZ2ND_EN使能,它是以元素为单位的dst_D值,dst_D含义见图1 参数含义示意图

[1, 2^32-1]

32B/elem

srcStride

L0C源矩阵中不同数据块间距离,必须为16的倍数,srcStride含义见图1 参数含义示意图中的src_stride。

[0, 2^16-1]

C0_size

UnitFlagMode

预留参数,设置为0即可

/

/

QuantPRE

预量化模式,预留参数,设为5'b00000即可,代表不做量化。

/

/

ReLUPRE

ReLU模式

  • 3’b000: no ReLU;
  • 3’b001: normal ReLU;
  • 3’b010: leaky ReLU;
  • 3’b011: pReLU。

/

/

channelSplit

是否使能通道切分。

[0, 1]

/

NZ2ND_EN

是否使能NZ2ND_EN格式转换。

[0, 1]

/

ReLUPRE使用leaky ReLU模式后,需要通过以下接口设置相关参数:

void set_lrelu_alpha(float config);

config[31:0]:ReLU_PRE阶段Leaky ReLU中的alpha。

NZ2ND_EN使能后,需要通过以下接口设置相关参数:

void set_nd_para(uint64_t config);
  • config[0:15]位表示nd块数量;
  • config[16:31]位表示源数据nd块步长,其含义见图1 参数含义示意图中的src_nd_stride,其单位为分形大小,取值范围为[1,512],对于type={f32, s32},分形大小是1024B;
  • config[32:47]位表示目的数据nd块步长,其含义见图1 参数含义示意图中的dst_nd_stride,其单位为元素。
图1 参数含义示意图

注意:

目标数据不能有重叠。如果对L1或OUT有重叠写入,硬件不会报告任何警告和错误,也不保证重叠数据的写入顺序。

NSize=0或MSize=0或nd块数量为0表示不执行,该接口将被视为NOP并报告警告。

流水类型

PIPE_FIX