vconv
功能说明
vector 数据类型转换。接口形如 vconv_bf162f32/vconv_bf162f32a,通过接口名末尾的后缀区分不同舍入模式:
R:四舍五入,偶数优先(C 语言中的 rint)
A:四舍五入,中间值向远离零的方向取整(C 语言中的 round)
F:向下取整(C 语言中的 floor)
C:向上取整(C 语言中的 ceil)
Z:向零取整(C 语言中的 trunc)
O:向奇数取整(冯·诺依曼舍入)
留空(null)默认为R模式。
该接口主要遵循单目运算模板。
该接口涉及不同宽度的类型转换,因此源操作数和目的操作数的 RepeatStride 不总是常见的8个Block。接口以其中宽度较大的数据类型为准:由于转换前后元素个数不变,数据宽度较大的类型 RepeatStride 为8Block,而宽度较小的类型 RepeatStride 将小于8Block。
例如,f162s8 转换接口处理128个f16元素和128个s8元素,源操作数有8个块,目的操作数有4个块(而非常见的8个)。模板中对目标操作数的步幅和偏移定义在此仍然有效。反之,在 f322s64 接口中,源向量只有4个块,而目标向量则有8个块。
该接口支持MASK配置。MASK可控制哪些元素参与计算。
支持的数据类型及舍入模式如下表:
| src | dst | rounding mode |
|---|---|---|
| f32 | f32 | R/A/F/C/Z |
| f32 | f16 | (null)/R/A/F/C/Z/O |
| f32 | bf16 | R/A/F/C/Z |
| f32 | s64 | R/A/F/C/Z |
| f32 | s32 | R/A/F/C/Z |
| f32 | s16 | R/A/F/C/Z |
| f16 | s32 | R/A/F/C/Z |
| f16 | s16 | R/A/F/C/Z |
| f16 | s8 | (null)/R/A/F/C/Z |
| f16 | u8 | (null)/R/A/F/C/Z |
| f16 | s4 | (null)/R/A/F/C/Z |
| bf16 | s32 | R/A/F/C/Z |
| s16 | f16 | (null)/R/A/F/C/Z |
| s32 | f32 | (null)/R/A/F/C/Z |
| s64 | f32 | R/A/F/C/Z |
| DEQs162b8h | / | / |
| DEQs162b8l | / | / |
| VDEQs162b8h | / | / |
| VDEQs162b8l | / | / |
DEQ/VDEQ 功能说明
部分类型转换接口涉及 DEQ/VDEQ,其功能是反量化。
DEQS162B8H/L 中:DEQSCALE[31:0] 记为 M(以 f32 格式表示,硬件将其视为 (1, 8, 10) 格式进行计算,即 1 位符号位、8 位指数位和 10 位尾数位)。DEQSCALE[45:37] 记为 offset(s9 格式)。DEQSCALE[46] 用于指示量化结果是有符号还是无符号:为 1 时结果按 s8 饱和,为 0 时结果按 u8 饱和。DEQSCALE 中的其他位为保留位。
流程如下:
deq_factor[63:0] = DEQSCALE[63:0];
M[31:0] = deq_factor[31:0];
offset[8:0] = deq_factor[45:37];
deqs162b8_quantization(deq_factor, src_s16){
tmp0[31:0] = s16_to_f32(src_s16);
tmp1[31:0] = tmp0[31:0] * M;
tmp2[8:0] = f32_to_s9_saturation(tmp1[31:0]);
tmp3[8:0] = tmp2[8:0] + offset[8:0];
result[7:0] = deq_factor[46] ? s8_saturation(tmp3[8:0]) : u8_saturation(tmp3[8:0]);
}
s8_saturation/u8_saturation 表现如下表:
| val[8:0] signed | >=0 | >=0 | <0 | <0 |
|---|---|---|---|---|
| to s8(条件) | <127 | >=127 | <=-128 | >-128 |
| to s8(结果) | val[7:0] | 127(0x7F) | -128(0x80) | val[7:0] |
| to u8(条件) | <255 | >=255 | 任意 | 任意 |
| to u8(结果) | val[7:0] | 255(0xFF) | 0 | 0 |
VDEQS162B8H/L 中:DEQSCALE 用作指向存储在 UB 中的实际反量化向量的指针(间接寻址)。该反量化向量共 128 字节,包含 16 个 64 位元素。DEQSCALE[13:0] 是该 128 字节反量化向量在 UB 中的地址,单位为 32Byte。将反量化向量的每个 64 位元素记为 deq_factor[i][63:0],执行以下操作:
for(i = 0; i < 16; i++){
deq_factor[i] = *((uint64_t*)(DEQSCALE[13:0] * 32) + i);
deqs162b8_quantization(deq_factor[i], s16_element[i]);
}
DEQ 接口中的 h/l 控制 u8/s8 结果落在 block 的高位还是低位,如下图所示。

参数说明
参数含义见 表1 单目运算参数说明。
接口原型
// 通用的接口命名方式 vconv_{src}2{dst}{rnd_mode}
// bf162s32
void vconv_bf162s32a(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_bf162s32c(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_bf162s32f(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_bf162s32r(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_bf162s32z(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// f162s16
void vconv_f162s16a(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s16c(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s16f(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s16r(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s16z(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// f162s32
void vconv_f162s32a(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s32c(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s32f(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s32r(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s32z(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// f162s4
void vconv_f162s4(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s4a(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s4c(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s4f(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s4r(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162s4z(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// 其余接口不做展开
// f162{s8,u8}
void vconv_f162s8{(null),a,c,f,r,z}(__ubuf__ int8_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f162u8{(null),a,c,f,r,z}(__ubuf__ uint8_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// f322{bf16,f32,s16,s32,s64}
void vconv_f322bf16{a,c,f,r,z,o}(__ubuf__ bfloat16_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f322f32{a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f322s16{(null),a,c,f,r,z}(__ubuf__ int16_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f322s32{a,c,f,r,z}(__ubuf__ int32_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
void vconv_f322s64{a,c,f,r,z}(__ubuf__ int64_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// s162f16
void vconv_s162f16{(null),a,c,f,r,z}(__ubuf__ half *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// s322f32
void vconv_s322f32{(null),a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ int32_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// s642f32
void vconv_s642f32{a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ int64_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);
// deqs162b8{h,l}
void vconv_deqs162b8h(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_deqs162b8h(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_deqs162b8l(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_deqs162b8l(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
// vdeqs162b8{h,l}
void vconv_vdeqs162b8h(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_vdeqs162b8h(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_vdeqs162b8l(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
void vconv_vdeqs162b8l(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);
流水类型
PIPE_V