昇腾社区首页
中文
注册
开发者
下载

vconv

功能说明

vector 数据类型转换。接口形如 vconv_bf162f32/vconv_bf162f32a,通过后缀区分不同舍入模式:

R:四舍五入,偶数优先(C 语言中的 rint)

A:四舍五入,中间值向远离零的方向取整(C 语言中的 round)

F:向下取整(C 语言中的 floor)

C:向上取整(C 语言中的 ceil)

Z:向零取整(C 语言中的 trunc)

O:向奇数取整(冯·诺依曼舍入)

留空(null)默认为R模式。

该接口主要遵循单目运算模板。

该接口涉及不同宽度的类型转换,因此,源操作数和目的操作数 RepeatStride 不总是常见的8个块。接口以其中宽度较大的数据类型为准,由于转换前后元素个数是不变的,数据宽度更大的类型 RepeatStride 为8Block,而宽度较小的类型 RepeatStride 将小于8Block。

例如,f162s8 转换接口处理128个f16元素和128个s8元素,源操作数有8个块,目的操作数有4个块(而非常见的8个)。模板中对目标操作数的步幅和偏移定义在此仍然有效。反之,在 f322s64 接口中,源向量只有4个块,而目标向量则有8个块。

该接口支持MASK配置。MASK可控制哪些元素参与计算。

支持的数据类型及舍入模式如下表:

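| src | dst | rounding mode |
| --- | --- | --- |
| f32 | f32 | R/A/F/C/Z |
| f32 | f16 | (null)/R/A/F/C/Z/O |
| f32 | bf16 | R/A/F/C/Z |
| f32 | s64 | R/A/F/C/Z |
| f32 | s32 | R/A/F/C/Z |
| f32 | s16 | R/A/F/C/Z |
| f16 | s32 | R/A/F/C/Z |
| f16 | s16 | R/A/F/C/Z |
| f16 | s8 | (null)/R/A/F/C/Z |
| f16 | u8 | (null)/R/A/F/C/Z |
| f16 | s4 | (null)/R/A/F/C/Z |
| bf16 | s32 | R/A/F/C/Z |
| s16 | f16 | (null)/R/A/F/C/Z |
| s32 | f32 | (null)/R/A/F/C/Z |
| s64 | f32 | R/A/F/C/Z |
| DEQs162b8h | / | / |
| DEQs162b8l | / | / |
| VDEQs162b8h | / | / |
| VDEQs162b8l | / | / |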

DEQ/VDEQ 功能说明

部分类型转换接口涉及 DEQ/VDEQ,其功能是反量化。

DEQS162B8H/L 中:DEQSCALE[31:0] 表示为 M(在 f32 中表示,硬件将其视为 (1, 8, 10) 格式进行计算,即 1 位符号位、8 位指数位和 10 位尾数位)。DEQSCALE[45:37] 表示为 offset(s9 格式)。DEQSCALE[46] 用于指示量化结果是有符号还是无符号。DEQSCALE 中的其他位为保留位。

流程如下:

deq_factor[63:0] = DEQSCALE[63:0];
M[31:0] = deq_factor[31:0];
offset[8:0] = deq_factor[45:37];
deqs162b8_quantization(deq_factor, src_s16){
  tmp0[31:0] = s16_to_f32(src_s16);
  tmp1[31:0] = tmp0[31:0] * M;
  tmp2[8:0] = f32_to_s9_saturation(tmp1[31:0]);
  tmp3[8:0] = tmp2[8:0] + offset[8:0];
  result[7:0] = deq_factor[46] ? s8_saturation(tmp3[8:0]) : u8_saturation(tmp3[8:0]);
}

s8_saturation/u8_saturation 表现如下表:

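| 转换目标 | val[8:0](signed)符号 | 条件 | 结果 |
| --- | --- | --- | --- |
| to s8 | >=0 | <127 | val[7:0] |
| to s8 | >=0 | >=127 | 127(0x7F) |
| to s8 | <0 | <=-128 | -128(0x80) |
| to s8 | <0 | >-128 | val[7:0] |
| to u8 | >=0 | <255 | val[7:0] |
| to u8 | >=0 | >=255 | 255(0xFF) |
| to u8 | <0 | / | 0 |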

VDEQS162B8H/L 中:DEQSCALE 用作指向存储在 UB 中的实际量化向量的指针(间接寻址)。该量化向量是一个包含 16 个元素的 128 字节向量(16 个 64 位数字)。DEQSCALE[13:0] 是该 128 字节向量在 UB 中的地址,单位为 32Byte(即字节地址 = DEQSCALE[13:0] × 32)。将量化向量的每个 64 位元素表示为 deq_factor[i][63:0],做以下操作:

for(i = 0; i < 16; i++){
  deq_factor[i] = *((uint64_t*)(DEQSCALE[13:0] * 32) + i);
  deqs162b8_quantization(deq_factor[i], s16_element[i]);
}

DEQ 接口中的 h/l 控制 u8/s8 是落在 block 的高位还是低位,如下图所示。

参数说明

参数含义见 表1 单目运算参数说明

接口原型

// 通用的接口命名方式 vconv_{src}2{dst}{rnd_mode} 

// bf162s32
void vconv_bf162s32a(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_bf162s32c(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_bf162s32f(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_bf162s32r(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_bf162s32z(__ubuf__ int32_t *dst, __ubuf__ bfloat16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// f162s16
void vconv_f162s16a(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s16c(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s16f(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s16r(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s16z(__ubuf__ int16_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// f162s32
void vconv_f162s32a(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s32c(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s32f(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s32r(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s32z(__ubuf__ int32_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// f162s4
void vconv_f162s4(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s4a(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s4c(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s4f(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s4r(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162s4z(__ubuf__ void *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride);

// 其余接口不做展开
// f162{s8,u8}
void vconv_f162s8{(null),a,c,f,r,z}(__ubuf__ int8_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f162u8{(null),a,c,f,r,z}(__ubuf__ uint8_t *dst, __ubuf__ half *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// f322{bf16,f32,s16,s32,s64}
void vconv_f322bf16{a,c,f,r,z,o}(__ubuf__ bfloat16_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f322f32{a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f322s16{(null),a,c,f,r,z}(__ubuf__ int16_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f322s32{a,c,f,r,z}(__ubuf__ int32_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

void vconv_f322s64{a,c,f,r,z}(__ubuf__ int64_t *dst, __ubuf__ float *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// s162f16
void vconv_s162f16{(null),a,c,f,r,z}(__ubuf__ half *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// s322f32
void vconv_s322f32{(null),a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ int32_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// s642f32
void vconv_s642f32{a,c,f,r,z}(__ubuf__ float *dst, __ubuf__ int64_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride, uint16_t srcRepeatStride); 

// deqs162b8{h,l}
void vconv_deqs162b8h(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_deqs162b8h(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_deqs162b8l(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_deqs162b8l(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);

// vdeqs162b8{h,l}
void vconv_vdeqs162b8h(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_vdeqs162b8h(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_vdeqs162b8l(__ubuf__ int8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride); 

void vconv_vdeqs162b8l(__ubuf__ uint8_t *dst, __ubuf__ int16_t *src, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride);

流水类型

PIPE_V