昇腾社区首页
中文
注册

向量计算典型语义

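一条 vadd 的 intrinsic 接口最多可以完成三层 for 循环的运算:最内层循环是 element 层面的(同一个 block 内的元素需要连续且 32B 对齐);中间层循环是 block 层面的(每次 repeat 处理 8 个 block,使用 BlockStride 设置相邻 block 起始地址之间的间隔);最外层循环是 repeat 层面的,即重复的次数(使用 RepeatStride 设置相邻两次 repeat 起始地址之间的间隔)。从下面的语义可以看出,BlockStride 和 RepeatStride 均以 block(32B)为单位。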

/* intrinsic
void vadd(__ubuf__ DataType *dst, __ubuf__ DataType *src0, __ubuf__ DataType *src1, uint8_t repeat, 
          uint8_t dstBlockStride, uint8_t src0BlockStride, uint8_t src1BlockStride, 
          uint8_t dstRepeatStride, uint8_t src0RepeatStride, uint8_t src1RepeatStride); 
*/  

/* 语义
int blkNum = 8; 
int eleNumInOneBlk = 32 / sizeof(DataType); 
for (int i = 0; i < repeat; i++) { 
  for (int j = 0; j < blkNum; j++) { 
    for (int e = 0; e < eleNumInOneBlk ; e++) { 
      eltSrc0 = src0 + i * src0RepeatStride * eleNumInOneBlk + 
                      j * src0BlockStride * eleNumInOneBlk + e; // src0 element
      eltSrc1 = src1 + i * src1RepeatStride * eleNumInOneBlk +
                      j * src1BlockStride * eleNumInOneBlk + e; // src1 element
      eltDst = dst + i * dstRepeatStride * eleNumInOneBlk + 
                    j * dstBlockStride * eleNumInOneBlk + e; // dst element 
      *eltDst = *eltSrc0 + *eltSrc1; 
    } 
  } 
}
*/