Applies the SwiGLU activation function and performs per-token quantization on the result.
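For each token (row) of the input x, split evenly along the last axis into a and b, the golden reference below computes y = (a * sigmoid(a)) * b, a per-row scale = max(|y|) / 127, and q = round(y / scale) cast to int8; outTensor1 holds q and outTensor2 holds the per-token scales.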
```cpp
struct SwigluQuantParam {
    enum QuantType : int {
        QUANT_TYPE_PER_TOKEN = 0,
    };
    QuantType quantType = QUANT_TYPE_PER_TOKEN;
    uint8_t rsv[8] = {0};
};
```
| Member | Type | Default | Range | Required | Description |
|---|---|---|---|---|---|
| quantType | QuantType | QUANT_TYPE_PER_TOKEN | [0] | Yes | Per-token quantization. |
| rsv[8] | uint8_t | {0} | [0] | No | Reserved field. |
| Parameter | Shape | Data type | Format | Description |
|---|---|---|---|---|
| inTensor | [nTokens, 2 * hiddenSize] | float16/bf16 | ND | Input tensor. |
| Parameter | Shape | Data type | Format | Description |
|---|---|---|---|---|
| outTensor1 | [nTokens, hiddenSize] | int8 | ND | Output tensor: the quantized values. |
| outTensor2 | [nTokens] | float | ND | Output tensor: the per-token quantization scales. |
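To recover an approximation of the float activation from the two outputs, multiply each row of outTensor1 by its scale from outTensor2. A minimal NumPy sketch (the function and variable names here are illustrative, not part of the operation's API):

```python
import numpy as np

def dequantize(quant_y: np.ndarray, scales: np.ndarray) -> np.ndarray:
    """Approximate the pre-quantization SwiGLU output.

    quant_y: int8 array of shape [nTokens, hiddenSize] (outTensor1)
    scales:  float array of shape [nTokens] (outTensor2)
    """
    # Broadcast each token's scale across the hidden dimension.
    return quant_y.astype(np.float32) * scales.reshape(-1, 1)
```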
```python
import sys
import os
import unittest
import torch
import torch_npu
import numpy as np

sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import operation_test  # NOQA: E402

OP_NAME = "SwigluQuantOperation"
PARAM = {"": 1}


class TestSwigluQuantOperation(operation_test.OperationTest):
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def do_swiglu(self, a, b):
        # SwiGLU: silu(a) * b, where silu(a) = a * sigmoid(a)
        sigmoid_mul_a = self.sigmoid(a) * a
        swiglu_y = sigmoid_mul_a * b
        return swiglu_y

    def do_quant(self, swiglu_y):
        y_tmp = np.array(swiglu_y)
        # Dynamic quantization derives each row's scaling factor from that row's max absolute value.
        y_max = np.amax(np.abs(y_tmp), axis=1)
        dynamic_scale_tmp = 127 / y_max
        # Reshape the 1-D vector to 2-D for broadcasting, e.g. (256,) -> (256, 1).
        dynamic_scale = dynamic_scale_tmp.reshape(-1, 1)
        y_tmp = y_tmp * dynamic_scale
        quant_y_tmp = np.round(y_tmp)  # scale, then round to the nearest integer
        quant_y = quant_y_tmp.astype(np.int8)  # cast to int8
        dynamic_scale_output = np.array([])
        dynamic_scale = 1 / dynamic_scale  # invert so the output scale dequantizes: y ~= quant_y * scale
        dynamic_scale_output = np.append(dynamic_scale_output, dynamic_scale)
        return quant_y, dynamic_scale_output

    def SwiGluQuantGolden(self, x_golden):
        a, b = np.split(x_golden, 2, axis=1)
        swiglu_y = self.do_swiglu(a, b)
        quant_y, dynamic_scale = self.do_quant(swiglu_y)
        return torch.from_numpy(quant_y).npu(), torch.from_numpy(dynamic_scale).npu()

    def golden_calc(self, in_tensors):
        golden_res = self.SwiGluQuantGolden(in_tensors[0].cpu())
        return [golden_res[0], golden_res[1]]

    def golden_compare(self, out_tensor, golden_out_tensor):
        actual_output = out_tensor.cpu()
        golden_output = golden_out_tensor.cpu()
        res = False
        if actual_output.dtype == torch.int8:
            # Quantized values may differ from the golden result by at most 1.
            diff = np.abs(actual_output - golden_output)
            res = not (diff > 1).nonzero().size(0) > 0
        elif actual_output.dtype == torch.float32:
            res = np.allclose(actual_output, golden_output, rtol=0.0001, atol=0.0001)
        return res

    def test_swi_glu_quant_case0(self):
        input_token_num = 128
        input_hidden_size = 4096
        shape = (input_token_num, input_hidden_size)
        x = torch.empty(shape).uniform_(0, 1).to(torch.float16)
        self.execute(OP_NAME, PARAM, [x.npu().half()])


if __name__ == '__main__':
    unittest.main()
```
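Running the test requires an Ascend NPU environment with torch_npu installed and the operation_test harness on the Python path. Note the comparison strategy in golden_compare: the int8 output may deviate from the NumPy golden result by at most 1, which tolerates rounding differences between the device and NumPy, while the float32 per-token scales must match within rtol = atol = 1e-4.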