对输入x进行分组计算,group_index表示每个group分组的Tokens数,每组使用不同的量化scale(如activation_scale、quant_scale)。当group_index=1或None时,表示共享一个scale。
举例说明:假设x.shape=[128, 2H],group_index=[2, 0, 3],表示有3个group,对应的scale维度为[3, 2H]。每个group数据使用不同的scale分别做dequant反量化+swiglu+quant量化操作。
torch_npu.npu_dequant_swiglu_quant(Tensor x, *, Tensor? weight_scale=None, Tensor? activation_scale=None, Tensor? bias=None, Tensor? quant_scale=None, Tensor? quant_offset=None, Tensor? group_index=None, bool activate_left=False, int quant_mode=0) -> (Tensor, Tensor)
Tensor中shape使用的变量说明:
import os
import shutil
import unittest

import torch
import torch_npu
from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.common_utils import SupportedDevices


class TestNPUDequantSwigluQuant(TestCase):
    def test_npu_dequant_swiglu_quant(self, device="npu"):
        """Eager-mode smoke test for npu_dequant_swiglu_quant.

        Builds int32 quantized activations plus dequant/quant scales and runs
        the fused dequant + swiglu + quant op with a single shared scale
        (group_index=None).
        """
        x_shape = [4608, 2048]
        # int32 input stands in for already-quantized activations.
        x = torch.randint(-10, 10, x_shape, dtype=torch.int32).npu()
        # Per-channel dequant scale over the full hidden dim (2H).
        weight_scale = torch.randn(x_shape[1], dtype=torch.float32).npu()
        # Per-token dequant scale: one value per row.
        activation_scale = torch.randn((x_shape[0], 1), dtype=torch.float32).npu()
        bias = None
        # Quant scale covers half the hidden dim — swiglu halves 2H to H.
        quant_scale = torch.randn((1, x_shape[1] // 2), dtype=torch.float32).npu()
        quant_offset = None
        # None => all tokens share one scale group.
        group_index = None
        y_npu, scale_npu = torch_npu.npu_dequant_swiglu_quant(
            x,
            weight_scale=weight_scale,
            activation_scale=activation_scale,
            bias=bias,
            quant_scale=quant_scale,
            quant_offset=quant_offset,
            group_index=group_index,
            activate_left=False,
            # NOTE(review): quant_mode=1 presumably selects dynamic
            # quantization — confirm against the op's parameter docs.
            quant_mode=1,
        )


if __name__ == "__main__":
    run_tests()
import os
import shutil
import unittest

import torch
import torch_npu
from torch_npu.testing.testcase import TestCase, run_tests
from torch_npu.testing.common_utils import SupportedDevices
from torchair.configs.compiler_config import CompilerConfig
import torchair as tng

# Graph-mode compilation config for the torchair NPU backend.
config = CompilerConfig()
config.experimental_config.frozen_parameter = True
config.experimental_config.tiling_schedule_optimize = True
npu_backend = tng.get_npu_backend(compiler_config=config)


class TestNPUDequantSwigluQuant(TestCase):
    def test_npu_dequant_swiglu_quant(self, device="npu"):
        """Graph-mode smoke test for npu_dequant_swiglu_quant.

        Same inputs as the eager test, but the op is compiled with
        torch.compile against the torchair NPU backend (dynamic shapes,
        full-graph capture) before execution.
        """
        x_shape = [4608, 2048]
        # int32 input stands in for already-quantized activations.
        x = torch.randint(-10, 10, x_shape, dtype=torch.int32).npu()
        # Per-channel dequant scale over the full hidden dim (2H).
        weight_scale = torch.randn(x_shape[1], dtype=torch.float32).npu()
        # Per-token dequant scale: one value per row.
        activation_scale = torch.randn((x_shape[0], 1), dtype=torch.float32).npu()
        bias = None
        # Quant scale covers half the hidden dim — swiglu halves 2H to H.
        quant_scale = torch.randn((1, x_shape[1] // 2), dtype=torch.float32).npu()
        quant_offset = None
        # None => all tokens share one scale group.
        group_index = None
        graph_model = torch.compile(
            torch_npu.npu_dequant_swiglu_quant,
            backend=npu_backend,
            dynamic=True,
            fullgraph=True,
        )
        y_npu, scale_npu = graph_model(
            x,
            weight_scale=weight_scale,
            activation_scale=activation_scale,
            bias=bias,
            quant_scale=quant_scale,
            quant_offset=quant_offset,
            group_index=group_index,
            activate_left=False,
            # NOTE(review): quant_mode=1 presumably selects dynamic
            # quantization — confirm against the op's parameter docs.
            quant_mode=1,
        )


if __name__ == "__main__":
    run_tests()