昇腾社区首页
中文
注册
开发者
下载

Write Combining性能优化

约束限制

  • 架构限制:Arm架构,昇腾+鲲鹏的协同计算架构
  • 操作系统:openEuler 22.03 LTS SP4
  • Libvirt:使用Libvirt v9.4及以上
  • QEMU:使用QEMU v8.1及以上
  • HDK:Ascend HDK 24.1.1及以上版本
  • 适用的硬件产品:Atlas 900 A3 SuperPoD 超节点、Atlas 800T A2 训练服务器

原理介绍

WC(Write Combining,写合并)是一种提升主机向非缓存PCIe设备写入性能的技术。写入WC区域的数据会暂存于64字节缓冲区,待缓冲区填满或触发刷新事件(如写入地址超出当前缓冲区范围)时,执行合并写入,显著提升总线利用率,实现更高吞吐量。该特性在当前约束限制下的物理机上默认开启,本章节主要指导用户如何开启虚拟机内的WC特性,用户需要修改物理机内核代码、QEMU代码后重新编译安装。

本节代码行前带有“##”标识的代码行是需要删除的代码,加粗样式的代码是需要用户增加的代码。

修改物理机内核代码

  1. 执行如下命令安装必要的工具软件和库文件。
    yum install -y kernel-source
    yum install -y rpm-build openssl-devel bc rsync gcc gcc-c++ flex bison m4 elfutils-libelf-devel
  2. 确保工具软件和库文件安装成功后,进入内核源码目录,其中version由实际下载得到的内核版本确定。
    cd /usr/src/linux-{version}.oe2203sp4.aarch64
  3. 在内核代码中加入开启虚拟机WC特性相关的代码。
    1. 执行命令,vi ./include/linux/mm.h,修改“mm.h”文件。
      #ifndef VM_GROWSUP
      # define VM_GROWSUP     VM_NONE
      #endif
      
      /*
       * This flag is used to connect VFIO to arch specific KVM code. It
       * indicates that the memory under this VMA is safe for use with any
       * non-cachable memory type inside KVM. Some VFIO devices, on some
       * platforms, are thought to be unsafe and can cause machine crashes
       * if KVM does not lock down the memory type.
       */
      #ifdef CONFIG_64BIT
      #define VM_ALLOW_ANY_UNCACHED_BIT	39
      #define VM_ALLOW_ANY_UNCACHED		BIT(VM_ALLOW_ANY_UNCACHED_BIT)
      #else
      #define VM_ALLOW_ANY_UNCACHED		VM_NONE
      #endif 
      
      /* Bits set in the VMA until the stack is in its final location */
      #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
    2. 执行命令,vi ./include/uapi/linux/vfio.h,修改“vfio.h”文件
      /*
       * Per-region information reported for a VFIO device region.
       *
       * VFIO_REGION_INFO_FLAG_NORMAL_NC (bit 4) is the flag added for the
       * write-combining feature: vfio_pci.c sets it in info.flags whenever
       * VM_ALLOW_ANY_UNCACHED is not VM_NONE.  The QEMU copy of this header
       * must define the flag with the same bit value.
       */
      struct vfio_region_info {
      	__u32	argsz;
      	__u32	flags;
      #define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
      #define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
      #define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
      #define VFIO_REGION_INFO_FLAG_CAPS	(1 << 3) /* Info supports caps */
      #define VFIO_REGION_INFO_FLAG_NORMAL_NC (1 << 4) /* Region supports normal NC */
      	__u32	index;		/* Region index */
      	__u32	cap_offset;	/* Offset within info struct of first cap */
      	__u64	size;		/* Region size (bytes) */
      	__u64	offset;		/* Region offset from start of device fd */
      };
    3. 执行命令,vi ./arch/arm64/include/asm/kvm_pgtable.h,修改“kvm_pgtable.h”文件
      /**
       * enum kvm_pgtable_prot - Page-table permissions and attributes.
       * @KVM_PGTABLE_PROT_X:		Execute permission.
       * @KVM_PGTABLE_PROT_W:		Write permission.
       * @KVM_PGTABLE_PROT_R:		Read permission.
       * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
       * @KVM_PGTABLE_PROT_NORMAL_NC:	Normal noncacheable attributes (the entry
       *				added for the write-combining feature).
       * @KVM_PGTABLE_PROT_PBHA0:	Page-Based Hardware Attribute 0.
       * @KVM_PGTABLE_PROT_PBHA1:	Page-Based Hardware Attribute 1.
       * @KVM_PGTABLE_PROT_PBHA2:	Page-Based Hardware Attribute 2.
       * @KVM_PGTABLE_PROT_PBHA3:	Page-Based Hardware Attribute 3.
       */
      enum kvm_pgtable_prot {
      	KVM_PGTABLE_PROT_X			= BIT(0),
      	KVM_PGTABLE_PROT_W			= BIT(1),
      	KVM_PGTABLE_PROT_R			= BIT(2),
      	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
              KVM_PGTABLE_PROT_NORMAL_NC	        = BIT(4),        
      	KVM_PGTABLE_PROT_PBHA0			= BIT(59),
      	KVM_PGTABLE_PROT_PBHA1			= BIT(60),
      	KVM_PGTABLE_PROT_PBHA2			= BIT(61),
      	KVM_PGTABLE_PROT_PBHA3			= BIT(62),
      };
    4. 执行命令,vi ./arch/arm64/include/asm/memory.h,修改“memory.h”文件
      /*
       * Memory types for Stage-2 translation
       */
      #define MT_S2_NORMAL		0xf
      #define MT_S2_NORMAL_NC	0x5
      #define MT_S2_DEVICE_nGnRE	0x1
      
      /*
       * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
       * Stage-2 enforces Normal-WB and Device-nGnRE
       */
      #define MT_S2_FWB_NORMAL	6
      #define MT_S2_FWB_NORMAL_NC	5
      #define MT_S2_FWB_DEVICE_nGnRE	1
      
      #ifdef CONFIG_ARM64_4K_PAGES
    5. 执行命令,vi ./arch/arm64/kvm/hyp/pgtable.c,修改“pgtable.c”文件
      代码中标识“##”的代码行是需要删除的。
       static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
      				    struct stage2_map_data *data)
       {
         ##bool device = prot & KVM_PGTABLE_PROT_DEVICE;
         ##kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
         ##           PAGE_S2_MEMATTR(NORMAL);
         
          kvm_pte_t attr;
          u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
          switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) {
                  case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
                      return -EINVAL;
                  case KVM_PGTABLE_PROT_DEVICE:
                      if (prot & KVM_PGTABLE_PROT_X) {
                          return -EINVAL;
                  }
                      attr = PAGE_S2_MEMATTR(DEVICE_nGnRE);
                      break;
                  case KVM_PGTABLE_PROT_NORMAL_NC:
                      if (prot & KVM_PGTABLE_PROT_X) {
                          return -EINVAL;
                  }
                      attr = PAGE_S2_MEMATTR(NORMAL_NC);
                      break;
                  default:
                      attr = PAGE_S2_MEMATTR(NORMAL);
                      break;
                 }
          
          if (!(prot & KVM_PGTABLE_PROT_X))
              attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
         ##else if (device)
             ##return -EINVAL;
    6. 执行命令,vi ./drivers/vfio/pci/vfio_pci.c,修改“vfio_pci.c”文件。
                  if (vdev->bar_mmap_supported[info.index]) {
                      info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
                      if (info.index == vdev->msix_bar) {
                          ret = msix_mmappable_cap(vdev, &caps);
                          if (ret)
                              return ret;
                      }
                  }
                  
                  if (VM_ALLOW_ANY_UNCACHED != VM_NONE) {
                       info.flags |= VFIO_REGION_INFO_FLAG_NORMAL_NC;
                      }
                  
                  break;
              case VFIO_PCI_ROM_REGION_INDEX:
              {
                  void __iomem *io;
                  size_t size;
                  u16 cmd;
                  info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                  info.flags = 0;
                  /* Report the BAR size, not the ROM size */
                  info.size = pci_resource_len(pdev, info.index);

      修改“vfio_pci.c”文件中vma->vm_flags,做如下修改。代码中标识“##”的代码行是需要删除的。

              /*
      	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
      	 * change vm_flags within the fault handler.  Set them now.
      
      	 *
      	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
      	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
      	 * rather than DEVICE_nGnRE, which allows guest mappings
      	 * supporting write-combining attributes (WC). ARM does not
      	 * architecturally guarantee this is safe, and indeed some MMIO
      	 * regions like the GICv2 VCPU interface can trigger uncontained
      	 * faults if Normal-NC is used.
      	 *
      	 * To safely use VFIO in KVM the platform must guarantee full
      	 * safety in the guest where no action taken against a MMIO
      	 * mapping can trigger an uncontained failure. The assumption is
      	 * that most VFIO PCI platforms support this for both mapping types,
      	 * at least in common flows, based on some expectations of how
      	 * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
      	 * the VMA flags.
      
      	 */
      
            ##vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
      	vma->vm_flags |= VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
      
      	vma->vm_ops = &vfio_pci_mmap_ops;
      
      	return 0;
           }
    7. 执行命令,vi ./arch/arm64/kvm/mmu.c,修改“mmu.c”文件。代码中标识“##”的代码行是需要删除的。
      static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
      			  struct kvm_memory_slot *memslot, unsigned long hva,
      			  unsigned long fault_status)
      {
      	int ret = 0;
      	bool write_fault, writable, force_pte = false;
      	bool exec_fault;
            ##bool device = false;
      	bool device = false, vfio_allow_any_uc = false;
      	unsigned long mmu_seq;
      	struct kvm *kvm = vcpu->kvm;
      
      	...
      
      	if (exec_fault) {
      		prot |= KVM_PGTABLE_PROT_X;
      		invalidate_icache_guest_page(pfn, vma_pagesize);
      	}
      
            ##if (device)
      	      ##prot |= KVM_PGTABLE_PROT_DEVICE;
            ##else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
      	      ##prot |= KVM_PGTABLE_PROT_X;
      
              vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
      
      	if (device) {
                  if (vfio_allow_any_uc) {
                      prot |= KVM_PGTABLE_PROT_NORMAL_NC;
                } else {
                      prot |= KVM_PGTABLE_PROT_DEVICE;
                  }
                } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
                      prot |= KVM_PGTABLE_PROT_X;
                } 
      
      	/*
      	 * Under the premise of getting a FSC_PERM fault, we just need to relax
      	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
      	 * kvm_pgtable_stage2_map() should be called to change block size.
      	 */
    8. 执行命令vi Makefile,修改版本号。修改其中的SUBLEVEL,将SUBLEVEL_VAL改为未使用过的非0数值即可,例如1、2、3等。
      # SPDX-License-Identifier: GPL-2.0
      VERSION = 5
      PATCHLEVEL = 10
      SUBLEVEL = SUBLEVEL_VAL
      EXTRAVERSION =
    9. 编译并且安装修改后的内核,rpm安装的时候显示“Error! echo”为正常现象。
      # 安装必要的依赖
      yum install dwarves
      
      make openeuler_defconfig
      
      # 其中64表示使用64个CPU核并行编译,可以根据实际环境修改
      make binrpm-pkg -j64
      
      #查看编译完成的内核rpm包,找到自定义的SUBLEVEL_VAL版本的包
      ll /root/rpmbuild/RPMS/aarch64
      
      # 根据查询到的rpm包进行安装,kernel-headers包无需安装
      rpm -ivh /root/rpmbuild/RPMS/aarch64/kernel-{version}.aarch64.rpm
      
      # 查看是否安装成功
      rpm -qa | grep kernel

      如果构建失败,可能是缺失部分依赖,根据报错信息安装依赖。

    10. 安装内核成功后,可以将其设置为默认启动内核并重启使其生效,或者在iBMC页面手动重启,并在内核选择界面选择编译修改后的内核。
      # 查看当前服务器的所有内核
      grep ^menuentry /boot/grub2/grub.cfg | cut -d "'" -f2
      
      # 将上面查询到的已安装内核设置为默认启动内核,将version替换为实际安装的修改后的内核版本
      sudo grub2-set-default "openEuler {version}"
      
      # 查看当前默认启动内核,确定是否替换成功
      grub2-editenv list

修改QEMU代码

  1. 执行如下命令安装必要的工具软件和库文件。
    yum install spice-server spice-server-devel -y
    yum install numactl-devel flex bison -y
  2. 单击QEMU网站下载8.1.0版本的QEMU源码包,并将下载的源码包上传至环境中。
  3. 进入源码包所在目录,执行如下命令,修改压缩包权限。
    chmod +x ./qemu-8.1.0.tar.xz
  4. 执行如下命令,解压源码包并进入QEMU源码目录。
    tar -xf ./qemu-8.1.0.tar.xz
    cd qemu-8.1.0/
  5. 修改QEMU源码。
    1. 执行命令,vi ./hw/arm/virt.c,修改“virt.c”文件,将PCIE_MMIO由512扩展为2048以容纳更多的PCIE设备,512会导致直通场景部分卡设备初始化失败。
      /*
       * Sizes of the "high" memory-map regions of the virt machine; every base
       * address is left as 0x0 in this table.
       */
      static MemMapEntry extended_memmap[] = {
          /* Additional 64 MB redist region (can contain up to 512 redistributors) */
          [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
          [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
          /*
           * Second PCIe window.
           * MODIFIED HERE: enlarged from 512 GiB to 2048 GiB so that more
           * passthrough PCIe devices fit; with 512 GiB some cards fail to
           * initialise in the passthrough scenario.
           */
          [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 2048 * GiB },
      };
    2. 执行命令,vi ./hw/vfio/pci-quirks.c,修改“pci-quirks.c”文件。
      #include "hw/nvram/fw_cfg.h"
      #include "pci.h"
      #include "trace.h"
      
      
      /* PCI vendor / device IDs matched by the Ascend BAR quirk probes. */
      #define PCI_VENDOR_ID_HUAWEI      0x19e5
      #define PCI_DEVICE_ID_ASCEND910   0xd801
      #define PCI_DEVICE_ID_ASCEND910A2  0xd802
      #define PCI_DEVICE_ID_ASCEND910A3  0xd803
      #define PCI_DEVICE_ID_ASCEND310P   0xd500
      #define PCI_DEVICE_ID_ASCEND310B   0xd105
      #define PCI_DEVICE_ID_ASCEND310   0xd100
      /* Subsystem-ID ranges that select the 1P or 2P layout of a 310P card. */
      #define PCI_SUB_DEVICE_ID_ASCEND310P_1P_MIN  0x100
      #define PCI_SUB_DEVICE_ID_ASCEND310P_1P_MAX  0x10f
      #define PCI_SUB_DEVICE_ID_ASCEND310P_2P_MIN  0x110
      #define PCI_SUB_DEVICE_ID_ASCEND310P_2P_MAX  0x11f
      /* Size and in-BAR offset of the intercepted xloader window (910). */
      #define ASCEND910_XLOADER_SIZE    	4
      #define ASCEND910_XLOADER_OFFSET  	0x80400
      
      /* 910 A2: intercepted xloader window and synthesized feature window. */
      #define ASCEND910A2_XLOADER_SIZE   4
      #define ASCEND910A2_XLOADER_OFFSET    0x18208430
      #define ASCEND910A2_FEATURE_SIZE    	4
      #define ASCEND910A2_FEATURE_OFFSET  	0x182085f8
      
      /* 910 A3: intercepted xloader window and synthesized feature window. */
      #define ASCEND910A3_XLOADER_SIZE   4
      #define ASCEND910A3_XLOADER_OFFSET    0x18208430
      #define ASCEND910A3_FEATURE_SIZE   4
      #define ASCEND910A3_FEATURE_OFFSET    0x182085f8
      
      /*
       * 310P: ASCEND310P_2P_BASE is added to the xloader offset to locate the
       * second intercepted window on 2P cards; the *_DEVNUM values are the
       * number of quirk regions created for each layout.
       */
      #define ASCEND310P_2P_BASE         (128 * 1024 * 1024)
      #define ASCEND310P_1P_DEVNUM       1
      #define ASCEND310P_2P_DEVNUM       2
      #define ASCEND310P_XLOADER_SIZE    4
      #define ASCEND310P_XLOADER_OFFSET  0x100430
      /* 310B and 310: intercepted xloader windows. */
      #define ASCEND310B_XLOADER_SIZE    4
      #define ASCEND310B_XLOADER_OFFSET  0x4430
      #define ASCEND310_XLOADER_SIZE    4
      #define ASCEND310_XLOADER_OFFSET  0x400
      
      /*
       * Kind of intercepted window, stored in VFIOAscendBarQuirk.type and used
       * by vfio_ascend_quirk_read() to choose how a read is served.
       */
      enum {
          VFIO_ASCEND_TYPE_ERR            = 0, /* unset; reads are logged and return 0 */
          VFIO_ASCEND_TYPE_XLOADER        = 1, /* reads are forwarded to the device BAR */
          VFIO_ASCEND_TYPE_FEATURE        = 2, /* reads return the stored flags value */
      };
      
      /*
       * Access widths (in bytes) accepted for FEATURE-window reads.
       * NOTE(review): "QIURK" is a misspelling of "QUIRK"; a rename has to be
       * done together with the users in vfio_ascend_quirk_read().
       */
      enum {
          QIURK_READ_SIZE_1_BYTE          = 1,
          QIURK_READ_SIZE_2_BYTE          = 2,
          QIURK_READ_SIZE_4_BYTE          = 4,
      };
      
      /* Per-window state handed to the Ascend quirk read/write callbacks as 'opaque'. */
      typedef struct VFIOAscendBarQuirk {
          struct VFIOPCIDevice *vdev; /* device that owns the intercepted BAR */
          pcibus_t offset;            /* offset of the window inside the BAR */
          unsigned int flags;         /* value served by VFIO_ASCEND_TYPE_FEATURE reads */
          int type;                   /* one of VFIO_ASCEND_TYPE_* */
          uint8_t bar;                /* index into vdev->bars[] */
          MemoryRegion *mem;          /* not assigned by the code shown here */
      } VFIOAscendBarQuirk;
      
      /*
       * Read callback for the intercepted Ascend BAR windows.
       *
       * @opaque: the VFIOAscendBarQuirk describing the window
       * @addr:   byte offset of the access inside the window
       * @size:   access width in bytes
       *
       * XLOADER windows forward the read to the real device BAR.  FEATURE
       * windows return the requested bytes of quirk->flags (only 1-, 2- and
       * 4-byte accesses are served).  Any other type, or an unsupported
       * FEATURE access width, is logged and reads as 0.
       */
      static uint64_t vfio_ascend_quirk_read(void *opaque, hwaddr addr, unsigned size)
      {
          uint64_t off_value;
          const unsigned byte_size = 8;
          unsigned start = addr * byte_size, length_size = size * byte_size;
          VFIOAscendBarQuirk *quirk = opaque;
          VFIOPCIDevice *vdev = quirk->vdev;

          qemu_log("read RO region! addr=0x%" HWADDR_PRIx ", size=%d\n",
                  addr + quirk->offset, size);

          switch (quirk->type) {
          case VFIO_ASCEND_TYPE_XLOADER:
              return vfio_region_read(&vdev->bars[quirk->bar].region,
                                      addr + quirk->offset, size);
          case VFIO_ASCEND_TYPE_FEATURE:
              switch (size) {
              case QIURK_READ_SIZE_1_BYTE:
              case QIURK_READ_SIZE_2_BYTE:
              case QIURK_READ_SIZE_4_BYTE:
                  /*
                   * Build the mask in 64 bits: length_size can be 32, and
                   * "1UL << 32" is undefined where long is only 32 bits wide.
                   */
                  off_value = ((uint64_t)quirk->flags >> start) &
                              ((1ULL << length_size) - 1);
                  /*
                   * NOTE(review): the ops are declared DEVICE_LITTLE_ENDIAN, so
                   * this extra conversion looks redundant (a no-op on
                   * little-endian hosts) - confirm before removing.
                   */
                  return le64_to_cpu(off_value);
              default:
                  qemu_log("Ascend quirk unsupported read size, %d bytes\n", size);
                  return 0;
              }
          default:
              qemu_log("read RO region error type! addr=0x%" HWADDR_PRIx ", size=%d\n",
                       addr + quirk->offset, size);
              return 0;
          }
      }
      
      /*
       * Write callback for the intercepted Ascend BAR windows: the write is
       * only logged and never reaches the device.
       */
      static void vfio_ascend_quirk_write(void *opaque, hwaddr addr, uint64_t data, unsigned size)
      {
          VFIOAscendBarQuirk *window = opaque;

          /* Report the access using its offset inside the BAR. */
          qemu_log("modifying RO region is not allowed! addr=0x%"
                   HWADDR_PRIx ", data=0x%" PRIx64 ", size=%d\n",
                   addr + window->offset,
                   data,
                   size);
      }
      
      /*
       * Return the flags word of VFIO region @nr of @vdev, or 0 (after logging)
       * when the region info cannot be obtained.
       */
      static unsigned int vfio_ascend_get_region_flags(VFIOPCIDevice *vdev, int nr)
      {
          struct vfio_region_info *info;
          unsigned int region_flags;

          if (vfio_get_region_info(&vdev->vbasedev, nr, &info) != 0) {
              qemu_log("ascend failed to get region %d info\n", nr);
              return 0;
          }

          /* Copy the flags out before releasing the info buffer. */
          region_flags = info->flags;
          g_free(info);

          return region_flags;
      }
      
      /*
       * Build a bitmap with bit N set for every region index N, from
       * VFIO_PCI_BAR0_REGION_INDEX up to (not including)
       * VFIO_PCI_ROM_REGION_INDEX, whose region flags carry
       * VFIO_REGION_INFO_FLAG_NORMAL_NC.
       */
      static unsigned int vfio_ascend_get_all_regions_flags(VFIOPCIDevice *vdev)
      {
          unsigned int nc_mask = 0;
          int bar;

          for (bar = VFIO_PCI_BAR0_REGION_INDEX; bar < VFIO_PCI_ROM_REGION_INDEX; bar++) {
              unsigned int bar_flags = vfio_ascend_get_region_flags(vdev, bar);

              if (!(bar_flags & VFIO_REGION_INFO_FLAG_NORMAL_NC)) {
                  continue;
              }
              nc_mask |= (1 << bar);
          }

          return nc_mask;
      }
      
      /*
       * Fill entry @index of @bar_quirk with the given device, window offset,
       * flags value, window type and BAR number.  The entry's 'mem' field is
       * left untouched.
       */
      static void vfio_ascend_set_bar_quirk_array(VFIOAscendBarQuirk *bar_quirk,
                                                  VFIOPCIDevice *vdev, int index,
                                                  pcibus_t offset, unsigned int flags,
                                                  int type, uint8_t bar)
      {
          VFIOAscendBarQuirk *entry = bar_quirk + index;

          entry->vdev = vdev;
          entry->bar = bar;
          entry->type = type;
          entry->flags = flags;
          entry->offset = offset;
      }
      
      /*
       * Callbacks shared by every intercepted Ascend window: reads go through
       * vfio_ascend_quirk_read(), writes through vfio_ascend_quirk_write().
       */
      static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = {
          .read = vfio_ascend_quirk_read,
          .write = vfio_ascend_quirk_write,
          .endianness = DEVICE_LITTLE_ENDIAN,
      };
      
      /*
       * Install the two BAR2 quirk windows of an Ascend 910 A3 device:
       * entry 0 intercepts the xloader register, entry 1 exposes the bitmap
       * computed by vfio_ascend_get_all_regions_flags().  Does nothing unless
       * @vdev is a Huawei 910 A3 and @nr is 2.
       */
      static void vfio_probe_ascend910a3_bar2_quirk(VFIOPCIDevice *vdev, int nr)
      {
          const int quirk_region_num = 2; /* XLOADER and FEATURE */
          VFIOAscendBarQuirk *windows;
          VFIOQuirk *quirk;
          unsigned int feature_flags;

          if (nr != 2) {
              return;
          }
          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI) {
              return;
          }
          if (vdev->device_id != PCI_DEVICE_ID_ASCEND910A3) {
              return;
          }

          quirk = vfio_quirk_alloc(quirk_region_num);
          windows = g_new0(VFIOAscendBarQuirk, quirk->nr_mem);
          quirk->data = windows;

          /*
           * Window 0: intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating.
           */
          vfio_ascend_set_bar_quirk_array(windows, vdev, 0,
                                          ASCEND910A3_XLOADER_OFFSET, 0,
                                          VFIO_ASCEND_TYPE_XLOADER, nr);
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &windows[0],
                                "vfio-ascend910a3-bar2-intercept-regs-quirk",
                                ASCEND910A3_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              windows[0].offset,
                                              &quirk->mem[0], 1);

          /* Window 1: 910A3 FEATURE */
          feature_flags = vfio_ascend_get_all_regions_flags(vdev);
          vfio_ascend_set_bar_quirk_array(windows, vdev, 1,
                                          ASCEND910A3_FEATURE_OFFSET,
                                          feature_flags,
                                          VFIO_ASCEND_TYPE_FEATURE, nr);
          memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &windows[1],
                                "vfio-ascend910a3-bar2-feature-regs-quirk",
                                ASCEND910A3_FEATURE_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              windows[1].offset,
                                              &quirk->mem[1], 1);

          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      
      /*
       * Install the two BAR2 quirk windows of an Ascend 910 A2 device:
       * entry 0 intercepts the xloader register, entry 1 exposes the bitmap
       * computed by vfio_ascend_get_all_regions_flags().  Does nothing unless
       * @vdev is a Huawei 910 A2 and @nr is 2.
       */
      static void vfio_probe_ascend910a2_bar2_quirk(VFIOPCIDevice *vdev, int nr)
      {
          const int quirk_region_num = 2; /* XLOADER and FEATURE */
          VFIOAscendBarQuirk *windows;
          VFIOQuirk *quirk;
          unsigned int feature_flags;

          if (nr != 2) {
              return;
          }
          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI) {
              return;
          }
          if (vdev->device_id != PCI_DEVICE_ID_ASCEND910A2) {
              return;
          }

          quirk = vfio_quirk_alloc(quirk_region_num);
          windows = g_new0(VFIOAscendBarQuirk, quirk->nr_mem);
          quirk->data = windows;

          /*
           * Window 0: intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating.
           */
          vfio_ascend_set_bar_quirk_array(windows, vdev, 0,
                                          ASCEND910A2_XLOADER_OFFSET, 0,
                                          VFIO_ASCEND_TYPE_XLOADER, nr);
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &windows[0],
                                "vfio-ascend910_a2-bar2-intercept-regs-quirk",
                                ASCEND910A2_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              windows[0].offset,
                                              &quirk->mem[0], 1);

          /* Window 1: FEATURE */
          feature_flags = vfio_ascend_get_all_regions_flags(vdev);
          vfio_ascend_set_bar_quirk_array(windows, vdev, 1,
                                          ASCEND910A2_FEATURE_OFFSET,
                                          feature_flags,
                                          VFIO_ASCEND_TYPE_FEATURE, nr);
          memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &windows[1],
                                "vfio-ascend910b-bar2-feature-regs-quirk",
                                ASCEND910A2_FEATURE_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              windows[1].offset,
                                              &quirk->mem[1], 1);

          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      
      /*
       * Install the BAR0 xloader quirk window of an Ascend 910 device.
       * Does nothing unless @vdev is a Huawei 910 and @nr is 0.
       */
      static void vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr)
      {
          VFIOQuirk *quirk;
          VFIOAscendBarQuirk *bar0_quirk;

          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 ||
              vdev->device_id != PCI_DEVICE_ID_ASCEND910) {
              return;
          }

          quirk = g_malloc0(sizeof(*quirk));
          quirk->nr_mem = 1;
          quirk->mem = g_new0(MemoryRegion, quirk->nr_mem);
          bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem);
          bar0_quirk[0].vdev = vdev;
          bar0_quirk[0].offset = ASCEND910_XLOADER_OFFSET;
          bar0_quirk[0].bar = nr;
          /*
           * Mark the window as XLOADER.  Left at 0 (VFIO_ASCEND_TYPE_ERR), the
           * shared read handler takes its error branch and returns 0 instead of
           * forwarding the read to the device.
           */
          bar0_quirk[0].type = VFIO_ASCEND_TYPE_XLOADER;

          /*
           * intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating
           */
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &bar0_quirk[0],
                                "vfio-ascend910-bar0-intercept-regs-quirk",
                                ASCEND910_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              bar0_quirk[0].offset,
                                              &quirk->mem[0], 1);
          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      
      /*
       * Install the BAR2 xloader quirk window(s) of an Ascend 310P device.
       * The PCI subsystem ID selects the 1P layout (one window) or the 2P
       * layout (a second window at ASCEND310P_2P_BASE + xloader offset).
       * Does nothing unless @vdev is a Huawei 310P with a recognised
       * subsystem ID and @nr is 2.
       */
      static void vfio_probe_ascend310p_bar2_quirk(VFIOPCIDevice *vdev, int nr)
      {
          VFIOQuirk *quirk;
          VFIOAscendBarQuirk *bar2_quirk;
          int sub_device_id;
          int devnum = 0;

          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 ||
              vdev->device_id != PCI_DEVICE_ID_ASCEND310P) {
              return;
          }

          sub_device_id = pci_get_word(vdev->pdev.config + PCI_SUBSYSTEM_ID);
          if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND310P_1P_MIN &&
              sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND310P_1P_MAX) {
              devnum = ASCEND310P_1P_DEVNUM;
          } else if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND310P_2P_MIN &&
                     sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND310P_2P_MAX) {
              devnum = ASCEND310P_2P_DEVNUM;
          }
          /* Unknown subsystem ID: leave the BAR untouched. */
          if (devnum != ASCEND310P_1P_DEVNUM && devnum != ASCEND310P_2P_DEVNUM) {
              return;
          }

          quirk = g_malloc0(sizeof(*quirk));
          quirk->nr_mem = devnum;
          quirk->mem = g_new0(MemoryRegion, quirk->nr_mem);
          bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem);
          bar2_quirk[0].vdev = vdev;
          bar2_quirk[0].offset = ASCEND310P_XLOADER_OFFSET;
          bar2_quirk[0].bar = nr;
          /*
           * Mark the window as XLOADER.  Left at 0 (VFIO_ASCEND_TYPE_ERR), the
           * shared read handler takes its error branch and returns 0 instead of
           * forwarding the read to the device.
           */
          bar2_quirk[0].type = VFIO_ASCEND_TYPE_XLOADER;

          /*
           * intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating
           */
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &bar2_quirk[0],
                                "vfio-ascend310p-bar2-1p-intercept-regs-quirk",
                                ASCEND310P_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              bar2_quirk[0].offset,
                                              &quirk->mem[0], 1);

          if (devnum == ASCEND310P_2P_DEVNUM) {
              bar2_quirk[1].vdev = vdev;
              bar2_quirk[1].offset = (ASCEND310P_2P_BASE + ASCEND310P_XLOADER_OFFSET);
              bar2_quirk[1].bar = nr;
              /* Same reason as for window 0: reads must be forwarded. */
              bar2_quirk[1].type = VFIO_ASCEND_TYPE_XLOADER;
              memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                                    &vfio_ascend_intercept_regs_quirk,
                                    &bar2_quirk[1],
                                    "vfio-ascend310p-bar2-2p-intercept-regs-quirk",
                                    ASCEND310P_XLOADER_SIZE);
              memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                                  bar2_quirk[1].offset,
                                                  &quirk->mem[1], 1);
          }
          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      
      /*
       * Install the BAR2 xloader quirk window of an Ascend 310B device.
       * Does nothing unless @vdev is a Huawei 310B and @nr is 2.
       */
      static void vfio_probe_ascend310b_bar2_quirk(VFIOPCIDevice *vdev, int nr)
      {
          VFIOQuirk *quirk;
          VFIOAscendBarQuirk *bar2_quirk;

          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2
              || vdev->device_id != PCI_DEVICE_ID_ASCEND310B) {
              return;
          }

          quirk = g_malloc0(sizeof(*quirk));
          quirk->nr_mem = 1;
          quirk->mem = g_new0(MemoryRegion, quirk->nr_mem);
          bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem);
          bar2_quirk[0].vdev = vdev;
          bar2_quirk[0].offset = ASCEND310B_XLOADER_OFFSET;
          bar2_quirk[0].bar = nr;
          /*
           * Mark the window as XLOADER.  Left at 0 (VFIO_ASCEND_TYPE_ERR), the
           * shared read handler takes its error branch and returns 0 instead of
           * forwarding the read to the device.
           */
          bar2_quirk[0].type = VFIO_ASCEND_TYPE_XLOADER;

          /* intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating
           */
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &bar2_quirk[0],
                                "vfio-ascend310b-bar2-intercept-regs-quirk",
                                ASCEND310B_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              bar2_quirk[0].offset,
                                              &quirk->mem[0], 1);
          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      
      /*
       * Install the BAR4 xloader quirk window of an Ascend 310 device.
       * Does nothing unless @vdev is a Huawei 310 and @nr is 4.
       */
      static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr)
      {
          VFIOQuirk *quirk;
          VFIOAscendBarQuirk *bar4_quirk;

          if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 4 ||
              vdev->device_id != PCI_DEVICE_ID_ASCEND310) {
              return;
          }

          quirk = g_malloc0(sizeof(*quirk));
          quirk->nr_mem = 1;
          quirk->mem = g_new0(MemoryRegion, quirk->nr_mem);
          bar4_quirk = quirk->data = g_new0(typeof(*bar4_quirk), quirk->nr_mem);
          bar4_quirk[0].vdev = vdev;
          bar4_quirk[0].offset = ASCEND310_XLOADER_OFFSET;
          bar4_quirk[0].bar = nr;
          /*
           * Mark the window as XLOADER.  Left at 0 (VFIO_ASCEND_TYPE_ERR), the
           * shared read handler takes its error branch and returns 0 instead of
           * forwarding the read to the device.
           */
          bar4_quirk[0].type = VFIO_ASCEND_TYPE_XLOADER;

          /*
           * intercept w/r to the xloader-updating register,
           * so the vm can't enable xloader-updating
           */
          memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                                &vfio_ascend_intercept_regs_quirk,
                                &bar4_quirk[0],
                                "vfio-ascend310-bar4-intercept-regs-quirk",
                                ASCEND310_XLOADER_SIZE);
          memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                              bar4_quirk[0].offset,
                                              &quirk->mem[0], 1);
          QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
      }
      修改“pci-quirks.c”文件中vfio_bar_quirk_setup函数,增加如下代码并保存。
      /*
       * Run every BAR quirk probe for BAR @nr of @vdev.  Each probe checks the
       * device and BAR number itself and returns without effect when they do
       * not match.
       */
      void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
      {
          vfio_probe_ati_bar4_quirk(vdev, nr);
          vfio_probe_ati_bar2_quirk(vdev, nr);
          vfio_probe_nvidia_bar5_quirk(vdev, nr);
          vfio_probe_nvidia_bar0_quirk(vdev, nr);

          /* Ascend probes added for this feature. */
          vfio_probe_ascend910a3_bar2_quirk(vdev, nr);
          vfio_probe_ascend910a2_bar2_quirk(vdev, nr);
          vfio_probe_ascend910_bar0_quirk(vdev, nr);
          vfio_probe_ascend310p_bar2_quirk(vdev, nr);
          vfio_probe_ascend310b_bar2_quirk(vdev, nr);
          vfio_probe_ascend310_bar4_quirk(vdev, nr);

          vfio_probe_rtl8168_bar2_quirk(vdev, nr);
          /*
           * NOTE(review): if the file being edited wraps the IGD probe in a
           * preprocessor guard (e.g. #ifdef CONFIG_VFIO_IGD), keep that guard -
           * TODO confirm against the QEMU 8.1.0 source.
           */
          vfio_probe_igd_bar4_quirk(vdev, nr);
      }
    3. 执行命令,vi ./linux-headers/linux/vfio.h。
      /*
       * QEMU's copy of the VFIO region-info structure.
       *
       * VFIO_REGION_INFO_FLAG_NORMAL_NC (bit 4) is the flag added for the
       * write-combining feature; vfio_ascend_get_all_regions_flags() in
       * pci-quirks.c tests it on each BAR region.  The bit value must match
       * the definition in the modified host kernel's vfio.h.
       */
      struct vfio_region_info {
      	__u32	argsz;
      	__u32	flags;
      #define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
      #define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
      #define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
      #define VFIO_REGION_INFO_FLAG_CAPS	(1 << 3) /* Info supports caps */
      #define VFIO_REGION_INFO_FLAG_NORMAL_NC (1 << 4) /* Region supports normal NC */
      
      	__u32	index;		/* Region index */
      	__u32	cap_offset;	/* Offset within info struct of first cap */
      	__u64	size;		/* Region size (bytes) */
      	__u64	offset;		/* Region offset from start of device fd */
      };
    4. 执行如下命令,构建QEMU并安装。
      ./configure --enable-kvm --enable-numa 
      make -j64
      make install

      如果构建失败,可能是缺失部分依赖,根据报错信息安装依赖。

    5. 重启libvirt服务生效。
      systemctl daemon-reload  
      systemctl restart libvirtd

验证Write Combining是否开启

  1. 在虚拟机内安装NPU Driver驱动,可以通过npu-smi info查询到所有NPU信息后,执行如下命令。
    msnpureport
  2. 会在当前执行目录下生成例如"yyyy-MM-dd-HH-mm-ss"的日志目录,进入该目录,执行如下命令。
    grep -nr "Device capability info"
  3. 观察到“feature_bar_mem=1”,表示Write Combining已开启。
    [ascend] [devmm] [INFO] [devmm_set_dev_capability 178] <kworker/5:1:1224,1224> Device capability info. (did=5; vfid=0; ts_shm_map_bar=0; ts_shm_data_num=32768; feature_phycial_address=0x1; feature_pcie_th=1; feature_bar_mem=1; dvpp_memsize=17179869184; svm_offset_num=32768; feature_read_mem=1; feature_pcie_dma_support_sva=1; feature_dev_mem_map_host=1; feature_bar_huge_mem=1; double_pgtable_offset=17592186044416; feature_giant_page=1);