Write Combining性能优化
约束限制
- 架构限制:Arm架构,昇腾+鲲鹏的协同计算架构
- 操作系统:openEuler 22.03 LTS SP4
- Libvirt:使用Libvirt v9.4及以上
- QEMU:使用QEMU v8.1及以上
- HDK:Ascend HDK 24.1.1及以上版本
- 适用的硬件产品:Atlas 900 A3 SuperPoD 超节点、Atlas 800T A2 训练服务器
原理介绍
WC(Write Combining,写合并)是一种提升主机向非缓存PCIe设备写入性能的技术。写入WC区域的数据会暂存于64字节缓冲区,待缓冲区填满或触发刷新事件(如写入地址超出当前缓冲区范围)时,执行合并写入,显著提升总线利用率,实现更高吞吐量。该特性在当前约束限制下的物理机上默认开启,本章节主要指导用户如何开启虚拟机内的WC特性,用户需要修改物理机内核代码、QEMU代码后重新编译安装。
本节代码行前带有“##”标识的代码行是需要删除的代码,加粗样式的代码是需要用户增加的代码。
修改物理机内核代码
- 执行如下命令安装必要的工具软件和库文件。
yum install -y kernel-source
yum install -y rpm-build openssl-devel bc rsync gcc gcc-c++ flex bison m4 elfutils-libelf-devel
- 确保工具软件和库文件安装成功后,进入内核源码目录,其中version由实际下载得到的内核版本确定。
cd /usr/src/linux-{version}.oe2203sp3.aarch64
- 在内核代码中加入开启虚拟机WC特性相关的代码。
- 执行命令,vi ./include/linux/mm.h,修改“mm.h”文件。
#ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
- 执行命令,vi ./include/uapi/linux/vfio.h,修改“vfio.h”文件。
struct vfio_region_info { __u32 argsz; __u32 flags; #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ #define VFIO_REGION_INFO_FLAG_NORMAL_NC (1 << 4) /* Region supports normal NC */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __u64 size; /* Region size (bytes) */ __u64 offset; /* Region offset from start of device fd */ }; - 执行命令,vi ./arch/arm64/include/asm/kvm_pgtable.h,修改“kvm_pgtable.h”文件。
/** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. * @KVM_PGTABLE_PROT_NORMAL_NC:Normal noncacheable attributes. * @KVM_PGTABLE_PROT_PBHA0:Page-Based Hardware Attribute 0. * @KVM_PGTABLE_PROT_PBHA1: Page-Based Hardware Attribute 1. * @KVM_PGTABLE_PROT_PBHA2: Page-Based Hardware Attribute 2. * @KVM_PGTABLE_PROT_PBHA3: Page-Based Hardware Attribute 3. */ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), KVM_PGTABLE_PROT_W = BIT(1), KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), KVM_PGTABLE_PROT_PBHA0 = BIT(59), KVM_PGTABLE_PROT_PBHA1 = BIT(60), KVM_PGTABLE_PROT_PBHA2 = BIT(61), KVM_PGTABLE_PROT_PBHA3 = BIT(62), }; - 执行命令,vi ./arch/arm64/include/asm/memory.h,修改“memory.h”文件。
/* * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf #define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 #define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES
- 执行命令,vi ./arch/arm64/kvm/hyp/pgtable.c,修改“pgtable.c”文件。代码中标识“##”的代码行是需要删除的。
static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, struct stage2_map_data *data) { ##bool device = prot & KVM_PGTABLE_PROT_DEVICE; ##kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) : ## PAGE_S2_MEMATTR(NORMAL); kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: return -EINVAL; case KVM_PGTABLE_PROT_DEVICE: if (prot & KVM_PGTABLE_PROT_X) { return -EINVAL; } attr = PAGE_S2_MEMATTR(DEVICE_nGnRE); break; case KVM_PGTABLE_PROT_NORMAL_NC: if (prot & KVM_PGTABLE_PROT_X) { return -EINVAL; } attr = PAGE_S2_MEMATTR(NORMAL_NC); break; default: attr = PAGE_S2_MEMATTR(NORMAL); break; } if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; ##else if (device) ##return -EINVAL; - 执行命令,vi ./drivers/vfio/pci/vfio_pci.c,修改“vfio_pci.c”文件。
if (vdev->bar_mmap_supported[info.index]) { info.flags |= VFIO_REGION_INFO_FLAG_MMAP; if (info.index == vdev->msix_bar) { ret = msix_mmappable_cap(vdev, &caps); if (ret) return ret; } } if (VM_ALLOW_ANY_UNCACHED != VM_NONE) { info.flags |= VFIO_REGION_INFO_FLAG_NORMAL_NC; } break; case VFIO_PCI_ROM_REGION_INDEX: { void __iomem *io; size_t size; u16 cmd; info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; /* Report the BAR size, not the ROM size */ info.size = pci_resource_len(pdev, info.index);修改“vfio_pci.c”文件中vma->vm_flags,做如下修改。代码中标识“##”的代码行是需要删除的。
/* * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC * rather than DEVICE_nGnRE, which allows guest mappings * supporting write-combining attributes (WC). ARM does not * architecturally guarantee this is safe, and indeed some MMIO * regions like the GICv2 VCPU interface can trigger uncontained * faults if Normal-NC is used. * * To safely use VFIO in KVM the platform must guarantee full * safety in the guest where no action taken against a MMIO * mapping can trigger an uncontained failure. The assumption is * that most VFIO PCI platforms support this for both mapping types, * at least in common flows, based on some expectations of how * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in * the VMA flags. */ ##vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags |= VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &vfio_pci_mmap_ops; return 0; } - 执行命令,vi ./arch/arm64/kvm/mmu.c,修改“mmu.c”文件。代码中标识“##”的代码行是需要删除的。
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault; ##bool device = false; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; ... if (exec_fault) { prot |= KVM_PGTABLE_PROT_X; invalidate_icache_guest_page(pfn, vma_pagesize); } ##if (device) ##prot |= KVM_PGTABLE_PROT_DEVICE; ##else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) ##prot |= KVM_PGTABLE_PROT_X; vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; if (device) { if (vfio_allow_any_uc) { prot |= KVM_PGTABLE_PROT_NORMAL_NC; } else { prot |= KVM_PGTABLE_PROT_DEVICE; } } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; } /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - 执行命令vi Makefile,修改版本号。修改其中的SUBLEVEL,将SUBLEVEL_VAL改为未使用过的非0数值即可,例如1、2、3等。
# SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 10 SUBLEVEL = SUBLEVEL_VAL EXTRAVERSION =
- 编译并且安装修改后的内核,rpm安装的时候显示“Error! echo”为正常现象。
# 安装必要的依赖
yum install dwarves
make openeuler_defconfig
# 其中64表示使用64个CPU核进行编译,可根据实际环境修改
make binrpm-pkg -j64
# 查看编译完成的内核rpm包,找到自定义的SUBLEVEL_VAL版本的包
ll /root/rpmbuild/RPMS/aarch64
# 根据查询到的rpm包进行安装,kernel-headers包无需安装
rpm -ivh /root/rpmbuild/RPMS/aarch64/kernel-{version}.aarch64.rpm
# 查看是否安装成功
rpm -qa | grep kernel
如果构建失败,可能是缺失部分依赖,根据报错信息安装依赖。
- 安装内核成功后,可以将其设置成默认启动内核后重启即可生效,或者在iBMC页面手动重启并于内核选择时更换编译修改后的内核。
# 查看当前服务器的所有内核
grep ^menuentry /boot/grub2/grub.cfg | cut -d "'" -f2
# 将上面查询到并安装的内核设置为默认启动内核,将version替换为实际安装的修改后的内核版本
sudo grub2-set-default "openEuler {version}"
# 查看当前默认启动内核,确定是否替换成功
grub2-editenv list
- 执行命令,vi ./include/linux/mm.h,修改“mm.h”文件。
修改QEMU代码
- 执行如下命令安装必要的工具软件和库文件。
yum install spice-server spice-server-devel -y
yum install numactl-devel flex bison -y
- 单击QEMU网站下载8.1.0版本的QEMU源码包,并将下载的源码包上传至环境中。
- 进入源码包所在目录,执行如下命令,修改压缩包权限。
chmod +x ./qemu-8.1.0.tar.xz
- 执行如下命令,解压源码包并进入QEMU源码目录。
tar -xf ./qemu-8.1.0.tar.xz
cd qemu-8.1.0/
- 修改QEMU源码。
- 执行命令,vi ./hw/arm/virt.c,修改“virt.c”文件,将PCIE_MMIO由512扩展为2048以容纳更多的PCIE设备,512会导致直通场景部分卡设备初始化失败。
static MemMapEntry extended_memmap[] = { /* Additional 64 MB redist region (can contain up to 512 redistributors) */ [VIRT_HIGH_GIC_REDIST2] = { 0x0, 64 * MiB }, [VIRT_HIGH_PCIE_ECAM] = { 0x0, 256 * MiB }, /* Second PCIe window */ # 修改此处 [VIRT_HIGH_PCIE_MMIO] = { 0x0, 2048 * GiB }, }; - 执行命令,vi ./hw/vfio/pci-quirks.c,修改“pci-quirks.c”文件。
#include "hw/nvram/fw_cfg.h" #include "pci.h" #include "trace.h" #define PCI_VENDOR_ID_HUAWEI 0x19e5 #define PCI_DEVICE_ID_ASCEND910 0xd801 #define PCI_DEVICE_ID_ASCEND910A2 0xd802 #define PCI_DEVICE_ID_ASCEND910A3 0xd803 #define PCI_DEVICE_ID_ASCEND310P 0xd500 #define PCI_DEVICE_ID_ASCEND310B 0xd105 #define PCI_DEVICE_ID_ASCEND310 0xd100 #define PCI_SUB_DEVICE_ID_ASCEND310P_1P_MIN 0x100 #define PCI_SUB_DEVICE_ID_ASCEND310P_1P_MAX 0x10f #define PCI_SUB_DEVICE_ID_ASCEND310P_2P_MIN 0x110 #define PCI_SUB_DEVICE_ID_ASCEND310P_2P_MAX 0x11f #define ASCEND910_XLOADER_SIZE 4 #define ASCEND910_XLOADER_OFFSET 0x80400 #define ASCEND910A2_XLOADER_SIZE 4 #define ASCEND910A2_XLOADER_OFFSET 0x18208430 #define ASCEND910A2_FEATURE_SIZE 4 #define ASCEND910A2_FEATURE_OFFSET 0x182085f8 #define ASCEND910A3_XLOADER_SIZE 4 #define ASCEND910A3_XLOADER_OFFSET 0x18208430 #define ASCEND910A3_FEATURE_SIZE 4 #define ASCEND910A3_FEATURE_OFFSET 0x182085f8 #define ASCEND310P_2P_BASE (128 * 1024 * 1024) #define ASCEND310P_1P_DEVNUM 1 #define ASCEND310P_2P_DEVNUM 2 #define ASCEND310P_XLOADER_SIZE 4 #define ASCEND310P_XLOADER_OFFSET 0x100430 #define ASCEND310B_XLOADER_SIZE 4 #define ASCEND310B_XLOADER_OFFSET 0x4430 #define ASCEND310_XLOADER_SIZE 4 #define ASCEND310_XLOADER_OFFSET 0x400 enum { VFIO_ASCEND_TYPE_ERR = 0, VFIO_ASCEND_TYPE_XLOADER = 1, VFIO_ASCEND_TYPE_FEATURE = 2, }; enum { QIURK_READ_SIZE_1_BYTE = 1, QIURK_READ_SIZE_2_BYTE = 2, QIURK_READ_SIZE_4_BYTE = 4, }; typedef struct VFIOAscendBarQuirk { struct VFIOPCIDevice *vdev; pcibus_t offset; unsigned int flags; int type; uint8_t bar; MemoryRegion *mem; } VFIOAscendBarQuirk; static uint64_t vfio_ascend_quirk_read(void *opaque, hwaddr addr, unsigned size) { uint64_t value, off_value; const unsigned byte_size = 8; unsigned start = addr * byte_size, length_size = size * byte_size; VFIOAscendBarQuirk *quirk = opaque; VFIOPCIDevice *vdev = quirk->vdev; qemu_log("read RO region! 
addr=0x%" HWADDR_PRIx ", size=%d\n", addr + quirk->offset, size); switch (quirk->type) { case VFIO_ASCEND_TYPE_XLOADER: value = vfio_region_read(&vdev->bars[quirk->bar].region, addr + quirk->offset, size); break; case VFIO_ASCEND_TYPE_FEATURE: switch (size) { case QIURK_READ_SIZE_1_BYTE: case QIURK_READ_SIZE_2_BYTE: case QIURK_READ_SIZE_4_BYTE: off_value = ((uint64_t)quirk->flags >> start) & ((1UL << length_size) - 1); return le64_to_cpu(off_value); default: qemu_log("Ascend quirk unsupported read size, %d bytes\n", size); return 0; } default: qemu_log("read RO region error type! addr=0x%" HWADDR_PRIx ", size=%d\n", addr + quirk->offset, size); return 0; } return value; } static void vfio_ascend_quirk_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { VFIOAscendBarQuirk *quirk = opaque; qemu_log("modifying RO region is not allowed! addr=0x%" HWADDR_PRIx ", data=0x%" PRIx64 ", size=%d\n", addr + quirk->offset, data, size); } static unsigned int vfio_ascend_get_region_flags(VFIOPCIDevice *vdev, int nr) { int ret = 0; unsigned int flags; VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info; ret = vfio_get_region_info(vbasedev, nr, ®_info); if (ret != 0) { qemu_log("ascend failed to get region %d info\n", nr); return 0; } flags = reg_info->flags; g_free(reg_info); return flags; } static unsigned int vfio_ascend_get_all_regions_flags(VFIOPCIDevice *vdev) { int i = 0; unsigned int flag = 0, flags = 0; for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { flag = vfio_ascend_get_region_flags(vdev, i); if (flag & VFIO_REGION_INFO_FLAG_NORMAL_NC) { flags |= (1 << i); } } return flags; } static void vfio_ascend_set_bar_quirk_array(VFIOAscendBarQuirk *bar_quirk, VFIOPCIDevice *vdev, int index, pcibus_t offset, unsigned int flags, int type, uint8_t bar) { bar_quirk[index].vdev = vdev; bar_quirk[index].offset = offset; bar_quirk[index].flags = flags; bar_quirk[index].type = type; bar_quirk[index].bar = bar; } static const 
MemoryRegionOps vfio_ascend_intercept_regs_quirk = { .read = vfio_ascend_quirk_read, .write = vfio_ascend_quirk_write, .endianness = DEVICE_LITTLE_ENDIAN, }; static void vfio_probe_ascend910a3_bar2_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk *bar2_quirk; const int quirk_region_num = 2; /* XLOADER and FEATURE */ if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || vdev->device_id != PCI_DEVICE_ID_ASCEND910A3) { return; } quirk = vfio_quirk_alloc(quirk_region_num); bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); vfio_ascend_set_bar_quirk_array(bar2_quirk, vdev, 0, ASCEND910A3_XLOADER_OFFSET, 0, VFIO_ASCEND_TYPE_XLOADER, nr); /* intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[0], "vfio-ascend910a3-bar2-intercept-regs-quirk", ASCEND910A3_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[0].offset, &quirk->mem[0], 1); /* 910A3 FEATURE */ vfio_ascend_set_bar_quirk_array(bar2_quirk, vdev, 1, ASCEND910A3_FEATURE_OFFSET, vfio_ascend_get_all_regions_flags(vdev), VFIO_ASCEND_TYPE_FEATURE, nr); memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[1], "vfio-ascend910a3-bar2-feature-regs-quirk", ASCEND910A3_FEATURE_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[1].offset, &quirk->mem[1], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } static void vfio_probe_ascend910a2_bar2_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk *bar2_quirk; const int quirk_region_num = 2; /* XLOADER and FEATURE */ if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || vdev->device_id != PCI_DEVICE_ID_ASCEND910A2) { return; } quirk = vfio_quirk_alloc(quirk_region_num); bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); 
vfio_ascend_set_bar_quirk_array(bar2_quirk, vdev, 0, ASCEND910A2_XLOADER_OFFSET, 0, VFIO_ASCEND_TYPE_XLOADER, nr); /* intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[0], "vfio-ascend910_a2-bar2-intercept-regs-quirk", ASCEND910A2_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[0].offset, &quirk->mem[0], 1); /* 910B FEATURE */ vfio_ascend_set_bar_quirk_array(bar2_quirk, vdev, 1, ASCEND910A2_FEATURE_OFFSET, vfio_ascend_get_all_regions_flags(vdev), VFIO_ASCEND_TYPE_FEATURE, nr); memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[1], "vfio-ascend910b-bar2-feature-regs-quirk", ASCEND910A2_FEATURE_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[1].offset, &quirk->mem[1], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } static void vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk *bar0_quirk; if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || vdev->device_id != PCI_DEVICE_ID_ASCEND910) { return; } quirk = g_malloc0(sizeof(*quirk)); quirk->nr_mem = 1; quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); bar0_quirk[0].vdev = vdev; bar0_quirk[0].offset = ASCEND910_XLOADER_OFFSET; bar0_quirk[0].bar = nr; /* * intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar0_quirk[0], "vfio-ascend910-bar0-intercept-regs-quirk", ASCEND910_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar0_quirk[0].offset, &quirk->mem[0], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } static void 
vfio_probe_ascend310p_bar2_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk *bar2_quirk; int sub_device_id; int devnum = 0; if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || vdev->device_id != PCI_DEVICE_ID_ASCEND310P) { return; } sub_device_id = pci_get_word(vdev->pdev.config + PCI_SUBSYSTEM_ID); if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND310P_1P_MIN && sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND310P_1P_MAX) { devnum = ASCEND310P_1P_DEVNUM; } else if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND310P_2P_MIN && sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND310P_2P_MAX) { devnum = ASCEND310P_2P_DEVNUM; } if (devnum != ASCEND310P_1P_DEVNUM && devnum != ASCEND310P_2P_DEVNUM) { return; } quirk = g_malloc0(sizeof(*quirk)); quirk->nr_mem = devnum; quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); bar2_quirk[0].vdev = vdev; bar2_quirk[0].offset = ASCEND310P_XLOADER_OFFSET; bar2_quirk[0].bar = nr; /* * intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[0], "vfio-ascend310p-bar2-1p-intercept-regs-quirk", ASCEND310P_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[0].offset, &quirk->mem[0], 1); if (devnum == ASCEND310P_2P_DEVNUM) { bar2_quirk[1].vdev = vdev; bar2_quirk[1].offset = (ASCEND310P_2P_BASE + ASCEND310P_XLOADER_OFFSET); bar2_quirk[1].bar = nr; memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[1], "vfio-ascend310p-bar2-2p-intercept-regs-quirk", ASCEND310P_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[1].offset, &quirk->mem[1], 1); } QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } static void vfio_probe_ascend310b_bar2_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk 
*bar2_quirk; if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || vdev->device_id != PCI_DEVICE_ID_ASCEND310B) { return; } quirk = g_malloc0(sizeof(*quirk)); quirk->nr_mem = 1; quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); bar2_quirk[0].vdev = vdev; bar2_quirk[0].offset = ASCEND310B_XLOADER_OFFSET; bar2_quirk[0].bar = nr; /* intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar2_quirk[0], "vfio-ascend310b-bar2-intercept-regs-quirk", ASCEND310B_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar2_quirk[0].offset, &quirk->mem[0], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); } static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr) { VFIOQuirk *quirk; VFIOAscendBarQuirk *bar4_quirk; if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 4 || vdev->device_id != PCI_DEVICE_ID_ASCEND310) { return; } quirk = g_malloc0(sizeof(*quirk)); quirk->nr_mem = 1; quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); bar4_quirk = quirk->data = g_new0(typeof(*bar4_quirk), quirk->nr_mem); bar4_quirk[0].vdev = vdev; bar4_quirk[0].offset = ASCEND310_XLOADER_OFFSET; bar4_quirk[0].bar = nr; /* * intercept w/r to the xloader-updating register, * so the vm can't enable xloader-updating */ memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_ascend_intercept_regs_quirk, &bar4_quirk[0], "vfio-ascend310-bar4-intercept-regs-quirk", ASCEND310_XLOADER_SIZE); memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, bar4_quirk[0].offset, &quirk->mem[0], 1); QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); }修改“pci-quirks.c”文件中vfio_bar_quirk_setup函数,增加如下代码并保存。void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) { vfio_probe_ati_bar4_quirk(vdev, nr); vfio_probe_ati_bar2_quirk(vdev, nr); 
vfio_probe_nvidia_bar5_quirk(vdev, nr); vfio_probe_nvidia_bar0_quirk(vdev, nr); vfio_probe_ascend910a3_bar2_quirk(vdev, nr); vfio_probe_ascend910a2_bar2_quirk(vdev, nr); vfio_probe_ascend910_bar0_quirk(vdev, nr); vfio_probe_ascend310p_bar2_quirk(vdev, nr); vfio_probe_ascend310b_bar2_quirk(vdev, nr); vfio_probe_ascend310_bar4_quirk(vdev, nr); vfio_probe_rtl8168_bar2_quirk(vdev, nr); vfio_probe_igd_bar4_quirk(vdev, nr); }… - 执行命令,vi ./linux-headers/linux/vfio.h。
struct vfio_region_info { __u32 argsz; __u32 flags; #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ #define VFIO_REGION_INFO_FLAG_NORMAL_NC (1 << 4) /* Region supports normal NC */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ __u64 size; /* Region size (bytes) */ __u64 offset; /* Region offset from start of device fd */ }; - 执行如下命令,构建QEMU并安装。
./configure --enable-kvm --enable-numa
make -j64
make install
如果构建失败,可能是缺失部分依赖,根据报错信息安装依赖。
- 重启libvirt服务生效。
systemctl daemon-reload
systemctl restart libvirtd
- 执行命令,vi ./hw/arm/virt.c,修改“virt.c”文件,将PCIE_MMIO由512扩展为2048以容纳更多的PCIE设备,512会导致直通场景部分卡设备初始化失败。
验证Write Combining是否开启
- 在虚拟机内安装NPU Driver驱动,可以通过npu-smi info查询到所有NPU信息后,执行如下命令。
msnpureport
- 会在当前执行目录下生成例如"yyyy-MM-dd-HH-mm-ss"的日志目录,进入该目录,执行如下命令。
grep -nr "Device capability info"
- 观察到“feature_bar_mem=1”,表示Write Combining已开启。
[ascend] [devmm] [INFO] [devmm_set_dev_capability 178] <kworker/5:1:1224,1224> Device capability info. (did=5; vfid=0; ts_shm_map_bar=0; ts_shm_data_num=32768; feature_phycial_address=0x1; feature_pcie_th=1; feature_bar_mem=1; dvpp_memsize=17179869184; svm_offset_num=32768; feature_read_mem=1; feature_pcie_dma_support_sva=1; feature_dev_mem_map_host=1; feature_bar_huge_mem=1; double_pgtable_offset=17592186044416; feature_giant_page=1);
父主题: (可选)虚拟机性能调优