diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 1e33e1f3d0b88bd9682f8abb927f3f201b506f58..3d973e3511f237c582a8db42ea3cd45b34a626aa 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2053,7 +2053,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, mem_base, mem_len, i - 1, MEM_AFFINITY_ENABLED); } - mem_base = 1ULL << 32; + mem_base = x86ms->above_4g_mem_start; mem_len = next_base - x86ms->below_4g_mem_size; next_base = mem_base + mem_len; } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7003ea1a05dd153c714b8fde52ec8e85bb77db16..82edb90379d4df9326bc87a83b49e6dd9580ef5d 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -788,10 +788,89 @@ void xen_load_linux(PCMachineState *pcms) #define PC_ROM_ALIGN 0x800 #define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) +static hwaddr pc_above_4g_end(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + if (pcms->sgx_epc.size != 0) { + return sgx_epc_above_4g_end(&pcms->sgx_epc); + } + + return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; +} + +static void pc_get_device_memory_range(PCMachineState *pcms, + hwaddr *base, + ram_addr_t *device_mem_size) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *machine = MACHINE(pcms); + ram_addr_t size; + hwaddr addr; + + size = machine->maxram_size - machine->ram_size; + addr = ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + size += (1 * GiB) * machine->ram_slots; + } + + *base = addr; + *device_mem_size = size; +} + +static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + /* 32-bit systems don't have hole64 thus return max CPU address */ + if (cpu->phys_bits <= 32) { + return ((hwaddr)1 << cpu->phys_bits) - 1; + } + + return pc_pci_hole64_start() + pci_hole64_size - 1; +} + +/* + * AMD systems with an IOMMU have an additional hole close to the + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending + * on kernel version, VFIO may or may not let you DMA map those ranges. + * Starting Linux v5.4 we validate it, and can't create guests on AMD machines + * with certain memory sizes. It's also wrong to use those IOVA ranges + * in detriment of leading to IOMMU INVALID_DEVICE_REQUEST or worse. + * The ranges reserved for Hyper-Transport are: + * + * FD_0000_0000h - FF_FFFF_FFFFh + * + * The ranges represent the following: + * + * Base Address Top Address Use + * + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK + * FD_F910_0000h FD_F91F_FFFFh System Management + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space + * FD_FE00_0000h FD_FFFF_FFFFh Configuration + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages + * FE_2000_0000h FF_FFFF_FFFFh Reserved + * + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", + * Table 3: Special Address Controls (GPA) for more information. + */ +#define AMD_HT_START 0xfd00000000UL +#define AMD_HT_END 0xffffffffffUL +#define AMD_ABOVE_1TB_START (AMD_HT_END + 1) +#define AMD_HT_SIZE (AMD_ABOVE_1TB_START - AMD_HT_START) + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory) + MemoryRegion **ram_memory, + uint64_t pci_hole64_size) { int linux_boot, i; MemoryRegion *option_rom_mr; @@ -801,12 +880,47 @@ void pc_memory_init(PCMachineState *pcms, MachineClass *mc = MACHINE_GET_CLASS(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(pcms); + hwaddr maxphysaddr, maxusedaddr; + X86CPU *cpu = X86_CPU(first_cpu); assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); linux_boot = (machine->kernel_filename != NULL); + /* + * The HyperTransport range close to the 1T boundary is unique to AMD + * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation + * to above 1T to AMD vCPUs only. + */ + if (IS_AMD_CPU(&cpu->env)) { + /* Bail out if max possible address does not cross HT range */ + if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { + x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; + } + + /* + * Advertise the HT region if address space covers the reserved + * region or if we relocate. + */ + if (cpu->phys_bits >= 40) { + e820_add_entry(AMD_HT_START, AMD_HT_SIZE, E820_RESERVED); + } + } + + /* + * phys-bits is required to be appropriately configured + * to make sure max used GPA is reachable. + */ + maxusedaddr = pc_max_used_gpa(pcms, pci_hole64_size); + maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1; + if (maxphysaddr < maxusedaddr) { + error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64 + " phys-bits too low (%u)", + maxphysaddr, maxusedaddr, cpu->phys_bits); + exit(EXIT_FAILURE); + } + /* * Split single memory region and use aliases to address portions of it, * done for backwards compatibility with older qemus. @@ -823,9 +937,10 @@ void pc_memory_init(PCMachineState *pcms, machine->ram, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, + memory_region_add_subregion(system_memory, x86ms->above_4g_mem_start, ram_above_4g); - e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size, + E820_RAM); } if (pcms->sgx_epc.size != 0) { @@ -847,7 +962,7 @@ void pc_memory_init(PCMachineState *pcms, /* initialize device memory address space */ if (pcmc->has_reserved_memory && (machine->ram_size < machine->maxram_size)) { - ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + ram_addr_t device_mem_size; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -862,20 +977,7 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - if (pcms->sgx_epc.size != 0) { - machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); - } else { - machine->device_memory->base = - 0x100000000ULL + x86ms->above_4g_mem_size; - } - - machine->device_memory->base = - ROUND_UP(machine->device_memory->base, 1 * GiB); - - if (pcmc->enforce_aligned_dimm) { - /* size device region assuming 1G page max alignment per slot */ - device_mem_size += (1 * GiB) * machine->ram_slots; - } + pc_get_device_memory_range(pcms, &machine->device_memory->base, &device_mem_size); if ((machine->device_memory->base + device_mem_size) < device_mem_size) { @@ -947,18 +1049,16 @@ uint64_t pc_pci_hole64_start(void) PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); - X86MachineState *x86ms = X86_MACHINE(pcms); uint64_t hole64_start = 0; + ram_addr_t size = 0; - if (pcmc->has_reserved_memory && ms->device_memory->base) { - hole64_start = ms->device_memory->base; + if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { + pc_get_device_memory_range(pcms, &hole64_start, &size); if (!pcmc->broken_reserved_end) { - hole64_start += memory_region_size(&ms->device_memory->mr); + hole64_start += size; } - } else if (pcms->sgx_epc.size != 0) { - hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + hole64_start = pc_above_4g_end(pcms); } return ROUND_UP(hole64_start, 1 * GiB); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 223dd3e05d15622ca565b1bf124c42ba3f0ee7a6..ae931dbfd395c9b3833ec8dd730aac1fd1727088 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,8 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + uint64_t hole64_size; + DeviceState *i440fx_host; /* * Calculate ram split, for memory below and above 4G. It's a bit @@ -164,9 +166,15 @@ static void pc_init1(MachineState *machine, pci_memory = g_new(MemoryRegion, 1); memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; + i440fx_host = qdev_new(host_type); + hole64_size = object_property_get_uint(OBJECT(i440fx_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); } else { pci_memory = NULL; rom_memory = system_memory; + i440fx_host = NULL; + hole64_size = 0; } pc_guest_info_init(pcms); @@ -183,7 +191,7 @@ static void pc_init1(MachineState *machine, /* allocate ram and load rom/bios */ if (!xen_enabled()) { pc_memory_init(pcms, system_memory, - rom_memory, &ram_memory); + rom_memory, &ram_memory, hole64_size); } else { pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { @@ -197,8 +205,8 @@ static void pc_init1(MachineState *machine, if (pcmc->pci_enabled) { PIIX3State *piix3; - pci_bus = i440fx_init(host_type, - pci_type, + pci_bus = i440fx_init(pci_type, + i440fx_host, &i440fx_state, system_memory, system_io, machine->ram_size, x86ms->below_4g_mem_size, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e1e100316d93319aa79fca805b7b6ed4a05174dc..99792077a13430c2a6d1537322bea80c24ad5d79 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -138,6 +138,7 @@ static void pc_q35_init(MachineState *machine) MachineClass *mc = MACHINE_GET_CLASS(machine); bool acpi_pcihp; bool keep_pci_slot_hpc; + uint64_t pci_hole64_size = 0; /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping @@ -203,12 +204,19 @@ static void pc_q35_init(MachineState *machine) SMBIOS_ENTRY_POINT_21); } - /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); - /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + if (pcmc->pci_enabled) { + pci_hole64_size = object_property_get_uint(OBJECT(q35_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); + } + + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory, + pci_hole64_size); + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, OBJECT(ram_memory), NULL); diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a2b318dd9387d1343736ae50da1b7bfb6159e40f..164ee1ddb8de028077ffdd6dd69e27cc7c3cc225 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -295,7 +295,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms) return; } - sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + sgx_epc->base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); memory_region_add_subregion(get_system_memory(), sgx_epc->base, diff --git a/hw/i386/x86.c b/hw/i386/x86.c index a3258d78facf41c63fdad6c8c6f21da82ad1938e..56ac3b557b57f90a08c065016274c127e8a26f3d 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1339,6 +1339,7 @@ static void x86_machine_initfn(Object *obj) x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; + x86ms->above_4g_mem_start = 4 * GiB; } static void x86_machine_class_init(ObjectClass *oc, void *data) diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c index e08716142b6e74fafad7522e9d2bf19731b40073..924beb188f6683cbd9af9ee1dc4e3f9816537a93 100644 --- a/hw/pci-host/i440fx.c +++ b/hw/pci-host/i440fx.c @@ -237,7 +237,8 @@ static void i440fx_realize(PCIDevice *dev, Error **errp) } } -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, PCII440FXState **pi440fx_state, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, @@ -247,7 +248,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, MemoryRegion *pci_address_space, MemoryRegion *ram_memory) { - DeviceState *dev; PCIBus *b; PCIDevice *d; PCIHostState *s; @@ -255,7 +255,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, unsigned i; I440FXState *i440fx; - dev = qdev_new(host_type); s = PCI_HOST_BRIDGE(dev); b = pci_root_bus_new(dev, NULL, pci_address_space, address_space_io, 0, TYPE_PCI_BUS); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 9ab39e428f8009317f3b8b998f1e3f992c8e68df..7c1dca847d66e39ae0cc96d0ef29e008b771f5fe 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -154,7 +154,8 @@ void xen_load_linux(PCMachineState *pcms); void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory); + MemoryRegion **ram_memory, + uint64_t pci_hole64_size); uint64_t pc_pci_hole64_start(void); DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus); void pc_basic_device_init(struct PCMachineState *pcms, diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index bb1cfb8896643155e150bae671ffa9ad6fab5c19..e86e9d4571b591deedf6ef3c8c6ef37bbbaa4002 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -59,6 +59,9 @@ struct X86MachineState { /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; + /* Start address of the initial RAM above 4G */ + uint64_t above_4g_mem_start; + /* CPU and apic information: */ bool apic_xrupt_override; unsigned pci_irq_mask; diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h index f068aaba8fdaff72b27750864bd971dd1f433784..bcaaafde9ee197ca8b903d72c0951f65ae0de9db 100644 --- a/include/hw/pci-host/i440fx.h +++ b/include/hw/pci-host/i440fx.h @@ -35,7 +35,8 @@ struct PCII440FXState { #define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX" -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, PCII440FXState **pi440fx_state, MemoryRegion *address_space_mem, MemoryRegion *address_space_io,