From bcefcd7ed32e0e310b41ea2f065d7a5b439b1202 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:04 +0100 Subject: [PATCH 1/7] hw/i386: add 4g boundary start to X86MachineState Rather than hardcoding the 4G boundary everywhere, introduce a X86MachineState field @above_4g_mem_start and use it accordingly. This is in preparation for relocating ram-above-4g to be dynamically start at 1T on AMD platforms. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-2-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Conflicts: hw/i386/pc.c Signed-off-by: frankyj915 --- hw/i386/acpi-build.c | 2 +- hw/i386/pc.c | 9 +++++---- hw/i386/sgx.c | 2 +- hw/i386/x86.c | 1 + include/hw/i386/x86.h | 3 +++ 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 1e33e1f3d0..3d973e3511 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2053,7 +2053,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, mem_base, mem_len, i - 1, MEM_AFFINITY_ENABLED); } - mem_base = 1ULL << 32; + mem_base = x86ms->above_4g_mem_start; mem_len = next_base - x86ms->below_4g_mem_size; next_base = mem_base + mem_len; } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7003ea1a05..2fb10182b8 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -823,9 +823,10 @@ void pc_memory_init(PCMachineState *pcms, machine->ram, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, + memory_region_add_subregion(system_memory, x86ms->above_4g_mem_start, ram_above_4g); - e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size, + E820_RAM); } if (pcms->sgx_epc.size != 0) { @@ -866,7 +867,7 @@ void pc_memory_init(PCMachineState *pcms, machine->device_memory->base = 
sgx_epc_above_4g_end(&pcms->sgx_epc); } else { machine->device_memory->base = - 0x100000000ULL + x86ms->above_4g_mem_size; + x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } machine->device_memory->base = @@ -958,7 +959,7 @@ uint64_t pc_pci_hole64_start(void) } else if (pcms->sgx_epc.size != 0) { hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } return ROUND_UP(hole64_start, 1 * GiB); diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a2b318dd93..164ee1ddb8 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -295,7 +295,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms) return; } - sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + sgx_epc->base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); memory_region_add_subregion(get_system_memory(), sgx_epc->base, diff --git a/hw/i386/x86.c b/hw/i386/x86.c index a3258d78fa..56ac3b557b 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1339,6 +1339,7 @@ static void x86_machine_initfn(Object *obj) x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; + x86ms->above_4g_mem_start = 4 * GiB; } static void x86_machine_class_init(ObjectClass *oc, void *data) diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index bb1cfb8896..e86e9d4571 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -59,6 +59,9 @@ struct X86MachineState { /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; + /* Start address of the initial RAM above 4G */ + uint64_t above_4g_mem_start; + /* CPU and apic information: */ bool apic_xrupt_override; unsigned pci_irq_mask; -- Gitee From 6be6ff397a378ce8ea164edda7417174ab91bbbf Mon Sep 17 00:00:00 2001 From: Joao Martins 
Date: Tue, 19 Jul 2022 18:00:05 +0100 Subject: [PATCH 2/7] i386/pc: create pci-host qdev prior to pc_memory_init() At the start of pc_memory_init() we usually pass a range of 0..UINT64_MAX as pci_memory, when really its 2G (i440fx) or 32G (q35). To get the real user value, we need to get pci-host passed property for default pci_hole64_size. Thus to get that, create the qdev prior to memory init to better make estimations on max used/phys addr. This is in preparation to determine that host-phys-bits are enough and also for pci-hole64-size to be considered to relocate ram-above-4g to be at 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-3-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Conflicts: hw/i386/pc_piix.c hw/pci-host/i440fx.c include/hw/pci-host/i440fx.h Signed-off-by: frankyj915 --- hw/i386/pc_piix.c | 7 +++++-- hw/i386/pc_q35.c | 6 +++--- hw/pci-host/i440fx.c | 5 ++--- include/hw/pci-host/i440fx.h | 3 ++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 223dd3e05d..2bbed0e0c7 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + DeviceState *i440fx_host; /* * Calculate ram split, for memory below and above 4G. 
It's a bit @@ -164,9 +165,11 @@ static void pc_init1(MachineState *machine, pci_memory = g_new(MemoryRegion, 1); memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; + i440fx_host = qdev_new(host_type); } else { pci_memory = NULL; rom_memory = system_memory; + i440fx_host = NULL; } pc_guest_info_init(pcms); @@ -197,8 +200,8 @@ static void pc_init1(MachineState *machine, if (pcmc->pci_enabled) { PIIX3State *piix3; - pci_bus = i440fx_init(host_type, - pci_type, + pci_bus = i440fx_init(pci_type, + i440fx_host, &i440fx_state, system_memory, system_io, machine->ram_size, x86ms->below_4g_mem_size, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e1e100316d..c4162eb912 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -203,12 +203,12 @@ static void pc_q35_init(MachineState *machine) SMBIOS_ENTRY_POINT_21); } - /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); - /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, OBJECT(ram_memory), NULL); diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c index e08716142b..924beb188f 100644 --- a/hw/pci-host/i440fx.c +++ b/hw/pci-host/i440fx.c @@ -237,7 +237,8 @@ static void i440fx_realize(PCIDevice *dev, Error **errp) } } -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, PCII440FXState **pi440fx_state, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, @@ -247,7 +248,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, MemoryRegion *pci_address_space, MemoryRegion *ram_memory) { - DeviceState *dev; PCIBus *b; PCIDevice *d; PCIHostState *s; @@ -255,7 
+255,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, unsigned i; I440FXState *i440fx; - dev = qdev_new(host_type); s = PCI_HOST_BRIDGE(dev); b = pci_root_bus_new(dev, NULL, pci_address_space, address_space_io, 0, TYPE_PCI_BUS); diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h index f068aaba8f..bcaaafde9e 100644 --- a/include/hw/pci-host/i440fx.h +++ b/include/hw/pci-host/i440fx.h @@ -35,7 +35,8 @@ struct PCII440FXState { #define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX" -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, PCII440FXState **pi440fx_state, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, -- Gitee From e35280155e311388c6453b5bdfc8409f279a207b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:06 +0100 Subject: [PATCH 3/7] i386/pc: pass pci_hole64_size to pc_memory_init() Use the pre-initialized pci-host qdev and fetch the pci-hole64-size into pc_memory_init() newly added argument. Use PCI_HOST_PROP_PCI_HOLE64_SIZE pci-host property for fetching pci-hole64-size. This is in preparation to determine that host-phys-bits are enough and for pci-hole64-size to be considered to relocate ram-above-4g to be at 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-4-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/pc.c | 3 ++- hw/i386/pc_piix.c | 7 ++++++- hw/i386/pc_q35.c | 10 +++++++++- include/hw/i386/pc.h | 3 ++- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 2fb10182b8..919bfed86d 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -791,7 +791,8 @@ void xen_load_linux(PCMachineState *pcms) void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory) + MemoryRegion **ram_memory, + uint64_t pci_hole64_size) { int linux_boot, i; MemoryRegion *option_rom_mr; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 2bbed0e0c7..ae931dbfd3 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + uint64_t hole64_size; DeviceState *i440fx_host; /* @@ -166,10 +167,14 @@ static void pc_init1(MachineState *machine, memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; i440fx_host = qdev_new(host_type); + hole64_size = object_property_get_uint(OBJECT(i440fx_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); } else { pci_memory = NULL; rom_memory = system_memory; i440fx_host = NULL; + hole64_size = 0; } pc_guest_info_init(pcms); @@ -186,7 +191,7 @@ static void pc_init1(MachineState *machine, /* allocate ram and load rom/bios */ if (!xen_enabled()) { pc_memory_init(pcms, system_memory, - rom_memory, &ram_memory); + rom_memory, &ram_memory, hole64_size); } else { pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index c4162eb912..99792077a1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -138,6 +138,7 @@ static void pc_q35_init(MachineState *machine) MachineClass *mc = MACHINE_GET_CLASS(machine); bool acpi_pcihp; bool keep_pci_slot_hpc; + uint64_t pci_hole64_size = 0; /* Check whether RAM fits below 4G 
(leaving 1/2 GByte for IO memory * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping @@ -206,8 +207,15 @@ static void pc_q35_init(MachineState *machine) /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + if (pcmc->pci_enabled) { + pci_hole64_size = object_property_get_uint(OBJECT(q35_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); + } + /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory, + pci_hole64_size); object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 9ab39e428f..7c1dca847d 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -154,7 +154,8 @@ void xen_load_linux(PCMachineState *pcms); void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory); + MemoryRegion **ram_memory, + uint64_t pci_hole64_size); uint64_t pc_pci_hole64_start(void); DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus); void pc_basic_device_init(struct PCMachineState *pcms, -- Gitee From e86f465f2bdf2999384f461e1aaecdfc510648dd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:07 +0100 Subject: [PATCH 4/7] i386/pc: factor out above-4g end to an helper There's a couple of places that seem to duplicate this calculation of RAM size above the 4G boundary. Move all those to a helper function. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-5-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/pc.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 919bfed86d..0857bb124f 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -788,6 +788,17 @@ void xen_load_linux(PCMachineState *pcms) #define PC_ROM_ALIGN 0x800 #define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) +static hwaddr pc_above_4g_end(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + if (pcms->sgx_epc.size != 0) { + return sgx_epc_above_4g_end(&pcms->sgx_epc); + } + + return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -864,15 +875,8 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - if (pcms->sgx_epc.size != 0) { - machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); - } else { - machine->device_memory->base = - x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; - } - machine->device_memory->base = - ROUND_UP(machine->device_memory->base, 1 * GiB); + ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); if (pcmc->enforce_aligned_dimm) { /* size device region assuming 1G page max alignment per slot */ @@ -949,7 +953,6 @@ uint64_t pc_pci_hole64_start(void) PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); - X86MachineState *x86ms = X86_MACHINE(pcms); uint64_t hole64_start = 0; if (pcmc->has_reserved_memory && ms->device_memory->base) { @@ -957,10 +960,8 @@ uint64_t pc_pci_hole64_start(void) if (!pcmc->broken_reserved_end) { hole64_start += memory_region_size(&ms->device_memory->mr); } - } else if (pcms->sgx_epc.size != 0) { - hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; + hole64_start = pc_above_4g_end(pcms); } return ROUND_UP(hole64_start, 1 * GiB); -- Gitee From 
eb9e4c84d938dc1fc4df197d483277e778048593 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:11 +0100 Subject: [PATCH 5/7] i386/pc: factor out device_memory base/size to helper Move obtaining hole64_start from device_memory memory region base/size into a helper alongside corresponding getters in pc_memory_init() when the hotplug range is uninitialized. While doing that, remove the memory region based logic from this newly added helper. This is the final step that allows pc_pci_hole64_start() to be callable at the beginning of pc_memory_init() before any memory regions are initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-9-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Conflicts: hw/i386/pc.c Signed-off-by: frankyj915 --- hw/i386/pc.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0857bb124f..44a5658c27 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -799,6 +799,27 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } +static void pc_get_device_memory_range(PCMachineState *pcms, + hwaddr *base, + ram_addr_t *device_mem_size) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *machine = MACHINE(pcms); + ram_addr_t size; + hwaddr addr; + + size = machine->maxram_size - machine->ram_size; + addr = ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + size += (1 * GiB) * machine->ram_slots; + } + + *base = addr; + *device_mem_size = size; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -860,7 +881,7 @@ void pc_memory_init(PCMachineState *pcms, /* initialize device memory address space */ if
(pcmc->has_reserved_memory && (machine->ram_size < machine->maxram_size)) { - ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + ram_addr_t device_mem_size; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -875,13 +896,7 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - machine->device_memory->base = - ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); - - if (pcmc->enforce_aligned_dimm) { - /* size device region assuming 1G page max alignment per slot */ - device_mem_size += (1 * GiB) * machine->ram_slots; - } + pc_get_device_memory_range(pcms, &machine->device_memory->base, &device_mem_size); if ((machine->device_memory->base + device_mem_size) < device_mem_size) { @@ -954,11 +969,12 @@ uint64_t pc_pci_hole64_start(void) PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); uint64_t hole64_start = 0; + ram_addr_t size = 0; - if (pcmc->has_reserved_memory && ms->device_memory->base) { - hole64_start = ms->device_memory->base; + if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { + pc_get_device_memory_range(pcms, &hole64_start, &size); if (!pcmc->broken_reserved_end) { - hole64_start += memory_region_size(&ms->device_memory->mr); + hole64_start += size; } } else { hole64_start = pc_above_4g_end(pcms); -- Gitee From a1eae7bc11804427112b1b8f1d18464786d490e1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:12 +0100 Subject: [PATCH 6/7] i386/pc: bounds check phys-bits against max used GPA Calculate max *used* GPA against the CPU maximum possible address and error out if the former surpasses the latter. This ensures max used GPA is reachable by configured phys-bits. Default phys-bits on Qemu is TCG_PHYS_ADDR_BITS (40) which is enough for the CPU to address 1Tb (0xff ffff ffff) or 1010G (0xfc ffff ffff) in AMD hosts with IOMMU.
This is preparation for AMD guests with >1010G, where it will want to relocate ram-above-4g to be after 1Tb instead of 4G. Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-10-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Conflicts: hw/i386/pc.c Signed-off-by: frankyj915 --- hw/i386/pc.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 44a5658c27..ab46ce4a98 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -820,6 +820,18 @@ static void pc_get_device_memory_range(PCMachineState *pcms, *device_mem_size = size; } +static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + /* 32-bit systems don't have hole64 thus return max CPU address */ + if (cpu->phys_bits <= 32) { + return ((hwaddr)1 << cpu->phys_bits) - 1; + } + + return pc_pci_hole64_start() + pci_hole64_size - 1; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -834,12 +846,27 @@ void pc_memory_init(PCMachineState *pcms, MachineClass *mc = MACHINE_GET_CLASS(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(pcms); + hwaddr maxphysaddr, maxusedaddr; + X86CPU *cpu = X86_CPU(first_cpu); assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); linux_boot = (machine->kernel_filename != NULL); + /* + * phys-bits is required to be appropriately configured + * to make sure max used GPA is reachable.
+ */ + maxusedaddr = pc_max_used_gpa(pcms, pci_hole64_size); + maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1; + if (maxphysaddr < maxusedaddr) { + error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64 + " phys-bits too low (%u)", + maxphysaddr, maxusedaddr, cpu->phys_bits); + exit(EXIT_FAILURE); + } + /* * Split single memory region and use aliases to address portions of it, * done for backwards compatibility with older qemus. -- Gitee From 794772716628256df0aaa356371a83123f0ffb4d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:13 +0100 Subject: [PATCH 7/7] i386/pc: relocate 4g start to 1T where applicable It is assumed that the whole GPA space is available to be DMA addressable, within a given address space limit, except for a tiny region before the 4G. Since Linux v5.4, VFIO validates whether the selected GPA is indeed valid i.e. not reserved by IOMMU on behalf of some specific devices or platform-defined restrictions, and thus failing the ioctl(VFIO_DMA_MAP) with -EINVAL. AMD systems with an IOMMU are examples of such platforms and particularly may only have these ranges as allowed: 0000000000000000 - 00000000fedfffff (0 .. 3.982G) 00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G) 0000010000000000 - ffffffffffffffff (1Tb .. 16Pb[*]) We already account for the 4G hole, albeit if the guest is big enough we will fail to allocate a guest with >1010G due to the ~12G hole at the 1Tb boundary, reserved for HyperTransport (HT). [*] there is another reserved region unrelated to HT that exists in the 256T boundary in Fam 17h according to Errata #1286, documented also in "Open-Source Register Reference for AMD Family 17h Processors (PUB)" When creating the region above 4G, take into account that on AMD platforms the HyperTransport range is reserved and hence it cannot be used either as GPAs. In those cases rather than establishing the start of ram-above-4g to be 4G, relocate instead to 1Tb.
See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", for more information on the underlying restriction of IOVAs. After accounting for the 1Tb hole on AMD hosts, mtree should look like: 0000000000000000-000000007fffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff 0000010000000000-000001ff7fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 0000000080000000-000000ffffffffff If the relocation is done or the address space covers it, we also add the reserved HT e820 range as reserved. Default phys-bits on Qemu is TCG_PHYS_ADDR_BITS (40) which is enough to address 1Tb (0xff ffff ffff). On AMD platforms, if a ram-above-4g relocation is attempted and the CPU wasn't configured with a big enough phys-bits, an error message will be printed due to the maxphysaddr vs maxusedaddr check previously added. Suggested-by: Igor Mammedov Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-11-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index ab46ce4a98..82edb90379 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -832,6 +832,40 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) return pc_pci_hole64_start() + pci_hole64_size - 1; } +/* + * AMD systems with an IOMMU have an additional hole close to the + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending + * on kernel version, VFIO may or may not let you DMA map those ranges. + * Starting Linux v5.4 we validate it, and can't create guests on AMD machines + * with certain memory sizes. It's also wrong to use those IOVA ranges + * in detriment of leading to IOMMU INVALID_DEVICE_REQUEST or worse.
+ * The ranges reserved for Hyper-Transport are: + * + * FD_0000_0000h - FF_FFFF_FFFFh + * + * The ranges represent the following: + * + * Base Address Top Address Use + * + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK + * FD_F910_0000h FD_F91F_FFFFh System Management + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space + * FD_FE00_0000h FD_FFFF_FFFFh Configuration + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages + * FE_2000_0000h FF_FFFF_FFFFh Reserved + * + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", + * Table 3: Special Address Controls (GPA) for more information. + */ +#define AMD_HT_START 0xfd00000000UL +#define AMD_HT_END 0xffffffffffUL +#define AMD_ABOVE_1TB_START (AMD_HT_END + 1) +#define AMD_HT_SIZE (AMD_ABOVE_1TB_START - AMD_HT_START) + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -854,6 +888,26 @@ void pc_memory_init(PCMachineState *pcms, linux_boot = (machine->kernel_filename != NULL); + /* + * The HyperTransport range close to the 1T boundary is unique to AMD + * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation + * to above 1T to AMD vCPUs only. + */ + if (IS_AMD_CPU(&cpu->env)) { + /* Bail out if max possible address does not cross HT range */ + if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { + x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; + } + + /* + * Advertise the HT region if address space covers the reserved + * region or if we relocate. + */ + if (cpu->phys_bits >= 40) { + e820_add_entry(AMD_HT_START, AMD_HT_SIZE, E820_RESERVED); + } + } + /* * phys-bits is required to be appropriately configured * to make sure max used GPA is reachable. -- Gitee