From 52e651e5ff84c5bf658ff00b01e70d178d41618f Mon Sep 17 00:00:00 2001
From: leizongkun
Date: Tue, 11 Nov 2025 14:00:12 +0800
Subject: [PATCH] system: add support for hugepage use on demand

For memory regions that are backed by 2MB huge pages and are not
pre-allocated, defer physical memory allocation so that pages are only
populated when they are first touched. This reduces memory overhead and
achieves on-demand memory usage when starting virtual machines.

Signed-off-by: wangzhigang
Signed-off-by: zhangliang
Signed-off-by: leizongkun
---
 accel/kvm/kvm-all.c           |  40 +++++++
 hw/virtio/virtio-balloon.c    | 190 ++++++++++++++++++++++++++++++++++
 include/exec/memory.h         |  11 ++
 include/sysemu/kvm.h          |   5 +
 linux-headers/linux/kvm.h     |   6 ++
 meson.build                   |   8 ++
 meson_options.txt             |   3 +
 migration/migration.c         |  34 ++++++
 migration/migration.h         |   4 +
 migration/ram.c               |  64 ++++++++++++
 scripts/meson-buildoptions.sh |   3 +
 system/memory.c               |  25 +++++
 12 files changed, 393 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index a321bf514c..8fb3f2eee7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -513,6 +513,11 @@ static int kvm_mem_flags(MemoryRegion *mr)
     if (readonly && kvm_readonly_mem_allowed) {
         flags |= KVM_MEM_READONLY;
     }
+#ifdef CONFIG_HUGEPAGE_POD
+    if (memory_region_is_huge_pod(mr)) {
+        flags |= KVM_MEM_HUGE_POD;
+    }
+#endif
 
     return flags;
 }
@@ -4233,6 +4238,41 @@ void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
     }
 }
 
+#ifdef CONFIG_HUGEPAGE_POD
+int kvm_update_touched_log(void)
+{
+    return kvm_vm_ioctl(kvm_state, KVM_POD_TOUCHED_LOG, NULL);
+}
+
+int kvm_clear_slot_dirty_bitmap(void *ram)
+{
+    KVMState *s = kvm_state;
+    KVMMemoryListener *kml;
+    int i;
+    int ret = -1;
+
+    if (!s)
+        return ret;
+
+    kml = &s->memory_listener;
+    kvm_slots_lock();
+    for (i = 0; i < s->nr_slots; i++) {
+        KVMSlot *mem = &kml->slots[i];
+
+        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
+            kvm_slot_reset_dirty_pages(mem);
+            ret = 0;
+
+            qemu_log("Reset kvm slot dirty bitmap for ram %p\n", ram);
+            break;
+        }
+    }
+    kvm_slots_unlock();
+
+    return ret;
+}
+#endif
+
 void kvm_mark_guest_state_protected(void)
 {
     kvm_state->guest_state_protected = true;
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index d004cf29d2..9f41b303b0 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -39,6 +39,162 @@
 
 #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT)
 
+#ifdef CONFIG_HUGEPAGE_POD
+#define ULONGS_PER_HUGEPAGE 8 /* Number of unsigned longs per huge page in the bitmap */
+static bool guest_enabled_fpr = false;
+
+/* Set if the guest supports and has enabled free-page reporting */
+static void set_guest_enabled_fpr(bool enabled) {
+    guest_enabled_fpr = enabled;
+}
+
+/* Tracks the ballooned pages of one RAMBlock */
+typedef struct GlobalBalloonedPage {
+    void *base_hva;                   /* start HVA of a RAMBlock */
+    size_t page_nr;                   /* total 4KiB page count of a RAMBlock */
+    unsigned long *freed_page_bitmap; /* each set bit represents a freed 4KiB page */
+    int *hugepage_freed_pages;        /* each element counts the freed subpages of one hugepage */
+} GlobalBalloonedPage;
+
+#define PAGES_IN_HUGEPAGE 512
+#define HUGEPAGE_SHIFT 21
+#define GBP_LIST_LENGTH 8
+GlobalBalloonedPage *gbp_list[GBP_LIST_LENGTH] = { 0 };
+
+static GlobalBalloonedPage *find_gbp_by_addr(void *base_hva)
+{
+    int i;
+
+    for (i = 0; i < GBP_LIST_LENGTH; i++) {
+        GlobalBalloonedPage *gbp = gbp_list[i];
+        if (gbp == NULL) {
+            continue;
+        }
+
+        if (gbp->base_hva == base_hva) {
+            return gbp;
+        }
+    }
+    return NULL;
+}
+
+static GlobalBalloonedPage *alloc_new_gbp(void *base_hva,
+                                          ram_addr_t length)
+{
+    int i;
+
+    for (i = 0; i < GBP_LIST_LENGTH; i++) {
+        GlobalBalloonedPage *gbp = gbp_list[i];
+        if (gbp == NULL) {
+            gbp = g_malloc0(sizeof(GlobalBalloonedPage));
+            if (gbp == NULL) {
+                error_report("alloc memory for GlobalBalloonedPage failed");
+                return NULL;
+            }
+            gbp->base_hva = base_hva;
+            gbp->page_nr = length >> VIRTIO_BALLOON_PFN_SHIFT;
+            gbp->freed_page_bitmap = bitmap_new(gbp->page_nr);
+            gbp->hugepage_freed_pages = g_malloc0(gbp->page_nr / PAGES_IN_HUGEPAGE * sizeof(int));
+
+            gbp_list[i] = gbp;
+            return gbp;
+        }
+    }
+    warn_report("gbp list is full, max length: %d", GBP_LIST_LENGTH);
+
+    return NULL;
+}
+
+static void free_gbp(void)
+{
+    int i;
+
+    for (i = 0; i < GBP_LIST_LENGTH; i++) {
+        GlobalBalloonedPage *gbp = gbp_list[i];
+        if (gbp == NULL) {
+            continue;
+        }
+
+        g_free(gbp->freed_page_bitmap);
+        g_free(gbp->hugepage_freed_pages);
+        g_free(gbp);
+
+        gbp_list[i] = NULL;
+    }
+}
+
+static inline void clear_subpages_in_hugepage(GlobalBalloonedPage *gbp, unsigned long hugepage_index)
+{
+    if (hugepage_index * ULONGS_PER_HUGEPAGE < gbp->page_nr) {
+        bitmap_zero(&gbp->freed_page_bitmap[hugepage_index * ULONGS_PER_HUGEPAGE], PAGES_IN_HUGEPAGE);
+    }
+}
+
+static inline bool all_subpages_in_hugepage_freed(GlobalBalloonedPage *gbp, unsigned long hugepage_index)
+{
+    return hugepage_index * ULONGS_PER_HUGEPAGE < gbp->page_nr &&
+           bitmap_full(&gbp->freed_page_bitmap[hugepage_index * ULONGS_PER_HUGEPAGE],
+                       PAGES_IN_HUGEPAGE);
+}
+
+static void mark_freed_subpage(RAMBlock *rb, ram_addr_t rb_offset)
+{
+    void *base_hva = qemu_ram_get_host_addr(rb);
+    ram_addr_t length = qemu_ram_get_max_length(rb);
+    ram_addr_t rb_page_size = qemu_ram_pagesize(rb);
+    ram_addr_t rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size);
+    unsigned long page_index = rb_offset >> VIRTIO_BALLOON_PFN_SHIFT;
+    unsigned long hugepage_index = rb_offset >> HUGEPAGE_SHIFT;
+    GlobalBalloonedPage *gbp = find_gbp_by_addr(base_hva);
+    if (gbp == NULL) {
+        gbp = alloc_new_gbp(base_hva, length);
+        if (gbp == NULL) {
+            return;
+        }
+    }
+
+    /* When a subpage is released by the balloon, set the bit of this page */
+    if (page_index < gbp->page_nr && !test_and_set_bit(page_index, gbp->freed_page_bitmap)) {
+        if (hugepage_index < (gbp->page_nr / PAGES_IN_HUGEPAGE)) {
+            gbp->hugepage_freed_pages[hugepage_index]++;
+            /*
+             * All bits are set, meaning every subpage of this hugepage has been
+             * freed by the balloon, so we can release the hugepage back to the host.
+             */
+            if (gbp->hugepage_freed_pages[hugepage_index] == PAGES_IN_HUGEPAGE) {
+                clear_subpages_in_hugepage(gbp, hugepage_index);
+                gbp->hugepage_freed_pages[hugepage_index] = 0;
+
+                /* Release this hugepage back to the host */
+                ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
+            }
+        }
+    }
+}
+
+static void mark_used_subpage(RAMBlock *rb, ram_addr_t rb_offset)
+{
+    void *base_hva = qemu_ram_get_host_addr(rb);
+    unsigned long page_index = rb_offset >> VIRTIO_BALLOON_PFN_SHIFT;
+    unsigned long hugepage_index = rb_offset >> HUGEPAGE_SHIFT;
+    GlobalBalloonedPage *gbp = find_gbp_by_addr(base_hva);
+    if (gbp == NULL) {
+        warn_report("Couldn't find gbp of rb_offset 0x%lx", rb_offset);
+        return;
+    }
+
+    /*
+     * When a subpage is deflated back to the guest, clear the bit of this page.
+     * The subpage may be used by the guest again, so it must not be released
+     * to the host by mark_freed_subpage().
+     */
+    if (page_index < gbp->page_nr && test_and_clear_bit(page_index, gbp->freed_page_bitmap)) {
+        if (hugepage_index < (gbp->page_nr / PAGES_IN_HUGEPAGE)) {
+            gbp->hugepage_freed_pages[hugepage_index]--;
+        }
+    }
+}
+#endif
+
 typedef struct PartiallyBalloonedPage {
     ram_addr_t base_gpa;
     unsigned long *bitmap;
@@ -92,6 +248,14 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
     rb = qemu_ram_block_from_host(addr, false, &rb_offset);
     rb_page_size = qemu_ram_pagesize(rb);
 
+#ifdef CONFIG_HUGEPAGE_POD
+    if (rb_page_size == (1 << HUGEPAGE_SHIFT)) {
+        /* 2M pagesize case */
+        mark_freed_subpage(rb, rb_offset);
+        return;
+    }
+#endif
+
     if (rb_page_size == BALLOON_PAGE_SIZE) {
         /* Easy case */
 
@@ -157,6 +321,14 @@ static void balloon_deflate_page(VirtIOBalloon *balloon,
     rb = qemu_ram_block_from_host(addr, false, &rb_offset);
     rb_page_size = qemu_ram_pagesize(rb);
 
+#ifdef CONFIG_HUGEPAGE_POD
+    if (rb_page_size == (1 << HUGEPAGE_SHIFT)) {
+        /* 2M pagesize case */
+        mark_used_subpage(rb, rb_offset);
+        return;
+    }
+#endif
+
     host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
 
     /* When a page is deflated, we hint the whole host page it lives
@@ -257,6 +429,14 @@ static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name,
         goto out_end;
     }
     for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+#ifdef CONFIG_HUGEPAGE_POD
+        if (guest_enabled_fpr && i == VIRTIO_BALLOON_S_CACHES) {
+            if (i < VIRTIO_BALLOON_S_NR) {
+                s->stats[i] |= 1024;
+            }
+        }
+#endif
+
         if (!visit_type_uint64(v, balloon_stat_names[i], &s->stats[i],
                                errp)) {
             goto out_nested;
         }
@@ -379,6 +559,10 @@ static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq)
             ram_block_discard_range(rb, ram_offset, size);
         }
 
+#ifdef CONFIG_HUGEPAGE_POD
+        set_guest_enabled_fpr(true);
+#endif
+
     skip_element:
         virtqueue_push(vq, elem, 0);
         virtio_notify(vdev, vq);
@@ -923,6 +1107,9 @@ static void virtio_balloon_device_unrealize(DeviceState *dev)
         virtio_delete_queue(s->reporting_vq);
     }
     virtio_cleanup(vdev);
+#ifdef CONFIG_HUGEPAGE_POD
+    free_gbp();
+#endif
 }
 
 static void virtio_balloon_device_reset(VirtIODevice *vdev)
@@ -940,6 +1127,9 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev)
     }
 
     s->poison_val = 0;
+#ifdef CONFIG_HUGEPAGE_POD
+    set_guest_enabled_fpr(false);
+#endif
 }
 
 static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 51fe10d4a0..c5edf864e1 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2109,6 +2109,17 @@ static inline bool memory_region_is_nonvolatile(MemoryRegion *mr)
     return mr->nonvolatile;
 }
 
+#ifdef CONFIG_HUGEPAGE_POD
+/**
+ * memory_region_is_huge_pod: check whether a memory region uses POD hugepages
+ *
+ * Returns %true if the memory region is backed by POD hugepages.
+ *
+ * @mr: the memory region being queried
+ */
+bool memory_region_is_huge_pod(MemoryRegion *mr);
+#endif
+
 /**
  * memory_region_get_fd: Get a file descriptor backing a RAM memory region.
  *
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 7602cd4429..de68df91a3 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -606,6 +606,11 @@ int kvm_create_shadow_device(PCIDevice *dev);
 int kvm_delete_shadow_device(PCIDevice *dev);
 #endif
 
+#ifdef CONFIG_HUGEPAGE_POD
+int kvm_update_touched_log(void);
+int kvm_clear_slot_dirty_bitmap(void *ram);
+#endif
+
 void kvm_mark_guest_state_protected(void);
 
 #endif
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 96bc60475e..422a811f7e 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -104,6 +104,9 @@ struct kvm_userspace_memory_region {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
 #define KVM_MEM_READONLY        (1UL << 1)
+#ifdef CONFIG_HUGEPAGE_POD
+#define KVM_MEM_HUGE_POD        (1UL << 9)
+#endif
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -1785,6 +1788,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_ARM_SVE */
 #define KVM_ARM_VCPU_FINALIZE   _IOW(KVMIO,  0xc2, int)
 
+/* Available always */
+#define KVM_POD_TOUCHED_LOG     _IO(KVMIO,   0xfe)
+
 /* Available with KVM_CAP_S390_VCPU_RESETS */
 #define KVM_S390_NORMAL_RESET   _IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET    _IO(KVMIO,   0xc4)
diff --git a/meson.build b/meson.build
index 50b1e31edf..d379a71927 100644
--- a/meson.build
+++ b/meson.build
@@ -573,6 +573,13 @@ have_mbind_proportion = get_option('mbind_by_proportion') \
 
 config_host_data.set('CONFIG_MBIND_PROPORTION', have_mbind_proportion)
 
+# hugepage pod
+have_hugepage_pod = get_option('hugepage_pod') \
+  .require(targetos == 'linux', error_message: 'hugepage_pod is supported only on Linux') \
+  .allowed()
+
+config_host_data.set('CONFIG_HUGEPAGE_POD', have_hugepage_pod)
+
 # vhost
 have_vhost_user = get_option('vhost_user') \
   .disable_auto_if(targetos != 'linux') \
@@ -4495,6 +4502,7 @@ summary_info += {'FUSE lseek':        fuse_lseek.found()}
 summary_info += {'selinux':           selinux}
 summary_info += {'libdw':             libdw}
 summary_info += {'mbind proportion':  have_mbind_proportion}
+summary_info += {'hugepage pod':      have_hugepage_pod}
 summary(summary_info, bool_yn: true, section: 'Dependencies')
 
 if host_arch == 'unknown'
diff --git a/meson_options.txt b/meson_options.txt
index 94a9b479bd..f446612ff6 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -377,3 +377,6 @@ option('hexagon_idef_parser', type : 'boolean', value : true,
 
 option('mbind_by_proportion', type: 'feature', value: 'auto',
        description: ' support of one guest numa node alloc memory from multi host nodes')
+
+option('hugepage_pod', type: 'feature', value: 'auto',
+       description: ' support of hugepage use on demand')
diff --git a/migration/migration.c b/migration/migration.c
index eba3f9d17d..91b2267c3f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -67,6 +67,10 @@
 #include "options.h"
 #include "sysemu/dirtylimit.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_HUGEPAGE_POD
+#include "qemu/log-for-trace.h"
+#include "sysemu/kvm.h"
+#endif
 
 #define DEFAULT_FD_MAX 4096
 
@@ -3777,3 +3781,33 @@ static void register_migration_types(void)
 }
 
 type_init(register_migration_types);
+
+#ifdef CONFIG_HUGEPAGE_POD
+#define TOUCHED_LOG_TRY_TIME_MAX 3
+int ram_init_touched_log(void)
+{
+    int ret;
+    int try_times = 0;
+
+    qemu_log("start init touched log\n");
+    while (try_times < TOUCHED_LOG_TRY_TIME_MAX) {
+        ret = kvm_update_touched_log();
+        if (!ret) {
+            qemu_log("end init touched log\n");
+            return ret;
+        }
+        if (ret == -EINTR) {
+            try_times++;
+            continue;
+        }
+        if (ret) {
+            if (ret == -ENOSYS) {
+                qemu_log("kvm does not support touched log\n");
log\n"); + } + qemu_log("touched log failed (%d)\n", ret); + return ret; + } + } + return -EINTR; +} +#endif diff --git a/migration/migration.h b/migration/migration.h index eeddb7c0bd..66fe4dd799 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -555,4 +555,8 @@ void migrate_fd_cancel(MigrationState *s); bool memcrypt_enabled(void); +#ifdef CONFIG_HUGEPAGE_POD +int ram_init_touched_log(void); +#endif + #endif diff --git a/migration/ram.c b/migration/ram.c index 028b1ebb6e..b46de7cd6d 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -77,6 +77,10 @@ #include "qemu/userfaultfd.h" #endif /* defined(__linux__) */ +#ifdef CONFIG_HUGEPAGE_POD +#include "sysemu/kvm.h" +#endif + /***********************************************************/ /* ram save/restore */ @@ -3255,6 +3259,10 @@ static void ram_init_bitmaps(RAMState *rs) migration_bitmap_clear_discarded_pages(rs); } +#ifdef CONFIG_HUGEPAGE_POD +static int ram_init_touched_bitmap(RAMState *rs); +#endif + static int ram_init_all(RAMState **rsp) { if (ram_state_init(rsp)) { @@ -3267,6 +3275,11 @@ static int ram_init_all(RAMState **rsp) } ram_init_bitmaps(*rsp); +#ifdef CONFIG_HUGEPAGE_POD + if (ram_init_touched_bitmap(*rsp)) { + return -1; + } +#endif return 0; } @@ -4794,3 +4807,54 @@ void ram_mig_init(void) register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); ram_block_notifier_add(&ram_mig_ram_notifier); } + +#ifdef CONFIG_HUGEPAGE_POD +static int ram_init_touched_bitmap(RAMState *rs) +{ + RAMBlock *block; + bool has_pod = false; + + qemu_mutex_lock_ramlist(); + rcu_read_lock(); + RAMBLOCK_FOREACH_NOT_IGNORED(block) { + if (!memory_region_is_huge_pod(block->mr)) { + continue; + } + + kvm_clear_slot_dirty_bitmap(block->host); + has_pod = true; + } + rcu_read_unlock(); + qemu_mutex_unlock_ramlist(); + + if (!has_pod) { + return 0; + } + + if (ram_init_touched_log()) { + error_report("POD: Init touched log failed\n"); + return -1; + } + + info_report("Start update touched log bitmaps\n"); + qemu_mutex_lock_ramlist(); + rcu_read_lock(); + RAMBLOCK_FOREACH_NOT_IGNORED(block) { + if (!memory_region_is_huge_pod(block->mr)) { + continue; + } + + ram_state->migration_dirty_pages -= + bitmap_count_one_with_offset(block->bmap, 0, + block->used_length >> TARGET_PAGE_BITS); + bitmap_clear(block->bmap, 0, block->used_length >> TARGET_PAGE_BITS); + } + migration_bitmap_sync_precopy(rs, false); + rcu_read_unlock(); + qemu_mutex_unlock_ramlist(); + + info_report("End update touched log bitmaps, touched pages %lu\n", + (unsigned long)ram_state->migration_dirty_pages); + return 0; +} +#endif diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index d5d9130540..06f4f803c9 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -228,6 +228,7 @@ meson_options_help() { printf "%s\n" ' mbind-by-proportion' printf "%s\n" ' support of one guest numa node alloc memory from multi' printf "%s\n" ' host nodes' + printf "%s\n" ' hugepage-pod support of hugepage use on demand' } _meson_option_parse() { case $1 in @@ -576,6 +577,8 @@ _meson_option_parse() { --disable-uadk) printf "%s" -Duadk=disabled ;; --enable-mbind-by-proportion) printf "%s" -Dmbind_by_proportion=enabled ;; --disable-mbind-by-proportion) printf "%s" -Dmbind_by_proportion=disabled ;; + --enable-hugepage-pod) printf "%s" -Dhugepage_pod=enabled ;; + --disable-hugepage-pod) printf "%s" -Dhugepage_pod=disabled ;; *) return 1 ;; esac } diff --git a/system/memory.c b/system/memory.c index fa99009701..bf331d0e7b 100644 --- 
+++ b/system/memory.c
@@ -3786,3 +3786,28 @@ static void memory_register_types(void)
 }
 
 type_init(memory_register_types)
+
+#ifdef CONFIG_HUGEPAGE_POD
+#define HUGEPAGESIZE (1 << 21)
+bool memory_region_is_huge_pod(MemoryRegion *mr)
+{
+    HostMemoryBackend *backend;
+
+    rcu_read_lock();
+    while (mr->alias) {
+        mr = mr->alias;
+    }
+    backend = (HostMemoryBackend *)object_dynamic_cast(mr->owner, TYPE_MEMORY_BACKEND);
+    rcu_read_unlock();
+
+    if (backend == NULL || backend->prealloc) {
+        return false;
+    }
+
+    if (host_memory_backend_pagesize(backend) != HUGEPAGESIZE) {
+        return false;
+    }
+
+    return true;
+}
+#endif
-- 
Gitee
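
Below is a standalone sketch (not part of the patch above) of the per-hugepage
accounting that mark_freed_subpage()/mark_used_subpage() implement: one bit per
4KiB subpage of a 2MiB hugepage plus a counter, so the hugepage is discarded
only once every subpage has been ballooned out. The names (HugePageState,
track_freed, track_used) and the plain uint64_t bitmap are illustrative
assumptions; the patch itself uses QEMU's bitmap API and calls
ram_block_discard_range() at the point marked below.

/* Standalone illustration of the subpage-tracking idea, plain C. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SUBPAGES_PER_HUGEPAGE 512                /* 2MiB / 4KiB */
#define WORDS_PER_HUGEPAGE    (SUBPAGES_PER_HUGEPAGE / 64)

typedef struct {
    uint64_t freed[WORDS_PER_HUGEPAGE];          /* one bit per 4KiB subpage */
    int freed_count;                             /* number of set bits */
} HugePageState;

/* Mark a subpage as ballooned out; return true when the hugepage is empty. */
static bool track_freed(HugePageState *hp, unsigned subpage)
{
    uint64_t mask = 1ULL << (subpage % 64);

    if (subpage >= SUBPAGES_PER_HUGEPAGE || (hp->freed[subpage / 64] & mask)) {
        return false;                            /* out of range or already freed */
    }
    hp->freed[subpage / 64] |= mask;
    if (++hp->freed_count == SUBPAGES_PER_HUGEPAGE) {
        memset(hp->freed, 0, sizeof(hp->freed));
        hp->freed_count = 0;
        return true;                             /* QEMU would call ram_block_discard_range() here */
    }
    return false;
}

/* Mark a subpage as deflated back to the guest. */
static void track_used(HugePageState *hp, unsigned subpage)
{
    uint64_t mask = 1ULL << (subpage % 64);

    if (subpage < SUBPAGES_PER_HUGEPAGE && (hp->freed[subpage / 64] & mask)) {
        hp->freed[subpage / 64] &= ~mask;
        hp->freed_count--;
    }
}

int main(void)
{
    HugePageState hp = { 0 };
    unsigned i;
    bool discard = false;

    for (i = 0; i < SUBPAGES_PER_HUGEPAGE - 1; i++) {
        discard |= track_freed(&hp, i);
    }
    printf("511 subpages freed, discard=%d\n", discard);       /* 0: not yet */
    track_used(&hp, 0);                                        /* guest reuses one subpage */
    discard = track_freed(&hp, SUBPAGES_PER_HUGEPAGE - 1);
    printf("512th freed but one in use, discard=%d, count=%d\n",
           discard, hp.freed_count);                           /* 0, 511 */
    discard = track_freed(&hp, 0);
    printf("last subpage freed again, discard=%d\n", discard); /* 1: whole hugepage reclaimable */
    return 0;
}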