From 809a9ee44dad88067625d9236a8cd59f744517a1 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:11 +0530 Subject: [PATCH 01/17] powerpc/smp: Enable Asym packing for cores on shared processor ANBZ: #33236 commit aa80c6343fcf53cbc29f84ba9f89ca87d4e41350 upstream. If there are shared processor LPARs, the underlying Hypervisor can have more virtual cores to handle than actual physical cores. Starting with Power 9, a big core (aka SMT8 core) has 2 nearly independent thread groups. On shared processor LPARs, it helps to pack threads onto fewer cores so that the overall system performance and utilization improves. PowerVM schedules at a big core level. Hence packing to fewer cores helps. Since each thread-group is independent, running threads on both the thread-groups of an SMT8 core should have a minimal adverse impact in non-over-provisioned scenarios. The changes in this patchset will not affect the over-provisioned scenario. If there are more threads than SMT domains, then asym_packing will not kick in. For example: Let's say there are two 8-core Shared LPARs that are actually sharing an 8-core shared physical pool, each running 8 threads. Then consolidating 8 threads to 4 cores on each LPAR would help them to perform better. This is because each of the LPARs will get 100% of the time to run applications and there will be no switching required by the Hypervisor. To achieve this, enable SD_ASYM_PACKING flag at CACHE, MC and DIE level when the system is running in shared processor mode and has big cores. Intel-SIG: commit aa80c6343fcf powerpc/smp: Enable Asym packing for cores on shared processor. Backport SNC devination dependency. 
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-2-srikar@linux.vnet.ibm.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 4e4870031265..f39d3bce0fc4 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1003,6 +1003,13 @@ static int powerpc_smt_flags(void) } #endif +/* + * On shared processor LPARs scheduled on a big core (which has two or more + * independent thread groups per core), prefer lower numbered CPUs, so + * that workload consolidates to lesser number of cores. + */ +static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); + /* * P9 has a slightly odd architecture where pairs of cores share an L2 cache. * This topology makes it *much* cheaper to migrate tasks between adjacent cores @@ -1011,9 +1018,20 @@ static int powerpc_smt_flags(void) */ static int powerpc_shared_cache_flags(void) { + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + return SD_SHARE_PKG_RESOURCES; } +static int powerpc_shared_proc_flags(void) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_ASYM_PACKING; + + return 0; +} + /* * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. 
@@ -1050,8 +1068,8 @@ static struct sched_domain_topology_level powerpc_topology[] = { { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_mc_mask, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + { cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) }, + { cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -1686,6 +1704,9 @@ static void __init fixup_topology(void) { int i; + if (is_shared_processor() && has_big_cores) + static_branch_enable(&splpar_asym_pack); + #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); -- Gitee From 1c2df8a338c7594030fa0cf8bdc0272e547e9cfd Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:12 +0530 Subject: [PATCH 02/17] powerpc/smp: Disable MC domain for shared processor ANBZ: #33236 commit 0e1c1986e0e65746daa05405d7747ce882f83cf1 upstream. Like L2-cache info, coregroup information which is used to determine MC sched domains is only present on dedicated LPARs. i.e PowerVM doesn't export coregroup information for shared processor LPARs. Hence disable creating MC domains on shared LPAR Systems. Intel-SIG: commit 0e1c1986e0e6 powerpc/smp: Disable MC domain for shared processor. Backport SNC devination dependency. 
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-3-srikar@linux.vnet.ibm.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f39d3bce0fc4..41344a4e0bf1 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1055,6 +1055,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu) static bool has_coregroup_support(void) { + /* Coregroup identification not available on shared systems */ + if (is_shared_processor()) + return 0; + return coregroup_enabled; } -- Gitee From 2279a71b553143cdaeb0a114d249f220b2f91bed Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:13 +0530 Subject: [PATCH 03/17] powerpc/smp: Add __ro_after_init attribute ANBZ: #33236 commit fd535a858ebeb1f478b1d065b6c057f52aad483a upstream. There are some variables that are only updated at boot time. So add __ro_after_init attribute to such variables Intel-SIG: commit fd535a858ebe powerpc/smp: Add __ro_after_init attribute. Backport SNC devination dependency. 
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-4-srikar@linux.vnet.ibm.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 41344a4e0bf1..c609059478bb 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -77,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct task_struct *secondary_current; -bool has_big_cores; -bool coregroup_enabled; -bool thread_group_shares_l2; -bool thread_group_shares_l3; +bool has_big_cores __ro_after_init; +bool coregroup_enabled __ro_after_init; +bool thread_group_shares_l2 __ro_after_init; +bool thread_group_shares_l3 __ro_after_init; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -987,7 +987,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return 0; } -static bool shared_caches; +static bool shared_caches __ro_after_init; #ifdef CONFIG_SCHED_SMT /* cpumask of CPUs with asymmetric SMT dependency */ -- Gitee From 5b2b851e572ce811bab1007f578e0c6fbf865eea Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:14 +0530 Subject: [PATCH 04/17] powerpc/smp: Avoid asym packing within thread_group of a core ANBZ: #33236 commit 0e93f1c780e8fd315f1262467b7d35eb6f766d2f upstream. PowerVM Hypervisor will schedule at a core granularity. However each core can have more than one thread_groups. For better utilization in case of a shared processor, its preferable for the scheduler to pack to the lowest core. However there is no benefit of moving a thread between two thread groups of the same core. Intel-SIG: commit 0e93f1c780e8 powerpc/smp: Avoid asym packing within thread_group of a core. Backport SNC devination dependency. 
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-5-srikar@linux.vnet.ibm.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index c609059478bb..181bf8755279 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1763,6 +1763,19 @@ void __init smp_cpus_done(unsigned int max_cpus) set_sched_topology(powerpc_topology); } +/* + * For asym packing, by default lower numbered CPU has higher priority. + * On shared processors, pack to lower numbered core. However avoid moving + * between thread_groups within the same core. + */ +int arch_asym_cpu_priority(int cpu) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return -cpu / threads_per_core; + + return -cpu; +} + #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { -- Gitee From 4c218ee9b4259d4df2cfea99c79dbc9c9c3834b3 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:15 +0530 Subject: [PATCH 05/17] powerpc/smp: Dynamically build Powerpc topology ANBZ: #33236 commit c46975715f5a7b941aa09bc0539a8dbe297f308f upstream. Currently there are four Powerpc specific sched topologies. These are all statically defined. However not all these topologies are used by all Powerpc systems. To avoid unnecessary degenerations by the scheduler, masks and flags are compared. However if the sched topologies are build dynamically then the code is simpler and there are greater chances of avoiding degenerations. Note: Even X86 builds its sched topologies dynamically and proposed changes are very similar to the way X86 is building its topologies. Intel-SIG: commit c46975715f5a powerpc/smp: Dynamically build Powerpc topology. Backport SNC devination dependency. 
Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-6-srikar@linux.vnet.ibm.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 78 ++++++++++++++------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 181bf8755279..39679a5c14b5 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -93,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); EXPORT_SYMBOL_GPL(has_big_cores); -enum { -#ifdef CONFIG_SCHED_SMT - smt_idx, -#endif - cache_idx, - mc_idx, - die_idx, -}; - #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 #define THREAD_GROUP_SHARE_L2_L3 2 @@ -1067,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu) return cpu_coregroup_mask(cpu); } -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) }, - { NULL, }, -}; - static int __init init_big_cores(void) { int cpu; @@ -1704,9 +1685,11 @@ void start_secondary(void *unused) BUG(); } -static void __init fixup_topology(void) +static struct sched_domain_topology_level powerpc_topology[6]; + +static void __init build_sched_topology(void) { - int i; + int i = 0; if (is_shared_processor() && has_big_cores) static_branch_enable(&splpar_asym_pack); @@ -1714,36 +1697,33 @@ static void __init fixup_topology(void) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[smt_idx].mask = smallcore_smt_mask; + powerpc_topology[i++] = (struct sched_domain_topology_level){ + smallcore_smt_mask, 
powerpc_smt_flags, SD_INIT_NAME(SMT) + }; + } else { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; } #endif + if (shared_caches) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) + }; + } + if (has_coregroup_support()) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) + }; + } + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) + }; - if (!has_coregroup_support()) - powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask; - - /* - * Try to consolidate topology levels here instead of - * allowing scheduler to degenerate. - * - Dont consolidate if masks are different. - * - Dont consolidate if sd_flags exists and are different. - */ - for (i = 1; i <= die_idx; i++) { - if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask) - continue; - - if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags && - powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags) - continue; - - if (!powerpc_topology[i - 1].sd_flags) - powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags; + /* There must be one trailing NULL entry left. 
*/ + BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); - powerpc_topology[i].mask = powerpc_topology[i + 1].mask; - powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags; -#ifdef CONFIG_SCHED_DEBUG - powerpc_topology[i].name = powerpc_topology[i + 1].name; -#endif - } + set_sched_topology(powerpc_topology); } void __init smp_cpus_done(unsigned int max_cpus) @@ -1758,9 +1738,7 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_ops->bringup_done(); dump_numa_cpu_topology(); - - fixup_topology(); - set_sched_topology(powerpc_topology); + build_sched_topology(); } /* -- Gitee From df17baab2b053f617259ba9d537af97b3ee10609 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 6 Aug 2024 20:08:23 +0800 Subject: [PATCH 06/17] x86/mm: Don't print out SRAT table information ANBZ: #33236 commit 830a0d12943f53077b235f2a3caa8ab2b36475a3 upstream. This per CPU log is becoming longer with more and more CPUs in system, which slows down the boot process due to the serializing nature of printk(). The value of this information is dubious and it can be retrieved by lscpu from user space if required.. Downgrade the printk() to pr_debug() so it is still accessible for debug purposes. [ tglx: Massaged changelog ] Intel-SIG: commit 830a0d12943f Don't print out SRAT table information. Backport SNC devination dependency. 
Signed-off-by: Li RongQing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240806120823.17111-1-lirongqing@baidu.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/mm/srat.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9c52a95937ad..6f8e0f21c710 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ @@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } int __init x86_acpi_numa_init(void) -- Gitee From 8787d5ffddcfd4590db30cd462377c7aef1dd436 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 23 Dec 2024 04:34:03 +0000 Subject: [PATCH 07/17] x86/topology: Remove x86_smt_flags and use cpu_smt_flags directly ANBZ: #33236 commit 537e247879589f6bace747e3479e4abf42dbbbdc upstream. x86_*_flags() wrappers were introduced with commit d3d37d850d1d ("x86/sched: Add SD_ASYM_PACKING flags to x86 ITMT CPU") to add x86_sched_itmt_flags() in addition to the default domain flags for SMT and MC domain. commit 995998ebdebd ("x86/sched: Remove SD_ASYM_PACKING from the SMT domain flags") removed the ITMT flags for SMT domain but not the x86_smt_flags() wrappers which directly returns cpu_smt_flags(). Remove x86_smt_flags() and directly use cpu_smt_flags() to derive the flags for SMT domain. No functional changes intended. 
Intel-SIG: commit 537e24787958 Remove x86_smt_flags and use cpu_smt_flags directly. Backport SNC devination dependency. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tim Chen Link: https://lore.kernel.org/r/20241223043407.1611-5-kprateek.nayak@amd.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3426d6aea42b..bf4a831a03de 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -491,12 +491,6 @@ static int x86_core_flags(void) return cpu_core_flags() | x86_sched_itmt_flags(); } #endif -#ifdef CONFIG_SCHED_SMT -static int x86_smt_flags(void) -{ - return cpu_smt_flags(); -} -#endif #ifdef CONFIG_SCHED_CLUSTER static int x86_cluster_flags(void) { @@ -519,7 +513,7 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }; #endif #ifdef CONFIG_SCHED_CLUSTER -- Gitee From bb094066c39b0ad5f27a6c9c89b303921b87450f Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Wed, 15 Apr 2026 09:09:51 +0800 Subject: [PATCH 08/17] sched: Move sched domain name out of CONFIG_SCHED_DEBUG ANBZ: #33236 commit 1c055a0f5d3bafaca5d218bbb3e4e63d6307be45 upstream. /proc/schedstat file shows cpu and sched domain level scheduler statistics. It does not show domain name instead shows domain level. It will be very useful for tools like `perf sched stats`[1] to aggragate domain level stats if domain names are shown in /proc/schedstat. But sched domain name is guarded by CONFIG_SCHED_DEBUG. As per the discussion[2], move sched domain name out of CONFIG_SCHED_DEBUG. 
[1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/ [2] https://lore.kernel.org/lkml/fcefeb4d-3acb-462d-9c9b-3df8d927e522@amd.com/ Intel-SIG: commit 1c055a0f5d3b Move sched domain name out of CONFIG_SCHED_DEBUG. Backport SNC devination dependency. Suggested-by: "Gautham R. Shenoy" Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-5-swapnil.sapkal@amd.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- include/linux/sched/topology.h | 9 --------- kernel/sched/topology.c | 4 ---- 2 files changed, 13 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7ce562c7fd64..2a608b9f2ab2 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -147,9 +147,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -210,20 +208,13 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); - -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e8ae21c49a9d..4f360c971608 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1683,9 +1683,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -2441,10 +2439,8 @@ static struct sched_domain 
*build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), -- Gitee From cf6b41a6b3a41255647936d07f58bda676fd5708 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Wed, 15 Apr 2026 09:17:46 +0800 Subject: [PATCH 09/17] smpboot: introduce SDTL_INIT() helper to tidy sched topology setup ANBZ: #33236 commit e075f4360931263f5ec006ea5dadc065e5e98eb8 upstream. Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged. Intel-SIG: commit e075f4360931 introduce SDTL_INIT() helper to tidy sched topology setup. Backport SNC devination dependency. Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/powerpc/kernel/smp.c | 25 ++++++++++--------------- arch/s390/kernel/topology.c | 10 +++++----- arch/x86/kernel/smpboot.c | 21 ++++++--------------- include/linux/sched/topology.h | 3 ++- kernel/sched/topology.c | 24 ++++++++---------------- 5 files changed, 31 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 39679a5c14b5..f354e8d1e487 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1697,28 +1697,23 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[i++] = (struct sched_domain_topology_level){ - smallcore_smt_mask, powerpc_smt_flags, 
SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = + SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) - }; + powerpc_topology[i++] = + SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); } + if (has_coregroup_support()) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) - }; + powerpc_topology[i++] = + SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) - }; + + powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. 
*/ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 66bda6a8f918..4d94c45022eb 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -518,11 +518,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu) } static struct sched_domain_topology_level s390_topology[] = { - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, - { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(cpu_book_mask, NULL, BOOK), + SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf4a831a03de..e3e508435982 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -512,35 +512,26 @@ static void __init build_sched_topology(void) int i = 0; #ifdef CONFIG_SCHED_SMT - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) - }; + x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); #endif #ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; + x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); #endif #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) - }; + x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC); #endif /* * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. 
*/ - if (!x86_has_numa_in_package) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) - }; - } + if (!x86_has_numa_in_package) + x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG); /* * There must be one trailing NULL entry left. */ - BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1); set_sched_topology(x86_topology); } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 2a608b9f2ab2..f489cf2636f1 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -214,7 +214,8 @@ struct sched_domain_topology_level { extern void __init set_sched_topology(struct sched_domain_topology_level *tl); extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); -# define SD_INIT_NAME(type) .name = #type +#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ + { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) #else /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4f360c971608..24978f702f2b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1747,17 +1747,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2070,23 +2070,15 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single 
NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; + tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; -- Gitee From efae0af60467aef7407fc06df4fb1c13a968b5b4 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:08 +0800 Subject: [PATCH 10/17] x86/smpboot: remove redundant CONFIG_SCHED_SMT ANBZ: #33236 commit 992de2b02509bed68f693ea5a68b07cd586197b7 upstream. On x86 CONFIG_SCHED_SMT is default y if SMP is enabled, so let's simply drop CONFIG_SCHED_SMT. Intel-SIG: commit 992de2b02509 remove redundant CONFIG_SCHED_SMT. Backport SNC devination dependency. 
Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-3-me@linux.beauty [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e3e508435982..4856ffffa0b5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -511,9 +511,7 @@ static void __init build_sched_topology(void) { int i = 0; -#ifdef CONFIG_SCHED_SMT x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); -#endif #ifdef CONFIG_SCHED_CLUSTER x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); #endif -- Gitee From c5e85837b4e108c7a194e298ecad146275de2469 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:09 +0800 Subject: [PATCH 11/17] x86/smpboot: moves x86_topology to static initialize and truncate ANBZ: #33236 commit fbc2010d92e595dc13d8048db2419f963c8cb25e upstream. The #ifdeffery and the initializers in build_sched_topology() are just disgusting. Statically initialize the domain levels in the topology array and let build_sched_topology() invalidate the package domain level when NUMA in package is available. Intel-SIG: commit fbc2010d92e5 moves x86_topology to static initialize and truncate. Backport SNC devination dependency. 
Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-4-me@linux.beauty [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4856ffffa0b5..11358a85f306 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -505,32 +505,30 @@ static int x86_cluster_flags(void) */ static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_topology[6]; - -static void __init build_sched_topology(void) -{ - int i = 0; - - x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); +static struct sched_domain_topology_level x86_topology[] = { + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); + SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC); + SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), #endif - /* - * When there is NUMA topology inside the package skip the PKG domain - * since the NUMA domains will auto-magically create the right spanning - * domains based on the SLIT. - */ - if (!x86_has_numa_in_package) - x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG); + SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + { NULL }, +}; +static void __init build_sched_topology(void) +{ /* - * There must be one trailing NULL entry left. + * When there is NUMA topology inside the package invalidate the + * PKG domain since the NUMA domains will auto-magically create the + * right spanning domains based on the SLIT. 
*/ - BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1); + if (x86_has_numa_in_package) { + unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; + memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); + } set_sched_topology(x86_topology); } -- Gitee From c2f05f466a12aa6a4d792dffcaa7f17a60562016 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Wed, 15 Apr 2026 09:19:16 +0800 Subject: [PATCH 12/17] x86/smpboot: avoid SMT domain attach/destroy if SMT is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #33236 commit f79c9aa446d638190578515afcd06d6c9d72da55 upstream. Currently, the SMT domain is added into sched_domain_topology by default. If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight is just 1, it will destroy it. On a large machine, such as one with 512 cores, this results in 512 redundant domain attach/destroy operations. Avoid these unnecessary operations by simply checking cpu_smt_num_threads and skip SMT domain if the SMT domain is not enabled. Intel-SIG: commit f79c9aa446d6 avoid SMT domain attach/destroy if SMT is not enabled. Backport SNC devination dependency. 
Suggested-by: K Prateek Nayak Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-5-me@linux.beauty [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 11358a85f306..861849fd9ee1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -519,6 +519,8 @@ static struct sched_domain_topology_level x86_topology[] = { static void __init build_sched_topology(void) { + struct sched_domain_topology_level *topology = x86_topology; + /* * When there is NUMA topology inside the package invalidate the * PKG domain since the NUMA domains will auto-magically create the @@ -529,7 +531,15 @@ static void __init build_sched_topology(void) memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); } - set_sched_topology(x86_topology); + + /* + * Drop the SMT domains if there is only one thread per-core + * since it'll get degenerated by the scheduler anyways. + */ + if (cpu_smt_num_threads <= 1) + ++topology; + + set_sched_topology(topology); } #ifdef CONFIG_NUMA -- Gitee From 86b01eefba20e096358cca4309531054bed4e1cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2026 08:57:34 +0800 Subject: [PATCH 13/17] x86/numa: Store extra copy of numa_nodes_parsed ANBZ: #33236 commit 48084cc153a5b0fbf0aa98d47670d3be0b9f64d5 upstream. The topology setup code needs to know the total number of physical nodes enumerated in SRAT; however NUMA_EMU can cause the existing numa_nodes_parsed bitmap to be fictitious. Therefore, keep a copy of the bitmap specifically to retain the physical node count. Intel-SIG: commit 48084cc153a5 x86/numa: Store extra copy of numa_nodes_parsed. Backport SNC divination. 
Suggested-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110059.889884023@infradead.org [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/numa.h | 6 ++++++ arch/x86/mm/numa.c | 8 ++++++++ arch/x86/mm/srat.c | 2 ++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index ef2844d69173..beaf69ad2f42 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -24,6 +24,7 @@ extern int numa_off; */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern nodemask_t numa_phys_nodes_parsed __initdata; extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); @@ -57,6 +58,7 @@ extern void __init init_cpu_to_node(void); extern void numa_add_cpu(int cpu); extern void numa_remove_cpu(int cpu); extern void init_gi_nodes(void); +extern int num_phys_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } @@ -64,6 +66,10 @@ static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } static inline void init_gi_nodes(void) { } +static inline int num_phys_nodes(void) +{ + return 1; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index df959deef5b2..0958df9bccd9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -55,6 +55,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +nodemask_t numa_phys_nodes_parsed __initdata; + int numa_cpu_node(int cpu) { u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); @@ -64,6 +66,11 @@ int numa_cpu_node(int cpu) return NUMA_NO_NODE; } +int __init num_phys_nodes(void) +{ + return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES); +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); @@ -674,6 +681,7 @@ static int __init dummy_numa_init(void) 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); + node_set(0, numa_phys_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 6f8e0f21c710..44ca66651756 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); + node_set(node, numa_phys_nodes_parsed); pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } @@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); + node_set(node, numa_phys_nodes_parsed); pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } -- Gitee From 3ede0602cb821a03bc73699600b4688f261bf381 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2026 08:58:49 +0800 Subject: [PATCH 14/17] x86/topo: Add topology_num_nodes_per_package() ANBZ: #33236 commit ae6730ff42b3a13d94b405edeb5e40108b6d21b6 upstream. Use the MADT and SRAT table data to compute __num_nodes_per_package. Specifically, SRAT has already been parsed in x86_numa_init(), which is called before acpi_boot_init() which parses MADT. So both are available in topology_init_possible_cpus(). 
This number is useful to divinate the various Intel CoD/SNC and AMD NPS modes, since the platforms are failing to provide this otherwise. Doing it this way is independent of the number of online CPUs and other such shenanigans. Intel-SIG: commit ae6730ff42b3 x86/topo: Add topology_num_nodes_per_package(). Backport SNC divination. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Tony Luck Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.004091624@infradead.org [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/topology.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/topology.c | 13 +++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 01ae10c049ca..c8601213df22 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -148,6 +148,7 @@ extern unsigned int __max_logical_packages; extern unsigned int __max_threads_per_core; extern unsigned int __num_threads_per_package; extern unsigned int __num_cores_per_package; +extern unsigned int __num_nodes_per_package; static inline unsigned int topology_max_packages(void) { @@ -172,6 +173,11 @@ static inline unsigned int topology_num_threads_per_package(void) return __num_threads_per_package; } +static inline unsigned int topology_num_nodes_per_package(void) +{ + return __num_nodes_per_package; +} + #ifdef CONFIG_X86_LOCAL_APIC int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); #else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b27077af3bac..20df8203ce2c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -84,6 +84,9 @@ EXPORT_SYMBOL(__max_dies_per_package); unsigned int __max_logical_packages __ro_after_init = 1; EXPORT_SYMBOL(__max_logical_packages); 
+unsigned int __num_nodes_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_nodes_per_package); + unsigned int __num_cores_per_package __ro_after_init = 1; EXPORT_SYMBOL(__num_cores_per_package); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 0f612a31181c..4c9c7dca0ee0 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "cpu.h" @@ -509,11 +510,19 @@ void __init topology_init_possible_cpus(void) set_nr_cpu_ids(allowed); cnta = domain_weight(TOPO_PKG_DOMAIN); - cntb = domain_weight(TOPO_DIE_DOMAIN); __max_logical_packages = cnta; + + pr_info("Max. logical packages: %3u\n", __max_logical_packages); + + cntb = num_phys_nodes(); + __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta); + + pr_info("Max. logical nodes: %3u\n", cntb); + pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package); + + cntb = domain_weight(TOPO_DIE_DOMAIN); __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); - pr_info("Max. logical packages: %3u\n", cnta); pr_info("Max. logical dies: %3u\n", cntb); pr_info("Max. dies per package: %3u\n", __max_dies_per_package); -- Gitee From 3febe3e0df5d2853b81af075e149d0623abdccbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2026 09:21:30 +0800 Subject: [PATCH 15/17] x86/topo: Replace x86_has_numa_in_package ANBZ: #33236 commit 717b64d58cff6fb97f97be07e382ed7641167a56 upstream. .. with the brand spanking new topology_num_nodes_per_package(). Having the topology setup determine this value during MADT/SRAT parsing before SMP bringup avoids having to detect this situation when building the SMP topology masks. Intel-SIG: commit 717b64d58cff x86/topo: Replace x86_has_numa_in_package. Backport SNC divination. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Tony Luck Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.123701837@infradead.org [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 861849fd9ee1..8aa9dc59c474 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -498,13 +498,6 @@ static int x86_cluster_flags(void) } #endif -/* - * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours, Intel Cluster-on-Die, and Intel - * Sub-NUMA Clustering have this. - */ -static bool x86_has_numa_in_package; - static struct sched_domain_topology_level x86_topology[] = { SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER @@ -526,7 +519,7 @@ static void __init build_sched_topology(void) * PKG domain since the NUMA domains will auto-magically create the * right spanning domains based on the SLIT. 
*/ - if (x86_has_numa_in_package) { + if (topology_num_nodes_per_package() > 1) { unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); @@ -580,7 +573,7 @@ int arch_sched_node_distance(int from, int to) case INTEL_GRANITERAPIDS_X: case INTEL_ATOM_DARKMONT_X: - if (!x86_has_numa_in_package || topology_max_packages() == 1 || + if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 || d < REMOTE_DISTANCE) return d; @@ -636,7 +629,7 @@ void set_cpu_sibling_map(int cpu) o = &cpu_data(i); if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; + WARN_ON_ONCE(topology_num_nodes_per_package() == 1); if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); -- Gitee From f8a1124b05fbf0bd22bdf7a494e8310631cda765 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Mar 2026 11:55:43 +0100 Subject: [PATCH 16/17] x86/topo: Fix SNC topology mess ANBZ: #33236 commit 528d89a4707e5bfd86e30823c45dbb66877df900 upstream. Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode"), the original crazy SNC-3 SLIT table was: node distances: node 0 1 2 3 4 5 0: 10 15 17 21 28 26 1: 15 10 15 23 26 23 2: 17 15 10 26 23 21 3: 21 28 26 10 15 17 4: 23 26 23 15 10 15 5: 26 23 21 17 15 10 And per: https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/ The suggestion was to average the off-trace clusters to restore sanity. 
However, 4d6dd05d07d0 implements this under various assumptions: - anything GNR/CWF with numa_in_package; - there will never be more than 2 packages; - the off-trace cluster will have distance >20 And then HPE shows up with a machine that matches the Vendor-Family-Model checks but looks like this: Here's an 8 socket (2 chassis) HPE system with SNC enabled: node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0: 10 12 16 16 16 16 18 18 40 40 40 40 40 40 40 40 1: 12 10 16 16 16 16 18 18 40 40 40 40 40 40 40 40 2: 16 16 10 12 18 18 16 16 40 40 40 40 40 40 40 40 3: 16 16 12 10 18 18 16 16 40 40 40 40 40 40 40 40 4: 16 16 18 18 10 12 16 16 40 40 40 40 40 40 40 40 5: 16 16 18 18 12 10 16 16 40 40 40 40 40 40 40 40 6: 18 18 16 16 16 16 10 12 40 40 40 40 40 40 40 40 7: 18 18 16 16 16 16 12 10 40 40 40 40 40 40 40 40 8: 40 40 40 40 40 40 40 40 10 12 16 16 16 16 18 18 9: 40 40 40 40 40 40 40 40 12 10 16 16 16 16 18 18 10: 40 40 40 40 40 40 40 40 16 16 10 12 18 18 16 16 11: 40 40 40 40 40 40 40 40 16 16 12 10 18 18 16 16 12: 40 40 40 40 40 40 40 40 16 16 18 18 10 12 16 16 13: 40 40 40 40 40 40 40 40 16 16 18 18 12 10 16 16 14: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 10 12 15: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 12 10 10 = Same chassis and socket 12 = Same chassis and socket (SNC) 16 = Same chassis and adjacent socket 18 = Same chassis and non-adjacent socket 40 = Different chassis Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the smaller parts do 8 sockets (like usual). The above SLIT table is sane, but violates the previous assumptions and trips a WARN. Now that the topology code has a sensible measure of nodes-per-package, we can use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies. There is a 'healthy' amount of paranoia code validating the assumptions on the SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using the regular table. 
Let's see how long this lasts :-) Intel-SIG: commit 528d89a4707e x86/topo: Fix SNC topology mess. Backport SNC divination. Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode") Reported-by: Kyle Meyer Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.238361290@infradead.org [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/smpboot.c | 190 ++++++++++++++++++++++++++++---------- 1 file changed, 143 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8aa9dc59c474..00576d95b465 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -536,33 +536,149 @@ static void __init build_sched_topology(void) } #ifdef CONFIG_NUMA -static int sched_avg_remote_distance; -static int avg_remote_numa_distance(void) +/* + * Test if the on-trace cluster at (N,N) is symmetric. + * Uses upper triangle iteration to avoid obvious duplicates. + */ +static bool slit_cluster_symmetric(int N) { - int i, j; - int distance, nr_remote, total_distance; - - if (sched_avg_remote_distance > 0) - return sched_avg_remote_distance; - - nr_remote = 0; - total_distance = 0; - for_each_node_state(i, N_CPU) { - for_each_node_state(j, N_CPU) { - distance = node_distance(i, j); - - if (distance >= REMOTE_DISTANCE) { - nr_remote++; - total_distance += distance; - } + int u = topology_num_nodes_per_package(); + + for (int k = 0; k < u; k++) { + for (int l = k; l < u; l++) { + if (node_distance(N + k, N + l) != + node_distance(N + l, N + k)) + return false; } } - if (nr_remote) - sched_avg_remote_distance = total_distance / nr_remote; - else - sched_avg_remote_distance = REMOTE_DISTANCE; - return sched_avg_remote_distance; + return true; +} + +/* + * Return the package-id of the cluster, or ~0 if indeterminate. 
+ * Each node in the on-trace cluster should have the same package-id. + */ +static u32 slit_cluster_package(int N) +{ + int u = topology_num_nodes_per_package(); + u32 pkg_id = ~0; + + for (int n = 0; n < u; n++) { + const struct cpumask *cpus = cpumask_of_node(N + n); + int cpu; + + for_each_cpu(cpu, cpus) { + u32 id = topology_logical_package_id(cpu); + + if (pkg_id == ~0) + pkg_id = id; + if (pkg_id != id) + return ~0; + } + } + + return pkg_id; +} + +/* + * Validate the SLIT table is of the form expected for SNC, specifically: + * + * - each on-trace cluster should be symmetric, + * - each on-trace cluster should have a unique package-id. + * + * If you NUMA_EMU on top of SNC, you get to keep the pieces. + */ +static bool slit_validate(void) +{ + int u = topology_num_nodes_per_package(); + u32 pkg_id, prev_pkg_id = ~0; + + for (int pkg = 0; pkg < topology_max_packages(); pkg++) { + int n = pkg * u; + + /* + * Ensure the on-trace cluster is symmetric and each cluster + * has a different package id. + */ + if (!slit_cluster_symmetric(n)) + return false; + pkg_id = slit_cluster_package(n); + if (pkg_id == ~0) + return false; + if (pkg && pkg_id == prev_pkg_id) + return false; + + prev_pkg_id = pkg_id; + } + + return true; +} + +/* + * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with + * asymmetric off-trace clusters, reflecting physical assymmetries. However + * this leads to 'unfortunate' sched_domain configurations. 
+ * + * For example dual socket GNR with SNC-3: + * + * node distances: + * node 0 1 2 3 4 5 + * 0: 10 15 17 21 28 26 + * 1: 15 10 15 23 26 23 + * 2: 17 15 10 26 23 21 + * 3: 21 28 26 10 15 17 + * 4: 23 26 23 15 10 15 + * 5: 26 23 21 17 15 10 + * + * Fix things up by averaging out the off-trace clusters; resulting in: + * + * node 0 1 2 3 4 5 + * 0: 10 15 17 24 24 24 + * 1: 15 10 15 24 24 24 + * 2: 17 15 10 24 24 24 + * 3: 24 24 24 10 15 17 + * 4: 24 24 24 15 10 15 + * 5: 24 24 24 17 15 10 + */ +static int slit_cluster_distance(int i, int j) +{ + static int slit_valid = -1; + int u = topology_num_nodes_per_package(); + long d = 0; + int x, y; + + if (slit_valid < 0) { + slit_valid = slit_validate(); + if (!slit_valid) + pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n"); + else + pr_info("Fixing up SNC SLIT table.\n"); + } + + /* + * Is this a unit cluster on the trace? + */ + if ((i / u) == (j / u) || !slit_valid) + return node_distance(i, j); + + /* + * Off-trace cluster. + * + * Notably average out the symmetric pair of off-trace clusters to + * ensure the resulting SLIT table is symmetric. + */ + x = i - (i % u); + y = j - (j % u); + + for (i = x; i < x + u; i++) { + for (j = y; j < y + u; j++) { + d += node_distance(i, j); + d += node_distance(j, i); + } + } + + return d / (2*u*u); } int arch_sched_node_distance(int from, int to) @@ -572,34 +688,14 @@ int arch_sched_node_distance(int from, int to) switch (boot_cpu_data.x86_vfm) { case INTEL_GRANITERAPIDS_X: case INTEL_ATOM_DARKMONT_X: - - if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 || - d < REMOTE_DISTANCE) + if (topology_max_packages() == 1 || + topology_num_nodes_per_package() < 3) return d; /* - * With SNC enabled, there could be too many levels of remote - * NUMA node distances, creating NUMA domain levels - * including local nodes and partial remote nodes. 
- * - * Trim finer distance tuning for NUMA nodes in remote package - * for the purpose of building sched domains. Group NUMA nodes - * in the remote package in the same sched group. - * Simplify NUMA domains and avoid extra NUMA levels including - * different remote NUMA nodes and local nodes. - * - * GNR and CWF don't expect systems with more than 2 packages - * and more than 2 hops between packages. Single average remote - * distance won't be appropriate if there are more than 2 - * packages as average distance to different remote packages - * could be different. + * Handle SNC-3 asymmetries. */ - WARN_ONCE(topology_max_packages() > 2, - "sched: Expect only up to 2 packages for GNR or CWF, " - "but saw %d packages when building sched domains.", - topology_max_packages()); - - d = avg_remote_numa_distance(); + return slit_cluster_distance(from, to); } return d; } -- Gitee From f3a481182c883fd516021f51dd7b38e2f97da220 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Mar 2026 11:55:44 +0100 Subject: [PATCH 17/17] x86/resctrl: Fix SNC detection ANBZ: #33236 commit 59674fc9d0bfd96ce8a776680ee1cf22c28c9ac7 upstream. Now that the x86 topology code has a sensible nodes-per-package measure, that does not depend on the online status of CPUs, use this to divinate the SNC mode. Note that when Cluster on Die (CoD) is configured on older systems this will also show multiple NUMA nodes per package. Intel Resource Director Technology is incompatible with CoD. Print a warning and do not use the fixup MSR_RMID_SNC_CONFIG. Intel-SIG: commit 59674fc9d0bf x86/resctrl: Fix SNC detection. 
Backport SNC divination. Signed-off-by: Tony Luck Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Zhang Rui Tested-by: Chen Yu Link: https://patch.msgid.link/aaCxbbgjL6OZ6VMd@agluck-desk3 Link: https://patch.msgid.link/20260303110100.367976706@infradead.org [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++----------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 52c14a7d36da..3ce7df1d52a5 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -353,7 +353,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); } -/* CPU models that support MSR_RMID_SNC_CONFIG */ +/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_ICELAKE_X, 0), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), @@ -364,40 +364,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { {} }; -/* - * There isn't a simple hardware bit that indicates whether a CPU is running - * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the - * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in - * the same NUMA node as CPU0. - * It is not possible to accurately determine SNC state if the system is - * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes - * to L3 caches. It will be OK if system is booted with hyperthreading - * disabled (since this doesn't affect the ratio). 
- */ static __init int snc_get_config(void) { - struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); - const cpumask_t *node0_cpumask; - int cpus_per_node, cpus_per_l3; - int ret; - - if (!x86_match_cpu(snc_cpu_ids) || !ci) - return 1; + int ret = topology_num_nodes_per_package(); - cpus_read_lock(); - if (num_online_cpus() != num_present_cpus()) - pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); - cpus_read_unlock(); - - node0_cpumask = cpumask_of_node(cpu_to_node(0)); - - cpus_per_node = cpumask_weight(node0_cpumask); - cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); - - if (!cpus_per_node || !cpus_per_l3) + if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) { + pr_warn("CoD enabled system? Resctrl not supported\n"); return 1; - - ret = cpus_per_l3 / cpus_per_node; + } /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ switch (ret) { -- Gitee