From 809a9ee44dad88067625d9236a8cd59f744517a1 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2023 23:37:11 +0530
Subject: [PATCH 01/17] powerpc/smp: Enable Asym packing for cores on shared
 processor

ANBZ: #33236

commit aa80c6343fcf53cbc29f84ba9f89ca87d4e41350 upstream.

If there are shared processor LPARs, underlying Hypervisor can have more
virtual cores to handle than actual physical cores.

Starting with Power 9, a big core (aka SMT8 core) has 2 nearly
independent thread groups. On shared processor LPARs, it helps to
pack threads onto a smaller number of cores so that the overall system
performance and utilization improves. PowerVM schedules at a big core
level. Hence packing to fewer cores helps.

Since each thread-group is independent, running threads on both the
thread-groups of a SMT8 core, should have a minimal adverse impact in
non over provisioned scenarios. The changes in this patchset will not
have any effect in the over provisioned scenario. If there are more
threads than SMT domains, then asym_packing will not kick in.

For example: Let's say there are two 8-core shared LPARs that are
actually sharing an 8-core shared physical pool, each running 8
threads. Then consolidating 8 threads to 4 cores on each LPAR would help
them to perform better. This is because each LPAR will get
100% time to run applications and there will be no switching required by
the Hypervisor.

To achieve this, enable SD_ASYM_PACKING flag at CACHE, MC and DIE level
when the system is running in shared processor mode and has big cores.

Intel-SIG: commit aa80c6343fcf powerpc/smp: Enable Asym packing for cores on shared processor.
Backport SNC devination dependency.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231214180720.310852-2-srikar@linux.vnet.ibm.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 4e4870031265..f39d3bce0fc4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1003,6 +1003,13 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
+/*
+ * On shared processor LPARs scheduled on a big core (which has two or more
+ * independent thread groups per core), prefer lower numbered CPUs, so
+ * that workload consolidates to lesser number of cores.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
+
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1011,9 +1018,20 @@ static int powerpc_smt_flags(void)
  */
 static int powerpc_shared_cache_flags(void)
 {
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
+
 	return SD_SHARE_PKG_RESOURCES;
 }
 
+static int powerpc_shared_proc_flags(void)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_ASYM_PACKING;
+
+	return 0;
+}
+
 /*
  * We can't just pass cpu_l2_cache_mask() directly because
  * returns a non-const pointer and the compiler barfs on that.
@@ -1050,8 +1068,8 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	{ cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) },
+	{ cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
@@ -1686,6 +1704,9 @@ static void __init fixup_topology(void)
 {
 	int i;
 
+	if (is_shared_processor() && has_big_cores)
+		static_branch_enable(&splpar_asym_pack);
+
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-- 
Gitee


From 1c2df8a338c7594030fa0cf8bdc0272e547e9cfd Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2023 23:37:12 +0530
Subject: [PATCH 02/17] powerpc/smp: Disable MC domain for shared processor

ANBZ: #33236

commit 0e1c1986e0e65746daa05405d7747ce882f83cf1 upstream.

Like L2-cache info, coregroup information which is used to determine MC
sched domains is only present on dedicated LPARs, i.e. PowerVM doesn't
export coregroup information for shared processor LPARs. Hence disable
creating MC domains on shared LPAR systems.

Intel-SIG: commit 0e1c1986e0e6 powerpc/smp: Disable MC domain for shared processor.
Backport SNC devination dependency.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231214180720.310852-3-srikar@linux.vnet.ibm.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f39d3bce0fc4..41344a4e0bf1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1055,6 +1055,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
 
 static bool has_coregroup_support(void)
 {
+	/* Coregroup identification not available on shared systems */
+	if (is_shared_processor())
+		return 0;
+
 	return coregroup_enabled;
 }
 
-- 
Gitee


From 2279a71b553143cdaeb0a114d249f220b2f91bed Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2023 23:37:13 +0530
Subject: [PATCH 03/17] powerpc/smp: Add __ro_after_init attribute

ANBZ: #33236

commit fd535a858ebeb1f478b1d065b6c057f52aad483a upstream.

There are some variables that are only updated at boot time.
So add the __ro_after_init attribute to such variables.

Intel-SIG: commit fd535a858ebe powerpc/smp: Add __ro_after_init attribute.
Backport SNC devination dependency.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231214180720.310852-4-srikar@linux.vnet.ibm.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 41344a4e0bf1..c609059478bb 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -77,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
 struct task_struct *secondary_current;
-bool has_big_cores;
-bool coregroup_enabled;
-bool thread_group_shares_l2;
-bool thread_group_shares_l3;
+bool has_big_cores __ro_after_init;
+bool coregroup_enabled __ro_after_init;
+bool thread_group_shares_l2 __ro_after_init;
+bool thread_group_shares_l3 __ro_after_init;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -987,7 +987,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
 	return 0;
 }
 
-static bool shared_caches;
+static bool shared_caches __ro_after_init;
 
 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymmetric SMT dependency */
-- 
Gitee


From 5b2b851e572ce811bab1007f578e0c6fbf865eea Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2023 23:37:14 +0530
Subject: [PATCH 04/17] powerpc/smp: Avoid asym packing within thread_group of
 a core

ANBZ: #33236

commit 0e93f1c780e8fd315f1262467b7d35eb6f766d2f upstream.

PowerVM Hypervisor will schedule at a core granularity. However each
core can have more than one thread_group. For better utilization in
case of a shared processor, it's preferable for the scheduler to pack to
the lowest core. However there is no benefit in moving a thread between
two thread groups of the same core.

Intel-SIG: commit 0e93f1c780e8 powerpc/smp: Avoid asym packing within thread_group of a core.
Backport SNC devination dependency.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231214180720.310852-5-srikar@linux.vnet.ibm.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c609059478bb..181bf8755279 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1763,6 +1763,19 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	set_sched_topology(powerpc_topology);
 }
 
+/*
+ * For asym packing, by default lower numbered CPU has higher priority.
+ * On shared processors, pack to lower numbered core. However avoid moving
+ * between thread_groups within the same core.
+ */
+int arch_asym_cpu_priority(int cpu)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return -cpu / threads_per_core;
+
+	return -cpu;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 int __cpu_disable(void)
 {
-- 
Gitee


From 4c218ee9b4259d4df2cfea99c79dbc9c9c3834b3 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 14 Dec 2023 23:37:15 +0530
Subject: [PATCH 05/17] powerpc/smp: Dynamically build Powerpc topology

ANBZ: #33236

commit c46975715f5a7b941aa09bc0539a8dbe297f308f upstream.

Currently there are four Powerpc specific sched topologies.  These are
all statically defined.  However not all these topologies are used by
all Powerpc systems.

To avoid unnecessary degenerations by the scheduler, masks and flags
are compared. However if the sched topologies are built dynamically then
the code is simpler and there are greater chances of avoiding
degenerations.

Note:
Even X86 builds its sched topologies dynamically and proposed changes
are very similar to the way X86 is building its topologies.

Intel-SIG: commit c46975715f5a powerpc/smp: Dynamically build Powerpc topology.
Backport SNC devination dependency.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231214180720.310852-6-srikar@linux.vnet.ibm.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c | 78 ++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 181bf8755279..39679a5c14b5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -93,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
-enum {
-#ifdef CONFIG_SCHED_SMT
-	smt_idx,
-#endif
-	cache_idx,
-	mc_idx,
-	die_idx,
-};
-
 #define MAX_THREAD_LIST_SIZE	8
 #define THREAD_GROUP_SHARE_L1   1
 #define THREAD_GROUP_SHARE_L2_L3 2
@@ -1067,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu)
 	return cpu_coregroup_mask(cpu);
 }
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) },
-	{ NULL, },
-};
-
 static int __init init_big_cores(void)
 {
 	int cpu;
@@ -1704,9 +1685,11 @@ void start_secondary(void *unused)
 	BUG();
 }
 
-static void __init fixup_topology(void)
+static struct sched_domain_topology_level powerpc_topology[6];
+
+static void __init build_sched_topology(void)
 {
-	int i;
+	int i = 0;
 
 	if (is_shared_processor() && has_big_cores)
 		static_branch_enable(&splpar_asym_pack);
@@ -1714,36 +1697,33 @@ static void __init fixup_topology(void)
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[smt_idx].mask = smallcore_smt_mask;
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
+		};
+	} else {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
+		};
 	}
 #endif
+	if (shared_caches) {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
+		};
+	}
+	if (has_coregroup_support()) {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
+		};
+	}
+	powerpc_topology[i++] = (struct sched_domain_topology_level){
+		cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
+	};
 
-	if (!has_coregroup_support())
-		powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask;
-
-	/*
-	 * Try to consolidate topology levels here instead of
-	 * allowing scheduler to degenerate.
-	 * - Dont consolidate if masks are different.
-	 * - Dont consolidate if sd_flags exists and are different.
-	 */
-	for (i = 1; i <= die_idx; i++) {
-		if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
-			continue;
-
-		if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags &&
-				powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags)
-			continue;
-
-		if (!powerpc_topology[i - 1].sd_flags)
-			powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags;
+	/* There must be one trailing NULL entry left.  */
+	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
 
-		powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
-		powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
-#ifdef CONFIG_SCHED_DEBUG
-		powerpc_topology[i].name = powerpc_topology[i + 1].name;
-#endif
-	}
+	set_sched_topology(powerpc_topology);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
@@ -1758,9 +1738,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();
 
 	dump_numa_cpu_topology();
-
-	fixup_topology();
-	set_sched_topology(powerpc_topology);
+	build_sched_topology();
 }
 
 /*
-- 
Gitee


From df17baab2b053f617259ba9d537af97b3ee10609 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Tue, 6 Aug 2024 20:08:23 +0800
Subject: [PATCH 06/17] x86/mm: Don't print out SRAT table information

ANBZ: #33236

commit 830a0d12943f53077b235f2a3caa8ab2b36475a3 upstream.

This per CPU log is becoming longer with more and more CPUs in system,
which slows down the boot process due to the serializing nature of
printk().

The value of this information is dubious and it can be retrieved by lscpu
from user space if required.

Downgrade the printk() to pr_debug() so it is still accessible for debug
purposes.

[ tglx: Massaged changelog ]

Intel-SIG: commit 830a0d12943f Don't print out SRAT table information.
Backport SNC devination dependency.

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240806120823.17111-1-lirongqing@baidu.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/mm/srat.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 9c52a95937ad..6f8e0f21c710 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
-	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
-	       pxm, apic_id, node);
+	pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
 }
 
 /* Callback for Proximity Domain -> LAPIC mapping */
@@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
-	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
-	       pxm, apic_id, node);
+	pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
 }
 
 int __init x86_acpi_numa_init(void)
-- 
Gitee


From 8787d5ffddcfd4590db30cd462377c7aef1dd436 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Mon, 23 Dec 2024 04:34:03 +0000
Subject: [PATCH 07/17] x86/topology: Remove x86_smt_flags and use
 cpu_smt_flags directly

ANBZ: #33236

commit 537e247879589f6bace747e3479e4abf42dbbbdc upstream.

x86_*_flags() wrappers were introduced with commit d3d37d850d1d
("x86/sched: Add SD_ASYM_PACKING flags to x86 ITMT CPU") to add
x86_sched_itmt_flags() in addition to the default domain flags for SMT
and MC domain.

commit 995998ebdebd ("x86/sched: Remove SD_ASYM_PACKING from the
SMT domain flags") removed the ITMT flags for SMT domain but not the
x86_smt_flags() wrappers which directly returns cpu_smt_flags().

Remove x86_smt_flags() and directly use cpu_smt_flags() to derive the
flags for SMT domain. No functional changes intended.

Intel-SIG: commit 537e24787958 Remove x86_smt_flags and use cpu_smt_flags directly.
Backport SNC devination dependency.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Link: https://lore.kernel.org/r/20241223043407.1611-5-kprateek.nayak@amd.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3426d6aea42b..bf4a831a03de 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -491,12 +491,6 @@ static int x86_core_flags(void)
 	return cpu_core_flags() | x86_sched_itmt_flags();
 }
 #endif
-#ifdef CONFIG_SCHED_SMT
-static int x86_smt_flags(void)
-{
-	return cpu_smt_flags();
-}
-#endif
 #ifdef CONFIG_SCHED_CLUSTER
 static int x86_cluster_flags(void)
 {
@@ -519,7 +513,7 @@ static void __init build_sched_topology(void)
 
 #ifdef CONFIG_SCHED_SMT
 	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
 	};
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-- 
Gitee


From bb094066c39b0ad5f27a6c9c89b303921b87450f Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Wed, 15 Apr 2026 09:09:51 +0800
Subject: [PATCH 08/17] sched: Move sched domain name out of CONFIG_SCHED_DEBUG

ANBZ: #33236

commit 1c055a0f5d3bafaca5d218bbb3e4e63d6307be45 upstream.

/proc/schedstat file shows cpu and sched domain level scheduler
statistics. It does not show domain name instead shows domain level.
It will be very useful for tools like `perf sched stats`[1] to
aggregate domain level stats if domain names are shown in /proc/schedstat.
But sched domain name is guarded by CONFIG_SCHED_DEBUG. As per the
discussion[2], move sched domain name out of CONFIG_SCHED_DEBUG.

[1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/
[2] https://lore.kernel.org/lkml/fcefeb4d-3acb-462d-9c9b-3df8d927e522@amd.com/

Intel-SIG: commit 1c055a0f5d3b Move sched domain name out of CONFIG_SCHED_DEBUG.
Backport SNC devination dependency.

Suggested-by: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20241220063224.17767-5-swapnil.sapkal@amd.com
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 include/linux/sched/topology.h | 9 ---------
 kernel/sched/topology.c        | 4 ----
 2 files changed, 13 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7ce562c7fd64..2a608b9f2ab2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -147,9 +147,7 @@ struct sched_domain {
 	unsigned int ttwu_move_affine;
 	unsigned int ttwu_move_balance;
 #endif
-#ifdef CONFIG_SCHED_DEBUG
 	char *name;
-#endif
 	union {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
@@ -210,20 +208,13 @@ struct sched_domain_topology_level {
 	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
-#ifdef CONFIG_SCHED_DEBUG
 	char                *name;
-#endif
 };
 
 extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
 
-
-#ifdef CONFIG_SCHED_DEBUG
 # define SD_INIT_NAME(type)		.name = #type
-#else
-# define SD_INIT_NAME(type)
-#endif
 
 #else /* CONFIG_SMP */
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e8ae21c49a9d..4f360c971608 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1683,9 +1683,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
-#ifdef CONFIG_SCHED_DEBUG
 		.name			= tl->name,
-#endif
 	};
 
 	sd_span = sched_domain_span(sd);
@@ -2441,10 +2439,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
 		if (!cpumask_subset(sched_domain_span(child),
 				    sched_domain_span(sd))) {
 			pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
 			pr_err("     the %s domain not a subset of the %s domain\n",
 					child->name, sd->name);
-#endif
 			/* Fixup, ensure @sd has at least @child CPUs. */
 			cpumask_or(sched_domain_span(sd),
 				   sched_domain_span(sd),
-- 
Gitee


From cf6b41a6b3a41255647936d07f58bda676fd5708 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Wed, 15 Apr 2026 09:17:46 +0800
Subject: [PATCH 09/17] smpboot: introduce SDTL_INIT() helper to tidy sched
 topology setup

ANBZ: #33236

commit e075f4360931263f5ec006ea5dadc065e5e98eb8 upstream.

Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the
sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged.

Intel-SIG: commit e075f4360931 introduce SDTL_INIT() helper to tidy sched topology setup.
Backport SNC devination dependency.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/powerpc/kernel/smp.c      | 25 ++++++++++---------------
 arch/s390/kernel/topology.c    | 10 +++++-----
 arch/x86/kernel/smpboot.c      | 21 ++++++---------------
 include/linux/sched/topology.h |  3 ++-
 kernel/sched/topology.c        | 24 ++++++++----------------
 5 files changed, 31 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 39679a5c14b5..f354e8d1e487 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1697,28 +1697,23 @@ static void __init build_sched_topology(void)
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
 	} else {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
-		};
+		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
 	}
 #endif
 	if (shared_caches) {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
 	}
+
 	if (has_coregroup_support()) {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
 	}
-	powerpc_topology[i++] = (struct sched_domain_topology_level){
-		cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
-	};
+
+	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
 
 	/* There must be one trailing NULL entry left.  */
 	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 66bda6a8f918..4d94c45022eb 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -518,11 +518,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
 }
 
 static struct sched_domain_topology_level s390_topology[] = {
-	{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
-	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+	SDTL_INIT(cpu_book_mask, NULL, BOOK),
+	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bf4a831a03de..e3e508435982 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -512,35 +512,26 @@ static void __init build_sched_topology(void)
 	int i = 0;
 
 #ifdef CONFIG_SCHED_SMT
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
 #endif
 #ifdef CONFIG_SCHED_MC
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC);
 #endif
 	/*
 	 * When there is NUMA topology inside the package skip the PKG domain
 	 * since the NUMA domains will auto-magically create the right spanning
 	 * domains based on the SLIT.
 	 */
-	if (!x86_has_numa_in_package) {
-		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
-		};
-	}
+	if (!x86_has_numa_in_package)
+		x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG);
 
 	/*
 	 * There must be one trailing NULL entry left.
 	 */
-	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+	BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1);
 
 	set_sched_topology(x86_topology);
 }
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2a608b9f2ab2..f489cf2636f1 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -214,7 +214,8 @@ struct sched_domain_topology_level {
 extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
 
-# define SD_INIT_NAME(type)		.name = #type
+#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \
+	    { .mask = maskfn, .sd_flags = flagsfn, .name = #dname })
 
 #else /* CONFIG_SMP */
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4f360c971608..24978f702f2b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1747,17 +1747,17 @@ sd_init(struct sched_domain_topology_level *tl,
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #endif
 
 #ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+	SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
 #endif
 
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };
 
@@ -2070,23 +2070,15 @@ void sched_init_numa(int offline_node)
 	/*
 	 * Add the NUMA identity distance, aka single NODE.
 	 */
-	tl[i++] = (struct sched_domain_topology_level){
-		.mask = sd_numa_mask,
-		.numa_level = 0,
-		SD_INIT_NAME(NODE)
-	};
+	tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 1; j < nr_levels; i++, j++) {
-		tl[i] = (struct sched_domain_topology_level){
-			.mask = sd_numa_mask,
-			.sd_flags = cpu_numa_flags,
-			.flags = SDTL_OVERLAP,
-			.numa_level = j,
-			SD_INIT_NAME(NUMA)
-		};
+		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+		tl[i].numa_level = j;
+		tl[i].flags = SDTL_OVERLAP;
 	}
 
 	sched_domain_topology_saved = sched_domain_topology;
-- 
Gitee


From efae0af60467aef7407fc06df4fb1c13a968b5b4 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Thu, 10 Jul 2025 18:57:08 +0800
Subject: [PATCH 10/17] x86/smpboot: remove redundant CONFIG_SCHED_SMT

ANBZ: #33236

commit 992de2b02509bed68f693ea5a68b07cd586197b7 upstream.

On x86 CONFIG_SCHED_SMT is default y if SMP is enabled, so let's
simply drop CONFIG_SCHED_SMT.

Intel-SIG: commit 992de2b02509 remove redundant CONFIG_SCHED_SMT.
Backport SNC devination dependency.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-3-me@linux.beauty
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e3e508435982..4856ffffa0b5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -511,9 +511,7 @@ static void __init build_sched_topology(void)
 {
 	int i = 0;
 
-#ifdef CONFIG_SCHED_SMT
 	x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
-#endif
 #ifdef CONFIG_SCHED_CLUSTER
 	x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
 #endif
-- 
Gitee


From c5e85837b4e108c7a194e298ecad146275de2469 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Thu, 10 Jul 2025 18:57:09 +0800
Subject: [PATCH 11/17] x86/smpboot: moves x86_topology to static initialize
 and truncate

ANBZ: #33236

commit fbc2010d92e595dc13d8048db2419f963c8cb25e upstream.

The #ifdeffery and the initializers in build_sched_topology() are just
disgusting.

Statically initialize the domain levels in the topology array and let
build_sched_topology() invalidate the package domain level when NUMA in
package is available.

Intel-SIG: commit fbc2010d92e5 moves x86_topology to static initialize and truncate.
Backport SNC devination dependency.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-4-me@linux.beauty
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4856ffffa0b5..11358a85f306 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -505,32 +505,30 @@ static int x86_cluster_flags(void)
  */
 static bool x86_has_numa_in_package;
 
-static struct sched_domain_topology_level x86_topology[6];
-
-static void __init build_sched_topology(void)
-{
-	int i = 0;
-
-	x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
+static struct sched_domain_topology_level x86_topology[] = {
+	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #ifdef CONFIG_SCHED_CLUSTER
-	x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
+	SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
 #endif
 #ifdef CONFIG_SCHED_MC
-	x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC);
+	SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
 #endif
-	/*
-	 * When there is NUMA topology inside the package skip the PKG domain
-	 * since the NUMA domains will auto-magically create the right spanning
-	 * domains based on the SLIT.
-	 */
-	if (!x86_has_numa_in_package)
-		x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG);
+	SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+	{ NULL },
+};
 
+static void __init build_sched_topology(void)
+{
 	/*
-	 * There must be one trailing NULL entry left.
+	 * When there is NUMA topology inside the package invalidate the
+	 * PKG domain since the NUMA domains will auto-magically create the
+	 * right spanning domains based on the SLIT.
 	 */
-	BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1);
+	if (x86_has_numa_in_package) {
+		unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
 
+		memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
+	}
 	set_sched_topology(x86_topology);
 }
 
-- 
Gitee


From c2f05f466a12aa6a4d792dffcaa7f17a60562016 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Wed, 15 Apr 2026 09:19:16 +0800
Subject: [PATCH 12/17] x86/smpboot: avoid SMT domain attach/destroy if SMT is
 not enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ANBZ: #33236

commit f79c9aa446d638190578515afcd06d6c9d72da55 upstream.

Currently, the SMT domain is added into sched_domain_topology by default.

If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight
is just 1, it will destroy it.

On a large machine, such as one with 512 cores, this results in
512 redundant domain attach/destroy operations.

Avoid these unnecessary operations by simply checking
cpu_smt_num_threads and skipping the SMT domain if SMT is not
enabled.

Intel-SIG: commit f79c9aa446d6 avoid SMT domain attach/destroy if SMT is not enabled.
Backport SNC divination dependency.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-5-me@linux.beauty
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 11358a85f306..861849fd9ee1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -519,6 +519,8 @@ static struct sched_domain_topology_level x86_topology[] = {
 
 static void __init build_sched_topology(void)
 {
+	struct sched_domain_topology_level *topology = x86_topology;
+
 	/*
 	 * When there is NUMA topology inside the package invalidate the
 	 * PKG domain since the NUMA domains will auto-magically create the
@@ -529,7 +531,15 @@ static void __init build_sched_topology(void)
 
 		memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
 	}
-	set_sched_topology(x86_topology);
+
+	/*
+	 * Drop the SMT domains if there is only one thread per-core
+	 * since it'll get degenerated by the scheduler anyways.
+	 */
+	if (cpu_smt_num_threads <= 1)
+		++topology;
+
+	set_sched_topology(topology);
 }
 
 #ifdef CONFIG_NUMA
-- 
Gitee


From 86b01eefba20e096358cca4309531054bed4e1cc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 15 Apr 2026 08:57:34 +0800
Subject: [PATCH 13/17] x86/numa: Store extra copy of numa_nodes_parsed

ANBZ: #33236

commit 48084cc153a5b0fbf0aa98d47670d3be0b9f64d5 upstream.

The topology setup code needs to know the total number of physical
nodes enumerated in SRAT; however NUMA_EMU can cause the existing
numa_nodes_parsed bitmap to be fictitious. Therefore, keep a copy of
the bitmap specifically to retain the physical node count.

Intel-SIG: commit 48084cc153a5 x86/numa: Store extra copy of numa_nodes_parsed.
Backport SNC divination.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110059.889884023@infradead.org
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/include/asm/numa.h | 6 ++++++
 arch/x86/mm/numa.c          | 8 ++++++++
 arch/x86/mm/srat.c          | 2 ++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index ef2844d69173..beaf69ad2f42 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -24,6 +24,7 @@ extern int numa_off;
  */
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_phys_nodes_parsed __initdata;
 
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 extern void __init numa_set_distance(int from, int to, int distance);
@@ -57,6 +58,7 @@ extern void __init init_cpu_to_node(void);
 extern void numa_add_cpu(int cpu);
 extern void numa_remove_cpu(int cpu);
 extern void init_gi_nodes(void);
+extern int num_phys_nodes(void);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
@@ -64,6 +66,10 @@ static inline void init_cpu_to_node(void)		{ }
 static inline void numa_add_cpu(int cpu)		{ }
 static inline void numa_remove_cpu(int cpu)		{ }
 static inline void init_gi_nodes(void)			{ }
+static inline int num_phys_nodes(void)
+{
+	return 1;
+}
 #endif	/* CONFIG_NUMA */
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index df959deef5b2..0958df9bccd9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -55,6 +55,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
+nodemask_t numa_phys_nodes_parsed __initdata;
+
 int numa_cpu_node(int cpu)
 {
 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -64,6 +66,11 @@ int numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }
 
+int __init num_phys_nodes(void)
+{
+	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
+}
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
@@ -674,6 +681,7 @@ static int __init dummy_numa_init(void)
 	       0LLU, PFN_PHYS(max_pfn) - 1);
 
 	node_set(0, numa_nodes_parsed);
+	node_set(0, numa_phys_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
 
 	return 0;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 6f8e0f21c710..44ca66651756 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
+	node_set(node, numa_phys_nodes_parsed);
 	pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
 }
 
@@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
+	node_set(node, numa_phys_nodes_parsed);
 	pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
 }
 
-- 
Gitee


From 3ede0602cb821a03bc73699600b4688f261bf381 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 15 Apr 2026 08:58:49 +0800
Subject: [PATCH 14/17] x86/topo: Add topology_num_nodes_per_package()

ANBZ: #33236

commit ae6730ff42b3a13d94b405edeb5e40108b6d21b6 upstream.

Use the MADT and SRAT table data to compute __num_nodes_per_package.

Specifically, SRAT has already been parsed in x86_numa_init(), which is called
before acpi_boot_init() which parses MADT. So both are available in
topology_init_possible_cpus().

This number is useful to divinate the various Intel CoD/SNC and AMD NPS modes,
since the platforms are failing to provide this otherwise.

Doing it this way is independent of the number of online CPUs and
other such shenanigans.

Intel-SIG: commit ae6730ff42b3 x86/topo: Add topology_num_nodes_per_package().
Backport SNC divination.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.004091624@infradead.org
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/include/asm/topology.h |  6 ++++++
 arch/x86/kernel/cpu/common.c    |  3 +++
 arch/x86/kernel/cpu/topology.c  | 13 +++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 01ae10c049ca..c8601213df22 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -148,6 +148,7 @@ extern unsigned int __max_logical_packages;
 extern unsigned int __max_threads_per_core;
 extern unsigned int __num_threads_per_package;
 extern unsigned int __num_cores_per_package;
+extern unsigned int __num_nodes_per_package;
 
 static inline unsigned int topology_max_packages(void)
 {
@@ -172,6 +173,11 @@ static inline unsigned int topology_num_threads_per_package(void)
 	return __num_threads_per_package;
 }
 
+static inline unsigned int topology_num_nodes_per_package(void)
+{
+	return __num_nodes_per_package;
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level);
 #else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b27077af3bac..20df8203ce2c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -84,6 +84,9 @@ EXPORT_SYMBOL(__max_dies_per_package);
 unsigned int __max_logical_packages __ro_after_init = 1;
 EXPORT_SYMBOL(__max_logical_packages);
 
+unsigned int __num_nodes_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_nodes_per_package);
+
 unsigned int __num_cores_per_package __ro_after_init = 1;
 EXPORT_SYMBOL(__num_cores_per_package);
 
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 0f612a31181c..4c9c7dca0ee0 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
+#include <asm/numa.h>
 
 #include "cpu.h"
 
@@ -509,11 +510,19 @@ void __init topology_init_possible_cpus(void)
 	set_nr_cpu_ids(allowed);
 
 	cnta = domain_weight(TOPO_PKG_DOMAIN);
-	cntb = domain_weight(TOPO_DIE_DOMAIN);
 	__max_logical_packages = cnta;
+
+	pr_info("Max. logical packages: %3u\n", __max_logical_packages);
+
+	cntb = num_phys_nodes();
+	__num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
+
+	pr_info("Max. logical nodes:    %3u\n", cntb);
+	pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);
+
+	cntb = domain_weight(TOPO_DIE_DOMAIN);
 	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
 
-	pr_info("Max. logical packages: %3u\n", cnta);
 	pr_info("Max. logical dies:     %3u\n", cntb);
 	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
 
-- 
Gitee


From 3febe3e0df5d2853b81af075e149d0623abdccbd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 15 Apr 2026 09:21:30 +0800
Subject: [PATCH 15/17] x86/topo: Replace x86_has_numa_in_package

ANBZ: #33236

commit 717b64d58cff6fb97f97be07e382ed7641167a56 upstream.

.. with the brand spanking new topology_num_nodes_per_package().

Having the topology setup determine this value during MADT/SRAT parsing before
SMP bringup avoids having to detect this situation when building the SMP
topology masks.

Intel-SIG: commit 717b64d58cff x86/topo: Replace x86_has_numa_in_package.
Backport SNC divination.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.123701837@infradead.org
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 861849fd9ee1..8aa9dc59c474 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -498,13 +498,6 @@ static int x86_cluster_flags(void)
 }
 #endif
 
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
-
 static struct sched_domain_topology_level x86_topology[] = {
 	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #ifdef CONFIG_SCHED_CLUSTER
@@ -526,7 +519,7 @@ static void __init build_sched_topology(void)
 	 * PKG domain since the NUMA domains will auto-magically create the
 	 * right spanning domains based on the SLIT.
 	 */
-	if (x86_has_numa_in_package) {
+	if (topology_num_nodes_per_package() > 1) {
 		unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
 
 		memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
@@ -580,7 +573,7 @@ int arch_sched_node_distance(int from, int to)
 	case INTEL_GRANITERAPIDS_X:
 	case INTEL_ATOM_DARKMONT_X:
 
-		if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+		if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
 		    d < REMOTE_DISTANCE)
 			return d;
 
@@ -636,7 +629,7 @@ void set_cpu_sibling_map(int cpu)
 		o = &cpu_data(i);
 
 		if (match_pkg(c, o) && !topology_same_node(c, o))
-			x86_has_numa_in_package = true;
+			WARN_ON_ONCE(topology_num_nodes_per_package() == 1);
 
 		if ((i == cpu) || (has_smt && match_smt(c, o)))
 			link_mask(topology_sibling_cpumask, cpu, i);
-- 
Gitee


From f8a1124b05fbf0bd22bdf7a494e8310631cda765 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 3 Mar 2026 11:55:43 +0100
Subject: [PATCH 16/17] x86/topo: Fix SNC topology mess

ANBZ: #33236

commit 528d89a4707e5bfd86e30823c45dbb66877df900 upstream.

Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in
SNC-3 mode"), the original crazy SNC-3 SLIT table was:

node distances:
node     0    1    2    3    4    5
    0:   10   15   17   21   28   26
    1:   15   10   15   23   26   23
    2:   17   15   10   26   23   21
    3:   21   28   26   10   15   17
    4:   23   26   23   15   10   15
    5:   26   23   21   17   15   10

And per:

  https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/

The suggestion was to average the off-trace clusters to restore sanity.

However, 4d6dd05d07d0 implements this under various assumptions:

 - anything GNR/CWF with numa_in_package;
 - there will never be more than 2 packages;
 - the off-trace cluster will have distance >20

And then HPE shows up with a machine that matches the
Vendor-Family-Model checks but looks like this:

Here's an 8 socket (2 chassis) HPE system with SNC enabled:

node   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
  0:  10  12  16  16  16  16  18  18  40  40  40  40  40  40  40  40
  1:  12  10  16  16  16  16  18  18  40  40  40  40  40  40  40  40
  2:  16  16  10  12  18  18  16  16  40  40  40  40  40  40  40  40
  3:  16  16  12  10  18  18  16  16  40  40  40  40  40  40  40  40
  4:  16  16  18  18  10  12  16  16  40  40  40  40  40  40  40  40
  5:  16  16  18  18  12  10  16  16  40  40  40  40  40  40  40  40
  6:  18  18  16  16  16  16  10  12  40  40  40  40  40  40  40  40
  7:  18  18  16  16  16  16  12  10  40  40  40  40  40  40  40  40
  8:  40  40  40  40  40  40  40  40  10  12  16  16  16  16  18  18
  9:  40  40  40  40  40  40  40  40  12  10  16  16  16  16  18  18
 10:  40  40  40  40  40  40  40  40  16  16  10  12  18  18  16  16
 11:  40  40  40  40  40  40  40  40  16  16  12  10  18  18  16  16
 12:  40  40  40  40  40  40  40  40  16  16  18  18  10  12  16  16
 13:  40  40  40  40  40  40  40  40  16  16  18  18  12  10  16  16
 14:  40  40  40  40  40  40  40  40  18  18  16  16  16  16  10  12
 15:  40  40  40  40  40  40  40  40  18  18  16  16  16  16  12  10

 10 = Same chassis and socket
 12 = Same chassis and socket (SNC)
 16 = Same chassis and adjacent socket
 18 = Same chassis and non-adjacent socket
 40 = Different chassis

Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the
smaller parts do 8 sockets (like usual). The above SLIT table is sane, but
violates the previous assumptions and trips a WARN.

Now that the topology code has a sensible measure of nodes-per-package, we can
use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies.

There is a 'healthy' amount of paranoia code validating the assumptions on the
SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using
the regular table. Let's see how long this lasts :-)

Intel-SIG: commit 528d89a4707e x86/topo: Fix SNC topology mess.
Backport SNC divination.

Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode")
Reported-by: Kyle Meyer <kyle.meyer@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.238361290@infradead.org
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 190 ++++++++++++++++++++++++++++----------
 1 file changed, 143 insertions(+), 47 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8aa9dc59c474..00576d95b465 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -536,33 +536,149 @@ static void __init build_sched_topology(void)
 }
 
 #ifdef CONFIG_NUMA
-static int sched_avg_remote_distance;
-static int avg_remote_numa_distance(void)
+/*
+ * Test if the on-trace cluster at (N,N) is symmetric.
+ * Uses upper triangle iteration to avoid obvious duplicates.
+ */
+static bool slit_cluster_symmetric(int N)
 {
-	int i, j;
-	int distance, nr_remote, total_distance;
-
-	if (sched_avg_remote_distance > 0)
-		return sched_avg_remote_distance;
-
-	nr_remote = 0;
-	total_distance = 0;
-	for_each_node_state(i, N_CPU) {
-		for_each_node_state(j, N_CPU) {
-			distance = node_distance(i, j);
-
-			if (distance >= REMOTE_DISTANCE) {
-				nr_remote++;
-				total_distance += distance;
-			}
+	int u = topology_num_nodes_per_package();
+
+	for (int k = 0; k < u; k++) {
+		for (int l = k; l < u; l++) {
+			if (node_distance(N + k, N + l) !=
+			    node_distance(N + l, N + k))
+				return false;
 		}
 	}
-	if (nr_remote)
-		sched_avg_remote_distance = total_distance / nr_remote;
-	else
-		sched_avg_remote_distance = REMOTE_DISTANCE;
 
-	return sched_avg_remote_distance;
+	return true;
+}
+
+/*
+ * Return the package-id of the cluster, or ~0 if indeterminate.
+ * Each node in the on-trace cluster should have the same package-id.
+ */
+static u32 slit_cluster_package(int N)
+{
+	int u = topology_num_nodes_per_package();
+	u32 pkg_id = ~0;
+
+	for (int n = 0; n < u; n++) {
+		const struct cpumask *cpus = cpumask_of_node(N + n);
+		int cpu;
+
+		for_each_cpu(cpu, cpus) {
+			u32 id = topology_logical_package_id(cpu);
+
+			if (pkg_id == ~0)
+				pkg_id = id;
+			if (pkg_id != id)
+				return ~0;
+		}
+	}
+
+	return pkg_id;
+}
+
+/*
+ * Validate the SLIT table is of the form expected for SNC, specifically:
+ *
+ *  - each on-trace cluster should be symmetric,
+ *  - each on-trace cluster should have a unique package-id.
+ *
+ * If you NUMA_EMU on top of SNC, you get to keep the pieces.
+ */
+static bool slit_validate(void)
+{
+	int u = topology_num_nodes_per_package();
+	u32 pkg_id, prev_pkg_id = ~0;
+
+	for (int pkg = 0; pkg < topology_max_packages(); pkg++) {
+		int n = pkg * u;
+
+		/*
+		 * Ensure the on-trace cluster is symmetric and each cluster
+		 * has a different package id.
+		 */
+		if (!slit_cluster_symmetric(n))
+			return false;
+		pkg_id = slit_cluster_package(n);
+		if (pkg_id == ~0)
+			return false;
+		if (pkg && pkg_id == prev_pkg_id)
+			return false;
+
+		prev_pkg_id = pkg_id;
+	}
+
+	return true;
+}
+
+/*
+ * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with
+ * asymmetric off-trace clusters, reflecting physical asymmetries. However
+ * this leads to 'unfortunate' sched_domain configurations.
+ *
+ * For example dual socket GNR with SNC-3:
+ *
+ * node distances:
+ * node     0    1    2    3    4    5
+ *     0:   10   15   17   21   28   26
+ *     1:   15   10   15   23   26   23
+ *     2:   17   15   10   26   23   21
+ *     3:   21   28   26   10   15   17
+ *     4:   23   26   23   15   10   15
+ *     5:   26   23   21   17   15   10
+ *
+ * Fix things up by averaging out the off-trace clusters; resulting in:
+ *
+ * node     0    1    2    3    4    5
+ *     0:   10   15   17   24   24   24
+ *     1:   15   10   15   24   24   24
+ *     2:   17   15   10   24   24   24
+ *     3:   24   24   24   10   15   17
+ *     4:   24   24   24   15   10   15
+ *     5:   24   24   24   17   15   10
+ */
+static int slit_cluster_distance(int i, int j)
+{
+	static int slit_valid = -1;
+	int u = topology_num_nodes_per_package();
+	long d = 0;
+	int x, y;
+
+	if (slit_valid < 0) {
+		slit_valid = slit_validate();
+		if (!slit_valid)
+			pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n");
+		else
+			pr_info("Fixing up SNC SLIT table.\n");
+	}
+
+	/*
+	 * Is this a unit cluster on the trace?
+	 */
+	if ((i / u) == (j / u) || !slit_valid)
+		return node_distance(i, j);
+
+	/*
+	 * Off-trace cluster.
+	 *
+	 * Notably average out the symmetric pair of off-trace clusters to
+	 * ensure the resulting SLIT table is symmetric.
+	 */
+	x = i - (i % u);
+	y = j - (j % u);
+
+	for (i = x; i < x + u; i++) {
+		for (j = y; j < y + u; j++) {
+			d += node_distance(i, j);
+			d += node_distance(j, i);
+		}
+	}
+
+	return d / (2*u*u);
 }
 
 int arch_sched_node_distance(int from, int to)
@@ -572,34 +688,14 @@ int arch_sched_node_distance(int from, int to)
 	switch (boot_cpu_data.x86_vfm) {
 	case INTEL_GRANITERAPIDS_X:
 	case INTEL_ATOM_DARKMONT_X:
-
-		if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 ||
-		    d < REMOTE_DISTANCE)
+		if (topology_max_packages() == 1 ||
+		    topology_num_nodes_per_package() < 3)
 			return d;
 
 		/*
-		 * With SNC enabled, there could be too many levels of remote
-		 * NUMA node distances, creating NUMA domain levels
-		 * including local nodes and partial remote nodes.
-		 *
-		 * Trim finer distance tuning for NUMA nodes in remote package
-		 * for the purpose of building sched domains. Group NUMA nodes
-		 * in the remote package in the same sched group.
-		 * Simplify NUMA domains and avoid extra NUMA levels including
-		 * different remote NUMA nodes and local nodes.
-		 *
-		 * GNR and CWF don't expect systems with more than 2 packages
-		 * and more than 2 hops between packages. Single average remote
-		 * distance won't be appropriate if there are more than 2
-		 * packages as average distance to different remote packages
-		 * could be different.
+		 * Handle SNC-3 asymmetries.
 		 */
-		WARN_ONCE(topology_max_packages() > 2,
-			  "sched: Expect only up to 2 packages for GNR or CWF, "
-			  "but saw %d packages when building sched domains.",
-			  topology_max_packages());
-
-		d = avg_remote_numa_distance();
+		return slit_cluster_distance(from, to);
 	}
 	return d;
 }
-- 
Gitee


From f3a481182c883fd516021f51dd7b38e2f97da220 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 3 Mar 2026 11:55:44 +0100
Subject: [PATCH 17/17] x86/resctrl: Fix SNC detection

ANBZ: #33236

commit 59674fc9d0bfd96ce8a776680ee1cf22c28c9ac7 upstream.

Now that the x86 topology code has a sensible nodes-per-package
measure, that does not depend on the online status of CPUs, use this
to divinate the SNC mode.

Note that when Cluster on Die (CoD) is configured on older systems this
will also show multiple NUMA nodes per package. Intel Resource Director
Technology is incompatible with CoD. Print a warning and do not use the
fixup MSR_RMID_SNC_CONFIG.

Intel-SIG: commit 59674fc9d0bf x86/resctrl: Fix SNC detection.
Backport SNC divination.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Link: https://patch.msgid.link/aaCxbbgjL6OZ6VMd@agluck-desk3
Link: https://patch.msgid.link/20260303110100.367976706@infradead.org
[ Aubrey Li: amend commit log ]
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
---
 arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++-----------------------
 1 file changed, 5 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 52c14a7d36da..3ce7df1d52a5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -353,7 +353,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
 		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
 }
 
-/* CPU models that support MSR_RMID_SNC_CONFIG */
+/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */
 static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
 	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
 	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
@@ -364,40 +364,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
 	{}
 };
 
-/*
- * There isn't a simple hardware bit that indicates whether a CPU is running
- * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
- * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
- * the same NUMA node as CPU0.
- * It is not possible to accurately determine SNC state if the system is
- * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
- * to L3 caches. It will be OK if system is booted with hyperthreading
- * disabled (since this doesn't affect the ratio).
- */
 static __init int snc_get_config(void)
 {
-	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
-	const cpumask_t *node0_cpumask;
-	int cpus_per_node, cpus_per_l3;
-	int ret;
-
-	if (!x86_match_cpu(snc_cpu_ids) || !ci)
-		return 1;
+	int ret = topology_num_nodes_per_package();
 
-	cpus_read_lock();
-	if (num_online_cpus() != num_present_cpus())
-		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
-	cpus_read_unlock();
-
-	node0_cpumask = cpumask_of_node(cpu_to_node(0));
-
-	cpus_per_node = cpumask_weight(node0_cpumask);
-	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-
-	if (!cpus_per_node || !cpus_per_l3)
+	if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) {
+		pr_warn("CoD enabled system? Resctrl not supported\n");
 		return 1;
-
-	ret = cpus_per_l3 / cpus_per_node;
+	}
 
 	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
 	switch (ret) {
-- 
Gitee