diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 4e4870031265c74977e2e6da4f81387a9bda236c..f354e8d1e4870da06c43176330ba8f5f4a64ef4e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -77,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif

 struct task_struct *secondary_current;
-bool has_big_cores;
-bool coregroup_enabled;
-bool thread_group_shares_l2;
-bool thread_group_shares_l3;
+bool has_big_cores __ro_after_init;
+bool coregroup_enabled __ro_after_init;
+bool thread_group_shares_l2 __ro_after_init;
+bool thread_group_shares_l3 __ro_after_init;

 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -93,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);

-enum {
-#ifdef CONFIG_SCHED_SMT
-	smt_idx,
-#endif
-	cache_idx,
-	mc_idx,
-	die_idx,
-};
-
 #define MAX_THREAD_LIST_SIZE	8
 #define THREAD_GROUP_SHARE_L1	1
 #define THREAD_GROUP_SHARE_L2_L3 2
@@ -987,7 +978,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
 	return 0;
 }

-static bool shared_caches;
+static bool shared_caches __ro_after_init;

 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymmetric SMT dependency */
@@ -1003,6 +994,13 @@ static int powerpc_smt_flags(void)
 }
 #endif

+/*
+ * On shared processor LPARs scheduled on a big core (which has two or more
+ * independent thread groups per core), prefer lower numbered CPUs, so that
+ * the workload consolidates onto fewer cores.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
+
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1011,9 +1009,20 @@ static int powerpc_smt_flags(void)
  */
 static int powerpc_shared_cache_flags(void)
 {
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
+
 	return SD_SHARE_PKG_RESOURCES;
 }

+static int powerpc_shared_proc_flags(void)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_ASYM_PACKING;
+
+	return 0;
+}
+
 /*
  * We can't just pass cpu_l2_cache_mask() directly because it
  * returns a non-const pointer and the compiler barfs on that.
@@ -1037,6 +1046,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu)

 static bool has_coregroup_support(void)
 {
+	/* Coregroup identification is not available on shared systems */
+	if (is_shared_processor())
+		return false;
+
 	return coregroup_enabled;
 }

@@ -1045,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu)
 	return cpu_coregroup_mask(cpu);
 }

-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
-	{ NULL, },
-};
-
 static int __init init_big_cores(void)
 {
 	int cpu;
@@ -1682,43 +1685,40 @@ void start_secondary(void *unused)
 	BUG();
 }

-static void __init fixup_topology(void)
+static struct sched_domain_topology_level powerpc_topology[6];
+
+static void __init build_sched_topology(void)
 {
-	int i;
+	int i = 0;
+
+	if (is_shared_processor() && has_big_cores)
+		static_branch_enable(&splpar_asym_pack);

 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[smt_idx].mask = smallcore_smt_mask;
+		powerpc_topology[i++] =
+			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
+	} else {
+		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
 	}
 #endif
+	if (shared_caches) {
+		powerpc_topology[i++] =
+			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
+	}

-	if (!has_coregroup_support())
-		powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask;
-
-	/*
-	 * Try to consolidate topology levels here instead of
-	 * allowing scheduler to degenerate.
-	 * - Dont consolidate if masks are different.
-	 * - Dont consolidate if sd_flags exists and are different.
-	 */
-	for (i = 1; i <= die_idx; i++) {
-		if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
-			continue;
+	if (has_coregroup_support()) {
+		powerpc_topology[i++] =
+			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
+	}

-		if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags &&
-		    powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags)
-			continue;
+	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);

-		if (!powerpc_topology[i - 1].sd_flags)
-			powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags;
+	/* There must be one trailing NULL entry left. */
+	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);

-		powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
-		powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
-#ifdef CONFIG_SCHED_DEBUG
-		powerpc_topology[i].name = powerpc_topology[i + 1].name;
-#endif
-	}
+	set_sched_topology(powerpc_topology);
 }

 void __init smp_cpus_done(unsigned int max_cpus)
@@ -1733,9 +1733,20 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();

 	dump_numa_cpu_topology();
+	build_sched_topology();
+}

-	fixup_topology();
-	set_sched_topology(powerpc_topology);
+/*
+ * For asym packing, by default the lower numbered CPU has higher priority.
+ * On shared processors, pack to the lower numbered core. However, avoid
+ * moving between thread_groups within the same core.
+ */
+int arch_asym_cpu_priority(int cpu)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return -cpu / threads_per_core;
+
+	return -cpu;
 }

 #ifdef CONFIG_HOTPLUG_CPU
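
A quick sanity check of the priority mapping above (illustrative, not part of the patch): C integer division truncates toward zero, so all SMT siblings of a core collapse onto one priority value and asym packing ranks whole cores instead of individual threads. A minimal user-space sketch, where threads_per_core = 8 is an assumed SPLPAR big-core value and asym_prio() is a hypothetical stand-in for arch_asym_cpu_priority():

#include <assert.h>

static const int threads_per_core = 8;	/* assumed big-core value */

/* Stand-in mirroring arch_asym_cpu_priority() from the hunk above. */
static int asym_prio(int cpu, int splpar_asym_pack)
{
	if (splpar_asym_pack)
		return -cpu / threads_per_core;
	return -cpu;
}

int main(void)
{
	/* All eight threads of core 0 share priority 0 ... */
	assert(asym_prio(0, 1) == 0 && asym_prio(7, 1) == 0);
	/* ... all threads of core 1 share the lower priority -1. */
	assert(asym_prio(8, 1) == -1 && asym_prio(15, 1) == -1);
	return 0;
}

Higher value means higher priority, so packing prefers core 0 over core 1 while the threads of one core tie, which keeps the balancer from shuffling tasks between thread groups within a core.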
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 66bda6a8f918c27badbe1ba6077d65a542928460..4d94c45022eb39c112280cfcc597cdfcdf9fab3d 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -518,11 +518,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
 }

 static struct sched_domain_topology_level s390_topology[] = {
-	{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
-	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+	SDTL_INIT(cpu_book_mask, NULL, BOOK),
+	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index ef2844d691735d843c5f9d907295268208282fc3..beaf69ad2f42cf296a2ec6a56d598b4a1902a4ce 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -24,6 +24,7 @@ extern int numa_off;
  */
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
+extern nodemask_t numa_phys_nodes_parsed __initdata;

 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 extern void __init numa_set_distance(int from, int to, int distance);
@@ -57,6 +58,7 @@ extern void __init init_cpu_to_node(void);
 extern void numa_add_cpu(int cpu);
 extern void numa_remove_cpu(int cpu);
 extern void init_gi_nodes(void);
+extern int num_phys_nodes(void);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
@@ -64,6 +66,10 @@ static inline void init_cpu_to_node(void)		{ }
 static inline void numa_add_cpu(int cpu)		{ }
 static inline void numa_remove_cpu(int cpu)		{ }
 static inline void init_gi_nodes(void)			{ }
+static inline int num_phys_nodes(void)
+{
+	return 1;
+}
 #endif	/* CONFIG_NUMA */

 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 01ae10c049caadbd0e5713b7ded3b0f974388cab..c8601213df220e00187bd2e8c910abc72d2a5de1 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -148,6 +148,7 @@ extern unsigned int __max_logical_packages;
 extern unsigned int __max_threads_per_core;
 extern unsigned int __num_threads_per_package;
 extern unsigned int __num_cores_per_package;
+extern unsigned int __num_nodes_per_package;

 static inline unsigned int topology_max_packages(void)
 {
@@ -172,6 +173,11 @@ static inline unsigned int topology_num_threads_per_package(void)
 	return __num_threads_per_package;
 }

+static inline unsigned int topology_num_nodes_per_package(void)
+{
+	return __num_nodes_per_package;
+}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level);
 #else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b27077af3bac68aa0d6f21314e3d105161c2891e..20df8203ce2cdd41a440e38698749e4aa194d692 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -84,6 +84,9 @@ EXPORT_SYMBOL(__max_dies_per_package);
 unsigned int __max_logical_packages __ro_after_init = 1;
 EXPORT_SYMBOL(__max_logical_packages);

+unsigned int __num_nodes_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_nodes_per_package);
+
 unsigned int __num_cores_per_package __ro_after_init = 1;
 EXPORT_SYMBOL(__num_cores_per_package);

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 52c14a7d36daf9a04fa1db10809df577c84ba08c..3ce7df1d52a598ab6b614d350568df04f685f568 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -353,7 +353,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
 		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
 }

-/* CPU models that support MSR_RMID_SNC_CONFIG */
+/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */
 static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
 	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
 	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
@@ -364,40 +364,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
 	{}
 };

-/*
- * There isn't a simple hardware bit that indicates whether a CPU is running
- * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
- * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
- * the same NUMA node as CPU0.
- * It is not possible to accurately determine SNC state if the system is
- * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
- * to L3 caches. It will be OK if system is booted with hyperthreading
- * disabled (since this doesn't affect the ratio).
- */
 static __init int snc_get_config(void)
 {
-	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
-	const cpumask_t *node0_cpumask;
-	int cpus_per_node, cpus_per_l3;
-	int ret;
-
-	if (!x86_match_cpu(snc_cpu_ids) || !ci)
-		return 1;
+	int ret = topology_num_nodes_per_package();

-	cpus_read_lock();
-	if (num_online_cpus() != num_present_cpus())
-		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
-	cpus_read_unlock();
-
-	node0_cpumask = cpumask_of_node(cpu_to_node(0));
-
-	cpus_per_node = cpumask_weight(node0_cpumask);
-	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-
-	if (!cpus_per_node || !cpus_per_l3)
+	if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) {
+		pr_warn("CoD enabled system? Resctrl not supported\n");
 		return 1;
-
-	ret = cpus_per_l3 / cpus_per_node;
+	}

 	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
 	switch (ret) {
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 0f612a31181cb0e092a716ea5a1d17c0b5473ecd..4c9c7dca0ee0fecd898d769d4d58f766489106ab 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <asm/numa.h>

 #include "cpu.h"

@@ -509,11 +510,19 @@ void __init topology_init_possible_cpus(void)
 	set_nr_cpu_ids(allowed);

 	cnta = domain_weight(TOPO_PKG_DOMAIN);
-	cntb = domain_weight(TOPO_DIE_DOMAIN);
 	__max_logical_packages = cnta;
+
+	pr_info("Max. logical packages: %3u\n", __max_logical_packages);
+
+	cntb = num_phys_nodes();
+	__num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
+
+	pr_info("Max. logical nodes: %3u\n", cntb);
+	pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);
+
+	cntb = domain_weight(TOPO_DIE_DOMAIN);
 	__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));

-	pr_info("Max. logical packages: %3u\n", cnta);
 	pr_info("Max. logical dies: %3u\n", cntb);
 	pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
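
An illustration of the computation above (numbers assumed, not taken from the patch): on a two-socket Granite Rapids system with SNC-3, SRAT parses six physical CPU nodes across two packages, so __num_nodes_per_package = DIV_ROUND_UP(6, 2) = 3; with SNC off it is DIV_ROUND_UP(2, 2) = 1, and snc_get_config() above now merely sanity-checks that precomputed value instead of inferring it from cpumask ratios. A stand-alone check of the arithmetic:

#include <assert.h>

/* Same definition as the kernel's DIV_ROUND_UP(). */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	assert(DIV_ROUND_UP(6, 2) == 3);	/* 2 packages, SNC-3 */
	assert(DIV_ROUND_UP(2, 2) == 1);	/* 2 packages, SNC off */
	assert(DIV_ROUND_UP(3, 2) == 2);	/* rounds up on uneven splits */
	return 0;
}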
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3426d6aea42b43a132b5c416a252476762ce9420..00576d95b465d54b9f757156ad45405f6eed7240 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -491,12 +491,6 @@ static int x86_core_flags(void)
 	return cpu_core_flags() | x86_sched_itmt_flags();
 }
 #endif
-#ifdef CONFIG_SCHED_SMT
-static int x86_smt_flags(void)
-{
-	return cpu_smt_flags();
-}
-#endif
 #ifdef CONFIG_SCHED_CLUSTER
 static int x86_cluster_flags(void)
 {
@@ -504,81 +498,187 @@ static int x86_cluster_flags(void)
 }
 #endif

-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
-
-static struct sched_domain_topology_level x86_topology[6];
-
-static void __init build_sched_topology(void)
-{
-	int i = 0;
-
-#ifdef CONFIG_SCHED_SMT
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
-	};
-#endif
+static struct sched_domain_topology_level x86_topology[] = {
+	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #ifdef CONFIG_SCHED_CLUSTER
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
-	};
+	SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
 #endif
 #ifdef CONFIG_SCHED_MC
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
-	};
+	SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
 #endif
+	SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+	{ NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+	struct sched_domain_topology_level *topology = x86_topology;
+
 	/*
-	 * When there is NUMA topology inside the package skip the PKG domain
-	 * since the NUMA domains will auto-magically create the right spanning
-	 * domains based on the SLIT.
+	 * When there is NUMA topology inside the package invalidate the
+	 * PKG domain since the NUMA domains will auto-magically create the
+	 * right spanning domains based on the SLIT.
 	 */
-	if (!x86_has_numa_in_package) {
-		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
-		};
+	if (topology_num_nodes_per_package() > 1) {
+		unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+		memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
 	}

 	/*
-	 * There must be one trailing NULL entry left.
+	 * Drop the SMT domains if there is only one thread per core,
+	 * since they'll get degenerated by the scheduler anyway.
 	 */
-	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+	if (cpu_smt_num_threads <= 1)
+		++topology;

-	set_sched_topology(x86_topology);
+	set_sched_topology(topology);
 }

 #ifdef CONFIG_NUMA
-static int sched_avg_remote_distance;

-static int avg_remote_numa_distance(void)
+/*
+ * Test if the on-trace cluster at (N,N) is symmetric.
+ * Uses upper triangle iteration to avoid obvious duplicates.
+ */
+static bool slit_cluster_symmetric(int N)
 {
-	int i, j;
-	int distance, nr_remote, total_distance;
-
-	if (sched_avg_remote_distance > 0)
-		return sched_avg_remote_distance;
-
-	nr_remote = 0;
-	total_distance = 0;
-	for_each_node_state(i, N_CPU) {
-		for_each_node_state(j, N_CPU) {
-			distance = node_distance(i, j);
-
-			if (distance >= REMOTE_DISTANCE) {
-				nr_remote++;
-				total_distance += distance;
-			}
+	int u = topology_num_nodes_per_package();
+
+	for (int k = 0; k < u; k++) {
+		for (int l = k; l < u; l++) {
+			if (node_distance(N + k, N + l) !=
+			    node_distance(N + l, N + k))
+				return false;
 		}
 	}

-	if (nr_remote)
-		sched_avg_remote_distance = total_distance / nr_remote;
-	else
-		sched_avg_remote_distance = REMOTE_DISTANCE;
-	return sched_avg_remote_distance;
+	return true;
+}
+
+/*
+ * Return the package-id of the cluster, or ~0 if indeterminate.
+ * Each node in the on-trace cluster should have the same package-id.
+ */
+static u32 slit_cluster_package(int N)
+{
+	int u = topology_num_nodes_per_package();
+	u32 pkg_id = ~0;
+
+	for (int n = 0; n < u; n++) {
+		const struct cpumask *cpus = cpumask_of_node(N + n);
+		int cpu;
+
+		for_each_cpu(cpu, cpus) {
+			u32 id = topology_logical_package_id(cpu);
+
+			if (pkg_id == ~0)
+				pkg_id = id;
+			if (pkg_id != id)
+				return ~0;
+		}
+	}
+
+	return pkg_id;
+}
+
+/*
+ * Validate the SLIT table is of the form expected for SNC, specifically:
+ *
+ * - each on-trace cluster should be symmetric,
+ * - each on-trace cluster should have a unique package-id.
+ *
+ * If you NUMA_EMU on top of SNC, you get to keep the pieces.
+ */
+static bool slit_validate(void)
+{
+	int u = topology_num_nodes_per_package();
+	u32 pkg_id, prev_pkg_id = ~0;
+
+	for (int pkg = 0; pkg < topology_max_packages(); pkg++) {
+		int n = pkg * u;
+
+		/*
+		 * Ensure the on-trace cluster is symmetric and each cluster
+		 * has a different package id.
+		 */
+		if (!slit_cluster_symmetric(n))
+			return false;
+
+		pkg_id = slit_cluster_package(n);
+		if (pkg_id == ~0)
+			return false;
+		if (pkg && pkg_id == prev_pkg_id)
+			return false;
+
+		prev_pkg_id = pkg_id;
+	}
+
+	return true;
+}
+
+/*
+ * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with
+ * asymmetric off-trace clusters, reflecting physical asymmetries. However,
+ * this leads to 'unfortunate' sched_domain configurations.
+ *
+ * For example, dual socket GNR with SNC-3:
+ *
+ *   node distances:
+ *   node   0   1   2   3   4   5
+ *     0:  10  15  17  21  28  26
+ *     1:  15  10  15  23  26  23
+ *     2:  17  15  10  26  23  21
+ *     3:  21  28  26  10  15  17
+ *     4:  23  26  23  15  10  15
+ *     5:  26  23  21  17  15  10
+ *
+ * Fix things up by averaging out the off-trace clusters; resulting in:
+ *
+ *   node   0   1   2   3   4   5
+ *     0:  10  15  17  24  24  24
+ *     1:  15  10  15  24  24  24
+ *     2:  17  15  10  24  24  24
+ *     3:  24  24  24  10  15  17
+ *     4:  24  24  24  15  10  15
+ *     5:  24  24  24  17  15  10
+ */
+static int slit_cluster_distance(int i, int j)
+{
+	static int slit_valid = -1;
+	int u = topology_num_nodes_per_package();
+	long d = 0;
+	int x, y;
+
+	if (slit_valid < 0) {
+		slit_valid = slit_validate();
+		if (!slit_valid)
+			pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n");
+		else
+			pr_info("Fixing up SNC SLIT table.\n");
+	}
+
+	/*
+	 * Is this a unit cluster on the trace?
+	 */
+	if ((i / u) == (j / u) || !slit_valid)
+		return node_distance(i, j);
+
+	/*
+	 * Off-trace cluster.
+	 *
+	 * Notably, average out the symmetric pair of off-trace clusters to
+	 * ensure the resulting SLIT table is symmetric.
+	 */
+	x = i - (i % u);
+	y = j - (j % u);
+
+	for (i = x; i < x + u; i++) {
+		for (j = y; j < y + u; j++) {
+			d += node_distance(i, j);
+			d += node_distance(j, i);
+		}
+	}
+
+	return d / (2*u*u);
 }

 int arch_sched_node_distance(int from, int to)
@@ -588,34 +688,14 @@ int arch_sched_node_distance(int from, int to)
 	switch (boot_cpu_data.x86_vfm) {
 	case INTEL_GRANITERAPIDS_X:
 	case INTEL_ATOM_DARKMONT_X:
-
-		if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
-		    d < REMOTE_DISTANCE)
+		if (topology_max_packages() == 1 ||
+		    topology_num_nodes_per_package() < 3)
 			return d;

 		/*
-		 * With SNC enabled, there could be too many levels of remote
-		 * NUMA node distances, creating NUMA domain levels including
-		 * local nodes and partial remote nodes.
-		 *
-		 * Trim finer distance tuning for NUMA nodes in remote package
-		 * for the purpose of building sched domains. Group NUMA nodes
-		 * in the remote package in the same sched group. Simplify
-		 * NUMA domains and avoid extra NUMA levels including different
-		 * remote NUMA nodes and local nodes.
-		 *
-		 * GNR and CWF don't expect systems with more than 2 packages
-		 * and more than 2 hops between packages. Single average remote
-		 * distance won't be appropriate if there are more than 2
-		 * packages as average distance to different remote packages
-		 * could be different.
+		 * Handle SNC-3 asymmetries.
 		 */
-		WARN_ONCE(topology_max_packages() > 2,
-			  "sched: Expect only up to 2 packages for GNR or CWF, "
-			  "but saw %d packages when building sched domains.",
-			  topology_max_packages());
-
-		d = avg_remote_numa_distance();
+		return slit_cluster_distance(from, to);
 	}

 	return d;
 }
@@ -645,7 +725,7 @@ void set_cpu_sibling_map(int cpu)
 		o = &cpu_data(i);

 		if (match_pkg(c, o) && !topology_same_node(c, o))
-			x86_has_numa_in_package = true;
+			WARN_ON_ONCE(topology_num_nodes_per_package() == 1);

 		if ((i == cpu) || (has_smt && match_smt(c, o)))
 			link_mask(topology_sibling_cpumask, cpu, i);
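
A worked example (not part of the patch) of where the fixed-up value 24 in the comment comes from: for u = 3, i.e. topology_num_nodes_per_package() on SNC-3, the loop in slit_cluster_distance() sums both 3x3 off-trace blocks of the example matrix, 217 each for 434 in total, and divides by 2*u*u = 18, giving 24 by integer division. A self-contained check against the GNR SNC-3 table quoted in the comment above:

#include <assert.h>

static const int slit[6][6] = {
	{ 10, 15, 17, 21, 28, 26 },
	{ 15, 10, 15, 23, 26, 23 },
	{ 17, 15, 10, 26, 23, 21 },
	{ 21, 28, 26, 10, 15, 17 },
	{ 23, 26, 23, 15, 10, 15 },
	{ 26, 23, 21, 17, 15, 10 },
};

/* Mirrors the off-trace branch of slit_cluster_distance(), with u = 3. */
static int cluster_distance(int i, int j)
{
	const int u = 3;
	int x = i - (i % u), y = j - (j % u);
	long d = 0;

	for (i = x; i < x + u; i++)
		for (j = y; j < y + u; j++)
			d += slit[i][j] + slit[j][i];

	return d / (2 * u * u);
}

int main(void)
{
	assert(cluster_distance(0, 3) == 24);	/* 434 / 18 */
	assert(cluster_distance(5, 1) == 24);	/* same sum from either side */
	return 0;
}

Summing both d(i,j) and d(j,i) is what guarantees the sanitized table is symmetric even when the firmware-provided blocks are not.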
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index df959deef5b242e2aead2f32adf4cda9654f5b1a..0958df9bccd926df92feb764afa4c11bb9df9de5 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -55,6 +55,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };

+nodemask_t numa_phys_nodes_parsed __initdata;
+
 int numa_cpu_node(int cpu)
 {
 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
@@ -64,6 +66,11 @@ int numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }

+int __init num_phys_nodes(void)
+{
+	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
+}
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);

@@ -674,6 +681,7 @@ static int __init dummy_numa_init(void)
 		   0LLU, PFN_PHYS(max_pfn) - 1);

 	node_set(0, numa_nodes_parsed);
+	node_set(0, numa_phys_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

 	return 0;
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 9c52a95937adecc29f8d85f2469ff72600414ed7..44ca66651756170091dd1bf279aedea84e1e50a6 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,8 +57,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
-	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
-	       pxm, apic_id, node);
+	node_set(node, numa_phys_nodes_parsed);
+	pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
 }

 /* Callback for Proximity Domain -> LAPIC mapping */
@@ -98,8 +98,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)

 	set_apicid_to_node(apic_id, node);
 	node_set(node, numa_nodes_parsed);
-	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
-	       pxm, apic_id, node);
+	node_set(node, numa_phys_nodes_parsed);
+	pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
 }

 int __init x86_acpi_numa_init(void)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7ce562c7fd64990316699ea76b9855e3145da2e5..f489cf2636f15fcdb6aa7ff148d5e345d7e3f8c8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -147,9 +147,7 @@ struct sched_domain {
 	unsigned int		ttwu_move_affine;
 	unsigned int		ttwu_move_balance;
 #endif
-#ifdef CONFIG_SCHED_DEBUG
 	char			*name;
-#endif
 	union {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
@@ -210,20 +208,14 @@ struct sched_domain_topology_level {
 	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
-#ifdef CONFIG_SCHED_DEBUG
 	char                *name;
-#endif
 };

 extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);

-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(type)		.name = #type
-#else
-# define SD_INIT_NAME(type)
-#endif
+#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \
+	{ .mask = maskfn, .sd_flags = flagsfn, .name = #dname })

 #else /* CONFIG_SMP */

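An aside on the macro (my reading, not from the patch text): SDTL_INIT() is a compound literal, so the table conversions throughout this series are mechanical renames. For example:

/* Old style -- .name only set when CONFIG_SCHED_DEBUG is on: */
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },

/* New style, which expands to: */
((struct sched_domain_topology_level){
	.mask = cpu_smt_mask, .sd_flags = cpu_smt_flags, .name = "SMT" }),

The one behavioral change is that .name is now populated unconditionally, which is also why the CONFIG_SCHED_DEBUG guards around the name fields and the debug printouts can be dropped below.
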
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e8ae21c49a9d946147c770d6c0d2d16f6f9f1010..24978f702f2b2e68731fddbdb93974bd1967318b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1683,9 +1683,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
-#ifdef CONFIG_SCHED_DEBUG
 		.name			= tl->name,
-#endif
 	};

 	sd_span = sched_domain_span(sd);
@@ -1749,17 +1747,17 @@ sd_init(struct sched_domain_topology_level *tl,
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #endif

 #ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+	SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
 #endif

 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
 #endif

-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };

@@ -2072,23 +2070,15 @@ void sched_init_numa(int offline_node)
 	/*
 	 * Add the NUMA identity distance, aka single NODE.
 	 */
-	tl[i++] = (struct sched_domain_topology_level){
-		.mask = sd_numa_mask,
-		.numa_level = 0,
-		SD_INIT_NAME(NODE)
-	};
+	tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);

 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 1; j < nr_levels; i++, j++) {
-		tl[i] = (struct sched_domain_topology_level){
-			.mask = sd_numa_mask,
-			.sd_flags = cpu_numa_flags,
-			.flags = SDTL_OVERLAP,
-			.numa_level = j,
-			SD_INIT_NAME(NUMA)
-		};
+		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+		tl[i].numa_level = j;
+		tl[i].flags = SDTL_OVERLAP;
 	}

 	sched_domain_topology_saved = sched_domain_topology;
@@ -2441,10 +2431,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

 	if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) {
 		pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
 		pr_err("     the %s domain not a subset of the %s domain\n",
 		       child->name, sd->name);
-#endif
 		/* Fixup, ensure @sd has at least @child CPUs. */
 		cpumask_or(sched_domain_span(sd),
 			   sched_domain_span(sd),
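
A closing note, not from the patch itself: since SDTL_INIT() only fills .mask, .sd_flags and .name (every other field is zero-initialized by the compound literal), the NUMA-specific fields still have to be set separately, hence the explicit numa_level and SDTL_OVERLAP assignments in the sched_init_numa() loop above. For an assumed nr_levels of 2, the table it builds is equivalent to:

/* Illustrative only; index values assumed. */
tl[i]     = SDTL_INIT(sd_numa_mask, NULL, NODE);		/* .numa_level = 0 */
tl[i + 1] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);	/* .numa_level = 1, .flags = SDTL_OVERLAP */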