
Commit 8a5cfa2

sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()
JIRA: https://issues.redhat.com/browse/RHEL-118964

Conflicts: Minor context differences in topology.h

commit 661f951
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Mon Aug 25 12:02:44 2025 +0000

    sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()

    Leon [1] and Vinicius [2] noted a topology_span_sane() warning during
    their testing starting from v6.16-rc1. Debug that followed pointed to
    the tl->mask() for the NODE domain being incorrectly resolved to that
    of the highest NUMA domain.

    tl->mask() for NODE is set to the sd_numa_mask() which depends on the
    global "sched_domains_curr_level" hack. "sched_domains_curr_level" is
    set to the "tl->numa_level" during tl traversal in
    build_sched_domains() calling sd_init() but was not reset before
    topology_span_sane().

    Since "sched_domains_curr_level" still reflected the old value from
    build_sched_domains(), topology_span_sane() for the NODE domain trips
    when the span of the last NUMA domain overlaps.

    Instead of replicating the "sched_domains_curr_level" hack, get rid of
    it entirely and instead pass the entire "sched_domain_topology_level"
    object to the tl->cpumask() function to prevent such mishaps in the
    future. sd_numa_mask() now directly references "tl->numa_level"
    instead of relying on the global "sched_domains_curr_level" hack to
    index into sched_domains_numa_masks[].

    The original warning was reproducible on the following NUMA topology
    reported by Leon:

    $ sudo numactl -H
    available: 5 nodes (0-4)
    node 0 cpus: 0 1
    node 0 size: 2927 MB
    node 0 free: 1603 MB
    node 1 cpus: 2 3
    node 1 size: 3023 MB
    node 1 free: 3008 MB
    node 2 cpus: 4 5
    node 2 size: 3023 MB
    node 2 free: 3007 MB
    node 3 cpus: 6 7
    node 3 size: 3023 MB
    node 3 free: 3002 MB
    node 4 cpus: 8 9
    node 4 size: 3022 MB
    node 4 free: 2718 MB
    node distances:
    node   0   1   2   3   4
      0:  10  39  38  37  36
      1:  39  10  38  37  36
      2:  38  38  10  37  36
      3:  37  37  37  10  36
      4:  36  36  36  36  10

    The above topology can be mimicked using the following QEMU cmd that
    was used to reproduce the warning and test the fix:

    sudo qemu-system-x86_64 -enable-kvm -cpu host \
    -m 20G -smp cpus=10,sockets=10 -machine q35 \
    -object memory-backend-ram,size=4G,id=m0 \
    -object memory-backend-ram,size=4G,id=m1 \
    -object memory-backend-ram,size=4G,id=m2 \
    -object memory-backend-ram,size=4G,id=m3 \
    -object memory-backend-ram,size=4G,id=m4 \
    -numa node,cpus=0-1,memdev=m0,nodeid=0 \
    -numa node,cpus=2-3,memdev=m1,nodeid=1 \
    -numa node,cpus=4-5,memdev=m2,nodeid=2 \
    -numa node,cpus=6-7,memdev=m3,nodeid=3 \
    -numa node,cpus=8-9,memdev=m4,nodeid=4 \
    -numa dist,src=0,dst=1,val=39 \
    -numa dist,src=0,dst=2,val=38 \
    -numa dist,src=0,dst=3,val=37 \
    -numa dist,src=0,dst=4,val=36 \
    -numa dist,src=1,dst=0,val=39 \
    -numa dist,src=1,dst=2,val=38 \
    -numa dist,src=1,dst=3,val=37 \
    -numa dist,src=1,dst=4,val=36 \
    -numa dist,src=2,dst=0,val=38 \
    -numa dist,src=2,dst=1,val=38 \
    -numa dist,src=2,dst=3,val=37 \
    -numa dist,src=2,dst=4,val=36 \
    -numa dist,src=3,dst=0,val=37 \
    -numa dist,src=3,dst=1,val=37 \
    -numa dist,src=3,dst=2,val=37 \
    -numa dist,src=3,dst=4,val=36 \
    -numa dist,src=4,dst=0,val=36 \
    -numa dist,src=4,dst=1,val=36 \
    -numa dist,src=4,dst=2,val=36 \
    -numa dist,src=4,dst=3,val=36 \
    ...

    [ prateek: Moved common functions to include/linux/sched/topology.h,
      reuse the common bits for s390 and ppc, commit message ]

    Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1]
    Fixes: ccf7412 ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7d, f55dac1
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Reported-by: Leon Romanovsky <leon@kernel.org>
    Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Reviewed-by: Valentin Schneider <vschneid@redhat.com>
    Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
    Tested-by: Valentin Schneider <vschneid@redhat.com> # x86
    Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com> # powerpc
    Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2]

Signed-off-by: Phil Auld <pauld@redhat.com>
1 parent 3a55f9d commit 8a5cfa2
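The crux of the change is easiest to see in the kernel/sched/topology.c hunk near the bottom: sd_numa_mask() used to read a global ("sched_domains_curr_level") that only build_sched_domains() kept in sync, so any other caller of tl->mask(), here topology_span_sane(), could observe a stale level. What follows is a minimal userspace sketch of that failure mode, not kernel code; the names are stand-ins for sched_domains_curr_level, sched_domains_numa_masks[], and the topology-level object:

#include <stdio.h>

#define NR_LEVELS 3

/* Stand-in for sched_domains_numa_masks[level]: wider span per level. */
static const unsigned long numa_masks[NR_LEVELS] = { 0x003, 0x00f, 0x3ff };

/* Before the fix: the lookup depends on a global set by someone else. */
static int curr_level;

static unsigned long mask_via_global(int cpu)
{
	(void)cpu;
	return numa_masks[curr_level];
}

/* After the fix: the level travels with the topology-level object. */
struct topology_level { int numa_level; };

static unsigned long mask_via_tl(const struct topology_level *tl, int cpu)
{
	(void)cpu;
	return numa_masks[tl->numa_level];
}

int main(void)
{
	struct topology_level node = { .numa_level = 0 };

	/* Stale state left behind by a prior domain-build traversal. */
	curr_level = NR_LEVELS - 1;

	/* The global-based lookup resolves NODE to the widest NUMA span... */
	printf("global-based NODE mask: %#lx (stale level)\n", mask_via_global(0));
	/* ...while the tl-based lookup is a pure function of its arguments. */
	printf("tl-based     NODE mask: %#lx (correct)\n", mask_via_tl(&node, 0));
	return 0;
}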

8 files changed (+66, -53)

arch/powerpc/Kconfig

Lines changed: 4 additions & 0 deletions
@@ -853,6 +853,10 @@ config SCHED_SMT
 	  when dealing with POWER5 cpus at a cost of slightly increased
 	  overhead in some places. If unsure say N here.
 
+config SCHED_MC
+	def_bool y
+	depends on SMP
+
 config PPC_DENORMALISATION
 	bool "PowerPC denormalisation exception handling"
 	depends on PPC_BOOK3S_64

arch/powerpc/include/asm/topology.h

Lines changed: 2 additions & 0 deletions
@@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu)
 #ifdef CONFIG_SMP
 #include <asm/cputable.h>
 
+struct cpumask *cpu_coregroup_mask(int cpu);
+
 #ifdef CONFIG_PPC64
 #include <asm/smp.h>

arch/powerpc/kernel/smp.c

Lines changed: 11 additions & 16 deletions
@@ -1040,19 +1040,19 @@ static int powerpc_shared_proc_flags(void)
  * We can't just pass cpu_l2_cache_mask() directly because
  * returns a non-const pointer and the compiler barfs on that.
  */
-static const struct cpumask *shared_cache_mask(int cpu)
+static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
+static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return cpu_smallcore_mask(cpu);
 }
 #endif
 
-static struct cpumask *cpu_coregroup_mask(int cpu)
+struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return per_cpu(cpu_coregroup_map, cpu);
 }
@@ -1066,11 +1066,6 @@ static bool has_coregroup_support(void)
 	return coregroup_enabled;
 }
 
-static const struct cpumask *cpu_mc_mask(int cpu)
-{
-	return cpu_coregroup_mask(cpu);
-}
-
 static int __init init_big_cores(void)
 {
 	int cpu;
@@ -1459,7 +1454,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
 		return false;
 	}
 
-	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
 
 	/* Update l2-cache mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
@@ -1549,7 +1544,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
 		return;
 	}
 
-	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
 
 	/* Update coregroup mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
@@ -1612,7 +1607,7 @@ static void add_cpu_to_masks(int cpu)
 
 	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
 	if (chip_id == -1)
-		cpumask_and(mask, mask, cpu_cpu_mask(cpu));
+		cpumask_and(mask, mask, cpu_node_mask(cpu));
 
 	for_each_cpu(i, mask) {
 		if (chip_id == cpu_to_chip_id(i)) {
@@ -1714,22 +1709,22 @@ static void __init build_sched_topology(void)
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
 		powerpc_topology[i++] =
-			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
+			SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT);
 	} else {
-		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
+		powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT);
 	}
 #endif
 	if (shared_caches) {
 		powerpc_topology[i++] =
-			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
+			SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE);
 	}
 
 	if (has_coregroup_support()) {
 		powerpc_topology[i++] =
-			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
+			SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);
 	}
 
-	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
+	powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);
 
 	/* There must be one trailing NULL entry left. */
 	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);

arch/s390/kernel/topology.c

Lines changed: 7 additions & 13 deletions
@@ -513,33 +513,27 @@ int topology_cpu_init(struct cpu *cpu)
 	return rc;
 }
 
-static const struct cpumask *cpu_thread_mask(int cpu)
-{
-	return &cpu_topology[cpu].thread_mask;
-}
-
-
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return &cpu_topology[cpu].core_mask;
 }
 
-static const struct cpumask *cpu_book_mask(int cpu)
+static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return &cpu_topology[cpu].book_mask;
 }
 
-static const struct cpumask *cpu_drawer_mask(int cpu)
+static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return &cpu_topology[cpu].drawer_mask;
 }
 
 static struct sched_domain_topology_level s390_topology[] = {
-	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
-	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
-	SDTL_INIT(cpu_book_mask, NULL, BOOK),
-	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
-	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
+	SDTL_INIT(tl_book_mask, NULL, BOOK),
+	SDTL_INIT(tl_drawer_mask, NULL, DRAWER),
+	SDTL_INIT(tl_pkg_mask, NULL, PKG),
 	{ NULL, },
 };

arch/x86/kernel/smpboot.c

Lines changed: 4 additions & 4 deletions
@@ -505,14 +505,14 @@ static int x86_cluster_flags(void)
 static bool x86_has_numa_in_package;
 
 static struct sched_domain_topology_level x86_topology[] = {
-	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
 #ifdef CONFIG_SCHED_CLUSTER
-	SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
+	SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
 #endif
 #ifdef CONFIG_SCHED_MC
-	SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
+	SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
 #endif
-	SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+	SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
 	{ NULL },
 };

include/linux/sched/topology.h

Lines changed: 27 additions & 1 deletion
@@ -35,27 +35,53 @@ extern const struct sd_flag_debug sd_flag_debug[];
 
 #endif
 
+struct sched_domain_topology_level;
+
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
 	return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_smt_mask(cpu);
+}
 #endif
 
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
 	return SD_CLUSTER | SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_clustergroup_mask(cpu);
+}
 #endif
 
 #ifdef CONFIG_SCHED_MC
 static inline int cpu_core_flags(void)
 {
 	return SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_coregroup_mask(cpu);
+}
 #endif
 
+static inline const
+struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_node_mask(cpu);
+}
+
 #ifdef CONFIG_NUMA
 static inline int cpu_numa_flags(void)
 {
@@ -185,7 +211,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu);
 bool cpus_share_cache(int this_cpu, int that_cpu);
 bool cpus_share_resources(int this_cpu, int that_cpu);
 
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
 #define SDTL_OVERLAP 0x01
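With the typedef updated, every mask callback now receives its owning topology level even when it does not need it. As a hedged illustration of the new callback shape (the function name "tl_my_llc_mask", the per-CPU map "my_llc_map", and the "my_topology" table are invented for this sketch; only the signature, the per-CPU pattern, and the SDTL_INIT() usage mirror the patch):

/*
 * Illustrative only: a hypothetical architecture-specific level under
 * the changed sched_domain_mask_f typedef.
 */
static DEFINE_PER_CPU(cpumask_var_t, my_llc_map);

static const struct cpumask *tl_my_llc_mask(struct sched_domain_topology_level *tl,
					    int cpu)
{
	/*
	 * Non-NUMA levels typically ignore @tl, just like tl_smt_mask()
	 * and friends above; only sd_numa_mask() dereferences it, for
	 * tl->numa_level.
	 */
	return per_cpu(my_llc_map, cpu);
}

static struct sched_domain_topology_level my_topology[] = {
	SDTL_INIT(tl_my_llc_mask, cpu_core_flags, MC),
	SDTL_INIT(tl_pkg_mask, NULL, PKG),
	{ NULL, },
};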

include/linux/topology.h

Lines changed: 1 addition & 1 deletion
@@ -240,7 +240,7 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
 }
 #endif
 
-static inline const struct cpumask *cpu_cpu_mask(int cpu)
+static inline const struct cpumask *cpu_node_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
 }

kernel/sched/topology.c

Lines changed: 10 additions & 18 deletions
@@ -1604,7 +1604,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 enum numa_topology_type sched_numa_topology_type;
 
 static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
 
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
@@ -1645,14 +1644,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
 
-#ifdef CONFIG_NUMA
-	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
-	 */
-	sched_domains_curr_level = tl->numa_level;
-#endif
-
-	sd_weight = cpumask_weight(tl->mask(cpu));
+	sd_weight = cpumask_weight(tl->mask(tl, cpu));
 
 	if (tl->sd_flags)
 		sd_flags = (*tl->sd_flags)();
@@ -1690,7 +1682,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	};
 
 	sd_span = sched_domain_span(sd);
-	cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+	cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
 	sd_id = cpumask_first(sd_span);
 
 	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1750,17 +1742,17 @@
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
 #endif
 
 #ifdef CONFIG_SCHED_CLUSTER
-	SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
+	SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
 #endif
 
 #ifdef CONFIG_SCHED_MC
-	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+	SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
 #endif
-	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+	SDTL_INIT(tl_pkg_mask, NULL, PKG),
 	{ NULL, },
 };
@@ -1782,9 +1774,9 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
 
 #ifdef CONFIG_NUMA
 
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
 {
-	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+	return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
 }
 
 static void sched_numa_warn(const char *str)
@@ -2423,15 +2415,15 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	 * breaks the linking done for an earlier span.
 	 */
 	for_each_cpu(cpu, cpu_map) {
-		const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+		const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
 		int id;
 
 		/* lowest bit set in this mask is used as a unique id */
 		id = cpumask_first(tl_cpu_mask);
 
 		if (cpumask_test_cpu(id, id_seen)) {
 			/* First CPU has already been seen, ensure identical spans */
-			if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+			if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
 				return false;
 		} else {
 			/* First CPU hasn't been seen before, ensure it's a completely new span */
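For reference, the invariant topology_span_sane() enforces is that, within one topology level, the spans of any two CPUs are either identical or disjoint, with the lowest-numbered CPU of a span acting as its id. Below is a simplified userspace model of that check (plain bitmasks stand in for cpumasks, and the id_seen bookkeeping is collapsed into a direct comparison); it is an illustration, not the kernel algorithm verbatim:

#include <stdbool.h>
#include <stdio.h>

/*
 * masks[cpu] plays the role of tl->mask(tl, cpu). All set bits are
 * assumed to be < nr_cpus, so masks[id] is always a valid index.
 */
static bool span_sane(const unsigned long *masks, int nr_cpus)
{
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		/* lowest set bit in this mask is used as a unique id */
		int id = __builtin_ctzl(masks[cpu]);

		/* every CPU claiming this id must report the identical span */
		if (masks[id] != masks[cpu])
			return false;
	}
	return true;
}

int main(void)
{
	unsigned long ok[]  = { 0x3, 0x3 };	/* one shared span: sane */
	unsigned long bad[] = { 0x3, 0x7 };	/* partial overlap: trips */

	printf("ok:  %d\n", span_sane(ok, 2));	/* prints 1 */
	printf("bad: %d\n", span_sane(bad, 2));	/* prints 0 */
	return 0;
}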
