sched-ext/scx

scx_layered: Add topology related bpf_cpumasks to layer_cpumask_wrapper

hodgesds opened this issue · 0 comments

Add struct bpf_cpumasks for topologies on struct layer_cpumask_wrapper. The change should be rather straightforward:

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 72a7005..43de306 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -275,22 +275,29 @@ static void adj_load(u32 layer_idx, s64 adj, u64 now)
 	bpf_spin_lock(&lockw->lock);
 	layer->load += adj;
 	ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE);
 	bpf_spin_unlock(&lockw->lock);
 
 	if (debug && adj < 0 && (s64)layer->load < 0)
 		scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)",
 			      bpf_get_smp_processor_id(), layer_idx, layer->load, adj);
 }
 
+struct layered_topo_cpumasks {
+         struct bpf_cpumask __kptr *cpumask;
+};
+
+
 struct layer_cpumask_wrapper {
 	struct bpf_cpumask __kptr *cpumask;
+	struct layered_topo_cpumasks llc_cpumasks[MAX_DOMS];
+	struct layered_topo_cpumasks node_cpumasks[MAX_NUMA_NODES];
 };
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
 	__type(value, struct layer_cpumask_wrapper);
 	__uint(max_entries, MAX_LAYERS);
 	__uint(map_flags, 0);
 } layer_cpumasks SEC(".maps");
 
@@ -302,45 +309,58 @@ static struct cpumask *lookup_layer_cpumask(int idx)
 		return (struct cpumask *)cpumaskw->cpumask;
 	} else {
 		scx_bpf_error("no layer_cpumask");
 		return NULL;
 	}
 }
 
 static void refresh_cpumasks(int idx)
 {
 	struct layer_cpumask_wrapper *cpumaskw;
+	struct cpu_ctx *cctx;
 	struct layer *layer;
-	int cpu, total = 0;
+	int cpu, node_idx, total = 0;
 
 	if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0))
 		return;
 
 	cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx);
 
 	bpf_for(cpu, 0, nr_possible_cpus) {
 		u8 *u8_ptr;
 
+		if (!(cctx = lookup_cpu_ctx(cpu))) {
+			scx_bpf_error("can't happen");
+			return;
+		}
+
+		node_idx = cctx->node_idx;
+		if (node_idx > nr_nodes) {
+			scx_bpf_error("can't happen");
+			return;
+		}
+
 		if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) {
 			/*
 			 * XXX - The following test should be outside the loop
 			 * but that makes the verifier think that
 			 * cpumaskw->cpumask might be NULL in the loop.
 			 */
 			barrier_var(cpumaskw);
 			if (!cpumaskw || !cpumaskw->cpumask) {
 				scx_bpf_error("can't happen");
 				return;
 			}
 
 			if (*u8_ptr & (1 << (cpu % 8))) {
 				bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask);
+				bpf_cpumask_set_cpu(cpu, cpumaskw->node_cpumasks[node_idx].cpumask);
 				total++;
 			} else {
 				bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask);
 			}
 		} else {
 			scx_bpf_error("can't happen");
 		}
 	}
 
 	// XXX - shouldn't be necessary

However, this fails the verifier when trying to use it:

108: (79) r2 = *(u64 *)(r6 +0)        ; R2_w=rcu_ptr_or_null_bpf_cpumask(id=21) R6=map_value(map=layer_cpumasks,ks=4,vs=1032) refs=1,8
109: (55) if r2 != 0x0 goto pc+7 117: R0=map_value(map=cpu_ctxs,ks=4,vs=3632) R1_w=map_value(map=bpf_bpf.bss,ks=4,vs=23411976,smin=smin32=0,smax=umax=smax32=umax32=0x1653cff,var_off=(0x0; 0x1ffffff)) R2_w=rcu_ptr_bpf_cpumask() R6=map_value(map=layer_cpumasks,ks=4,vs=1032) R7=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) R8=0 R9=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3)) R10=fp0 fp-8=mmmm???? fp-16=mmmmmmmm fp-24=mmmm???? fp-32=iter_num(ref_id=8,state=active,depth=2) fp-40=iter_num(ref_id=1,state=active,depth=2) refs=1,8
; if (*u8_ptr & (1 << (cpu % 8))) { @ main.bpf.c:354
117: (71) r1 = *(u8 *)(r1 +0)         ; R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) refs=1,8
118: (bc) w3 = w7                     ; R3_w=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) R7=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) refs=1,8
119: (54) w3 &= 7                     ; R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=7,var_off=(0x0; 0x7)) refs=1,8
120: (7c) w1 >>= w3                   ; R1_w=scalar() R3_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=7,var_off=(0x0; 0x7)) refs=1,8
121: (54) w1 &= 1                     ; R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1)) refs=1,8
122: (16) if w1 == 0x0 goto pc+21     ; R1_w=1 refs=1,8
; bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask); @ main.bpf.c:355
123: (bc) w1 = w7                     ; R1_w=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) R7=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) refs=1,8
124: (85) call bpf_cpumask_set_cpu#99616      ; refs=1,8
; bpf_cpumask_set_cpu(cpu, cpumaskw->node_cpumasks[node_idx].cpumask); @ main.bpf.c:356
125: (67) r9 <<= 32                   ; R9_w=scalar(smin=smin32=0,smax=umax=0x200000000,smax32=umax32=0,var_off=(0x0; 0x300000000)) refs=1,8
126: (c7) r9 s>>= 32                  ; R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3)) refs=1,8
127: (67) r9 <<= 3                    ; R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=16,var_off=(0x0; 0x18)) refs=1,8
128: (bf) r1 = r6                     ; R1_w=map_value(map=layer_cpumasks,ks=4,vs=1032) R6=map_value(map=layer_cpumasks,ks=4,vs=1032) refs=1,8
129: (0f) r1 += r9                    ; R1_w=map_value(map=layer_cpumasks,ks=4,vs=1032,smin=smin32=0,smax=umax=smax32=umax32=16,var_off=(0x0; 0x18)) R9_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=16,var_off=(0x0; 0x18)) refs=1,8
130: (79) r2 = *(u64 *)(r1 +520)      ; R1_w=map_value(map=layer_cpumasks,ks=4,vs=1032,smin=smin32=0,smax=umax=smax32=umax32=16,var_off=(0x0; 0x18)) R2_w=scalar() refs=1,8
131: (bc) w1 = w7                     ; R1_w=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) R7=scalar(id=19,smin=smin32=0,smax=umax=smax32=umax32=79,var_off=(0x0; 0x7f)) refs=1,8
132: (85) call bpf_cpumask_set_cpu#99616
R2 type=scalar expected=fp
processed 455 insns (limit 1000000) max_states_per_insn 2 total_states 40 peak_states 40 mark_read 14
-- END PROG LOAD LOG --

23:00:07 [WARN] libbpf: prog 'sched_tick_fentry': failed to load: -13

23:00:07 [WARN] libbpf: failed to load object 'bpf_bpf'

23:00:07 [WARN] libbpf: failed to load BPF skeleton 'bpf_bpf': -13

Error: Failed to load BPF program

Caused by:
    Permission denied (os error 13)

In the bpf selftests there is a test for bpf_cpumask kptrs in maps and for nested bpf cpumasks. However, there are no tests that cover nested kptrs in a map. This may not be possible to verify without verifier changes.

If we can get these changes working the in the idle CPU selection path we can possibly remove some of the temporary bpf_cpumasks that create extra overhead in the idle CPU selection path.