Skip to content

Commit 2c11471

Browse files
committed
sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic
JIRA: https://issues.redhat.com/browse/RHEL-111810

commit cb44400
Author: David Dai <david.dai@linux.dev>
Date: Tue Jun 24 15:49:06 2025 -0700

sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic

For systems that use a sched_ext scheduler and have panic_on_rcu_stall enabled, try kicking out the current scheduler before issuing a panic. While there are numerous reasons for RCU CPU stalls that are not directly attributable to the scheduler, deferring the panic gives sched_ext an opportunity to provide additional debug info when ejecting the current scheduler. Also, handling the event more gracefully allows us to potentially recover the system instead of incurring additional downtime.

Suggested-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: David Dai <david.dai@linux.dev>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Phil Auld <pauld@redhat.com>
1 parent 253f738 commit 2c11471

File tree

3 files changed

+44
-0
lines changed

3 files changed

+44
-0
lines changed

include/linux/sched/ext.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,12 +206,14 @@ struct sched_ext_entity {
206206
void sched_ext_free(struct task_struct *p);
207207
void print_scx_info(const char *log_lvl, struct task_struct *p);
208208
void scx_softlockup(u32 dur_s);
209+
bool scx_rcu_cpu_stall(void);
209210

210211
#else /* !CONFIG_SCHED_CLASS_EXT */
211212

212213
static inline void sched_ext_free(struct task_struct *p) {}
213214
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
214215
static inline void scx_softlockup(u32 dur_s) {}
216+
static inline bool scx_rcu_cpu_stall(void) { return false; }
215217

216218
#endif /* CONFIG_SCHED_CLASS_EXT */
217219

kernel/rcu/tree_stall.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,13 @@ static void panic_on_rcu_stall(void)
142142
{
143143
static int cpu_stall;
144144

145+
/*
146+
* Attempt to kick out the BPF scheduler if it's installed and defer
147+
* the panic to give the system a chance to recover.
148+
*/
149+
if (scx_rcu_cpu_stall())
150+
return;
151+
145152
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
146153
return;
147154

kernel/sched/ext.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4614,6 +4614,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
46144614
p->sched_class != &ext_sched_class;
46154615
}
46164616

4617+
/**
4618+
* scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
4619+
*
4620+
* While there are various reasons why RCU CPU stalls can occur on a system
4621+
* that may not be caused by the current BPF scheduler, try kicking out the
4622+
* current scheduler in an attempt to recover the system to a good state before
4623+
* issuing panics.
4624+
*/
4625+
bool scx_rcu_cpu_stall(void)
4626+
{
4627+
struct scx_sched *sch;
4628+
4629+
rcu_read_lock();
4630+
4631+
sch = rcu_dereference(scx_root);
4632+
if (unlikely(!sch)) {
4633+
rcu_read_unlock();
4634+
return false;
4635+
}
4636+
4637+
switch (scx_enable_state()) {
4638+
case SCX_ENABLING:
4639+
case SCX_ENABLED:
4640+
break;
4641+
default:
4642+
rcu_read_unlock();
4643+
return false;
4644+
}
4645+
4646+
scx_error(sch, "RCU CPU stall detected!");
4647+
rcu_read_unlock();
4648+
4649+
return true;
4650+
}
4651+
46174652
/**
46184653
* scx_softlockup - sched_ext softlockup handler
46194654
* @dur_s: number of seconds of CPU stuck due to soft lockup

0 commit comments

Comments
 (0)