summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-10-20 11:30:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-10-20 11:30:56 -0700
commit2b4d25010d0f2e359ff34e06c120e0cee3848fc7 (patch)
tree08cecb09cce8f8e3f03c2044a65fffeec02c341b /kernel
parenta5ee44c8297803efd36cd0c773d72687afdd7500 (diff)
parent5ec36fe24bd2d529ba415b9eaed44a689ab543ed (diff)
Merge tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduling fixes from Borislav Petkov: - Add PREEMPT_RT maintainers - Fix another aspect of delayed dequeued tasks wrt determining their state, i.e., whether they're runnable or blocked - Handle delayed dequeued tasks and their migration wrt PSI properly - Fix the situation where a delayed dequeue task gets enqueued into a new class, which should not happen - Fix a case where memory allocation would happen while the runqueue lock is held, which is a no-no - Do not over-schedule when tasks with shorter slices preempt the currently running task - Make sure delayed to deque entities are properly handled before unthrottling - Other smaller cleanups and improvements * tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: MAINTAINERS: Add an entry for PREEMPT_RT. sched/fair: Fix external p->on_rq users sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug sched/core: Dequeue PSI signals for blocked tasks that are delayed sched: Fix delayed_dequeue vs switched_from_fair() sched/core: Disable page allocation in task_tick_mm_cid() sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl() sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running sched: Fix sched_delayed vs cfs_bandwidth
Diffstat (limited to 'kernel')
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/rcu/tasks.h9
-rw-r--r--kernel/sched/core.c61
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/ext.c4
-rw-r--r--kernel/sched/fair.c27
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/sched/stats.h48
-rw-r--r--kernel/sched/syscalls.c13
-rw-r--r--kernel/task_work.c15
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/trace/trace_selftest.c2
13 files changed, 128 insertions, 70 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4287cb..cdd09769e6c5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- if (!sched_in && task->on_rq) {
+ if (!sched_in && task_is_runnable(task)) {
switch_event.event_id.header.misc |=
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 44bbd7dbd2c8..8d530d0949ff 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
{
unsigned int state = READ_ONCE(p->__state);
- if (p->on_rq)
+ /*
+ * Allow freezing the sched_delayed tasks; they will not execute until
+ * ttwu() fixes them up, so it is safe to swap their state now, instead
+ * of waiting for them to get fully dequeued.
+ */
+ if (task_is_runnable(p))
return 0;
if (p != current && task_curr(p))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 6333f4ccf024..4d7ee95df06e 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
return false;
/*
+ * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+ * since it is a spurious state (it will transition into the
+ * traditional blocked state or get woken up without outside
+ * dependencies), not considering it such should only affect timing.
+ *
+ * Be conservative for now and not include it.
+ */
+
+ /*
* Idle tasks (or idle injection) within the idle loop are RCU-tasks
* quiescent states. But CPU boot code performed by the idle task
* isn't a quiescent state.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aeb595514461..dbfb5717d6af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
+ * Additionally it is possible to be ->on_rq but still be considered not
+ * runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ * but will be dequeued as soon as they get picked again. See the
+ * task_is_runnable() helper.
+ *
* p->on_cpu <- { 0, 1 }:
*
* is set by prepare_task() and cleared by finish_task() such that it will be
@@ -2012,11 +2017,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_enqueue(rq, p);
- psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
- }
-
p->sched_class->enqueue_task(rq, p, flags);
/*
* Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2024,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p);
+ if (!(flags & ENQUEUE_RESTORE)) {
+ sched_info_enqueue(rq, p);
+ psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+ }
+
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
@@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeue(rq, p);
- psi_dequeue(p, flags & DEQUEUE_SLEEP);
+ psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
}
/*
@@ -4323,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
* @arg: Argument to function.
*
* Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
- * to work out what the state is, if required. Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it. This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required. Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
*
* Returns:
* Whatever @func returns
@@ -6544,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
+ bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6629,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode)
* After this, schedule() must not care about p->state any more.
*/
block_task(rq, prev, flags);
+ block = true;
}
switch_count = &prev->nvcsw;
}
@@ -6674,7 +6682,7 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+ psi_sched_switch(prev, next, block);
trace_sched_switch(preempt, prev, next, prev_state);
@@ -7017,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
-void __setscheduler_prio(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
{
if (dl_prio(prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(prio))
- p->sched_class = &rt_sched_class;
+ return &dl_sched_class;
+
+ if (rt_prio(prio))
+ return &rt_sched_class;
+
#ifdef CONFIG_SCHED_CLASS_EXT
- else if (task_should_scx(p))
- p->sched_class = &ext_sched_class;
+ if (task_should_scx(p))
+ return &ext_sched_class;
#endif
- else
- p->sched_class = &fair_sched_class;
- p->prio = prio;
+ return &fair_sched_class;
}
#ifdef CONFIG_RT_MUTEXES
@@ -7076,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
struct rq *rq;
@@ -7134,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
+ next_class = __setscheduler_class(p, prio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -7171,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->rt.timeout = 0;
}
- __setscheduler_prio(p, prio);
+ p->sched_class = next_class;
+ p->prio = prio;
+
check_class_changing(rq, p, prev_class);
if (queued)
@@ -10465,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
return;
- task_work_add(curr, work, TWA_RESUME);
+
+ /* No page allocation under rq lock */
+ task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9ce93d0bf452..be1b917dc8ce 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
deadline_queue_push_tasks(rq);
- if (hrtick_enabled(rq))
+ if (hrtick_enabled_dl(rq))
start_hrtick_dl(rq, &p->dl);
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6eae3b69bf6e..5900b06fd036 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- __setscheduler_prio(p, p->prio);
+ p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
@@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
p->scx.slice = SCX_SLICE_DFL;
- __setscheduler_prio(p, p->prio);
+ p->sched_class = __setscheduler_class(p, p->prio);
check_class_changing(task_rq(p), p, old_class);
sched_enq_and_set_task(&ctx);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 225b31aaee55..c157d4860a3b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
- if (rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq) {
- SCHED_WARN_ON(se->sched_delayed);
+ /* Handle any unfinished DELAY_DEQUEUE business first. */
+ if (se->sched_delayed) {
+ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
+
+ dequeue_entity(qcfs_rq, se, flags);
+ } else if (se->on_rq)
break;
- }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -13174,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
- /*
- * Since this is called after changing class, this is a little weird
- * and we cannot use DEQUEUE_DELAYED.
- */
- if (p->se.sched_delayed) {
- /* First, dequeue it from its new class' structures */
- dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
- /*
- * Now, clean up the fair_sched_class side of things
- * related to sched_delayed being true and that wasn't done
- * due to the generic dequeue not using DEQUEUE_DELAYED.
- */
- finish_delayed_dequeue_entity(&p->se);
- p->se.rel_deadline = 0;
- __block_task(rq, p);
- }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6085ef50febf..081519ffab46 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3800,7 +3800,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern void __setscheduler_prio(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..767e098a3bd1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
{
- int clear = 0, set = TSK_RUNNING;
+ int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
- if (p->in_memstall)
- set |= TSK_MEMSTALL_RUNNING;
-
- if (!wakeup) {
+ if (p->se.sched_delayed) {
+ /* CPU migration of "sleeping" task */
+ SCHED_WARN_ON(!migrate);
if (p->in_memstall)
set |= TSK_MEMSTALL;
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ } else if (migrate) {
+ /* CPU migration of runnable task */
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
} else {
+ /* Wakeup of new or sleeping task */
if (p->in_iowait)
clear |= TSK_IOWAIT;
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
}
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
{
if (static_branch_likely(&psi_disabled))
return;
/*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ if (migrate)
+ psi_task_change(p, p->psi_flags, 0);
+
+ /*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
- if (sleep)
- return;
-
- psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
}
#else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index aa70beee9895..0470bcc3d204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p,
{
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
@@ -706,6 +706,12 @@ change:
queue_flags &= ~DEQUEUE_MOVE;
}
+ prev_class = p->sched_class;
+ next_class = __setscheduler_class(p, newprio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -713,11 +719,10 @@ change:
if (running)
put_prev_task(rq, p);
- prev_class = p->sched_class;
-
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
__setscheduler_params(p, attr);
- __setscheduler_prio(p, newprio);
+ p->sched_class = next_class;
+ p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
check_class_changing(rq, p, prev_class);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5d14d639ac71..c969f1f26be5 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
enum task_work_notify_mode notify)
{
struct callback_head *head;
+ int flags = notify & TWA_FLAGS;
+ notify &= ~TWA_FLAGS;
if (notify == TWA_NMI_CURRENT) {
if (WARN_ON_ONCE(task != current))
return -EINVAL;
if (!IS_ENABLED(CONFIG_IRQ_WORK))
return -EINVAL;
} else {
- /* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack(work);
+ /*
+ * Record the work call stack in order to print it in KASAN
+ * reports.
+ *
+ * Note that stack allocation can fail if TWAF_NO_ALLOC flag
+ * is set and new page is needed to expand the stack buffer.
+ */
+ if (flags & TWAF_NO_ALLOC)
+ kasan_record_aux_stack_noalloc(work);
+ else
+ kasan_record_aux_stack(work);
}
head = READ_ONCE(task->task_works);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..f203f000da1a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
* smp_mb__after_spin_lock()
* tick_nohz_task_switch()
* LOAD p->tick_dep_mask
+ *
+ * XXX given a task picks up the dependency on schedule(), should we
+ * only care about tasks that are currently on the CPU instead of all
+ * that are on the runqueue?
+ *
+ * That is, does this want to be: task_on_cpu() / task_curr()?
*/
if (!sched_task_on_rq(tsk))
return;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c4ad7cd7e778..1469dd8075fa 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* reset the max latency */
tr->max_latency = 0;
- while (p->on_rq) {
+ while (task_is_runnable(p)) {
/*
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,