diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-10-20 11:30:56 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-10-20 11:30:56 -0700 |
commit | 2b4d25010d0f2e359ff34e06c120e0cee3848fc7 (patch) | |
tree | 08cecb09cce8f8e3f03c2044a65fffeec02c341b /kernel | |
parent | a5ee44c8297803efd36cd0c773d72687afdd7500 (diff) | |
parent | 5ec36fe24bd2d529ba415b9eaed44a689ab543ed (diff) |
Merge tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduling fixes from Borislav Petkov:
- Add PREEMPT_RT maintainers
- Fix another aspect of delayed dequeued tasks wrt determining their
state, i.e., whether they're runnable or blocked
- Handle delayed dequeued tasks and their migration wrt PSI properly
- Fix the situation where a delayed dequeue task gets enqueued into a
new class, which should not happen
- Fix a case where memory allocation would happen while the runqueue
lock is held, which is a no-no
- Do not over-schedule when tasks with shorter slices preempt the
currently running task
- Make sure delayed to deque entities are properly handled before
unthrottling
- Other smaller cleanups and improvements
* tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
MAINTAINERS: Add an entry for PREEMPT_RT.
sched/fair: Fix external p->on_rq users
sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
sched/core: Dequeue PSI signals for blocked tasks that are delayed
sched: Fix delayed_dequeue vs switched_from_fair()
sched/core: Disable page allocation in task_tick_mm_cid()
sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl()
sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running
sched: Fix sched_delayed vs cfs_bandwidth
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c | 2 | ||||
-rw-r--r-- | kernel/freezer.c | 7 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 9 | ||||
-rw-r--r-- | kernel/sched/core.c | 61 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/ext.c | 4 | ||||
-rw-r--r-- | kernel/sched/fair.c | 27 | ||||
-rw-r--r-- | kernel/sched/sched.h | 2 | ||||
-rw-r--r-- | kernel/sched/stats.h | 48 | ||||
-rw-r--r-- | kernel/sched/syscalls.c | 13 | ||||
-rw-r--r-- | kernel/task_work.c | 15 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 2 |
13 files changed, 128 insertions, 70 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index e3589c4287cb..cdd09769e6c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 44bbd7dbd2c8..8d530d0949ff 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); - if (p->on_rq) + /* + * Allow freezing the sched_delayed tasks; they will not execute until + * ttwu() fixes them up, so it is safe to swap their state now, instead + * of waiting for them to get fully dequeued. + */ + if (task_is_runnable(p)) return 0; if (p != current && task_curr(p)) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6333f4ccf024..4d7ee95df06e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -986,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t) return false; /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + + /* * Idle tasks (or idle injection) within the idle loop are RCU-tasks * quiescent states. But CPU boot code performed by the idle task * isn't a quiescent state. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aeb595514461..dbfb5717d6af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -2012,11 +2017,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2024,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + if (!(flags & ENQUEUE_RESTORE)) { + sched_info_enqueue(rq, p); + psi_enqueue(p, flags & ENQUEUE_MIGRATED); + } + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); + psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); } /* @@ -4323,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns @@ -6544,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6629,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode) * After this, schedule() must not care about p->state any more. */ block_task(rq, prev, flags); + block = true; } switch_count = &prev->nvcsw; } @@ -6674,7 +6682,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); @@ -7017,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - p->sched_class = &ext_sched_class; + if (task_should_scx(p)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7076,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7134,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7171,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) @@ -10465,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..be1b917dc8ce 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6eae3b69bf6e..5900b06fd036 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - __setscheduler_prio(p, p->prio); + p->sched_class = __setscheduler_class(p, p->prio); check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); p->scx.slice = SCX_SLICE_DFL; - __setscheduler_prio(p, p->prio); + p->sched_class = __setscheduler_class(p, p->prio); check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..c157d4860a3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -13174,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. - */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6085ef50febf..081519ffab46 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3800,7 +3800,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..767e098a3bd1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, bool migrate) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; - - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!migrate); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (migrate) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, bool migrate) { if (static_branch_likely(&psi_disabled)) return; /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + if (migrate) + psi_task_change(p, p->psi_flags, 0); + + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) - return; - - psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..0470bcc3d204 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,6 +706,12 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(p, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -713,11 +719,10 @@ change: if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5d14d639ac71..c969f1f26be5 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,15 +55,26 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; + int flags = notify & TWA_FLAGS; + notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + /* + * Record the work call stack in order to print it in KASAN + * reports. + * + * Note that stack allocation can fail if TWAF_NO_ALLOC flag + * is set and new page is needed to expand the stack buffer. + */ + if (flags & TWAF_NO_ALLOC) + kasan_record_aux_stack_noalloc(work); + else + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753a184c7090..f203f000da1a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk) * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask + * + * XXX given a task picks up the dependency on schedule(), should we + * only care about tasks that are currently on the CPU instead of all + * that are on the runqueue? + * + * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..1469dd8075fa 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1485,7 +1485,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, |