From 94b548a15e8ec47dfbf6925bdfb64bb5657dce0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 20:52:55 +0200 Subject: sched: Simplify set_user_nice() Use guards to reduce gotos and simplify control flow. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++------- kernel/sched/sched.h | 5 +++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..fa57a560c52a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7187,9 +7187,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio; - struct rq_flags rf; struct rq *rq; + int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; @@ -7197,7 +7196,9 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. 
*/ - rq = task_rq_lock(p, &rf); + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + update_rq_clock(rq); /* @@ -7208,8 +7209,9 @@ void set_user_nice(struct task_struct *p, long nice) */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + return; } + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7232,9 +7234,6 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..68768f47ccb7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1658,6 +1658,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, + _T->rq = task_rq_lock(_T->lock, &_T->rf), + task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) -- cgit v1.2.3 From febe162d4d9158cf2b5d48fdd440db7bb55dd622 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 16:54:54 +0200 Subject: sched: Simplify syscalls Use guards to reduce gotos and simplify control flow. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 154 +++++++++++++++++++++++----------------------------- 1 file changed, 68 insertions(+), 86 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa57a560c52a..67c32c43a94b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7506,6 +7506,21 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? 
find_task_by_vpid(pid) : current; } +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -7543,14 +7558,11 @@ static void __setscheduler_params(struct task_struct *p, static bool check_same_owner(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred; - bool match; + guard(rcu)(); - rcu_read_lock(); pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); } /* @@ -7962,27 +7974,17 @@ static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - struct task_struct *p; - int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - return retval; + return sched_setscheduler(p, policy, &lparam); } /* @@ -8078,7 +8080,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) { struct sched_attr attr; - struct task_struct *p; int retval; if (!uattr || pid < 0 || flags) @@ -8093,21 +8094,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) attr.sched_policy = SETPARAM_POLICY; - rcu_read_lock(); - retval = 
-ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - if (likely(p)) { - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - retval = sched_setattr(p, &attr); - put_task_struct(p); - } + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); - return retval; + return sched_setattr(p, &attr); } /** @@ -8125,16 +8119,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); + guard(rcu)(); p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; } - rcu_read_unlock(); return retval; } @@ -8155,30 +8150,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } /* * This one might sleep, we cannot do it with a spinlock held ... */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; + return copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; } /* @@ -8238,39 +8226,33 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif - - rcu_read_unlock(); + } return sched_attr_copy_to_user(uattr, &kattr, usize); - -out_unlock: - rcu_read_unlock(); - return retval; } #ifdef CONFIG_SMP -- cgit v1.2.3 From 92c2ec5bc1081e6bbbe172bcfb1a566ad7b4f809 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 16:57:35 +0200 Subject: sched: Simplify sched_{set,get}affinity() Use guards to reduce gotos and simplify control flow. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67c32c43a94b..1d5cbb305057 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8347,39 +8347,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { struct affinity_context ac; struct cpumask *user_mask; - struct task_struct *p; int retval; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - retval = -EPERM; - goto out_put_task; - } - rcu_read_unlock(); + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; } retval = security_task_setscheduler(p); if (retval) - goto out_put_task; + return retval; /* * With non-SMP configs, user_cpus_ptr/user_mask isn't used and @@ -8389,8 +8374,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (user_mask) { cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { - retval = -ENOMEM; - goto out_put_task; + return -ENOMEM; } ac = (struct affinity_context){ @@ -8402,8 +8386,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); -out_put_task: - put_task_struct(p); return retval; } @@ -8445,28 +8427,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned long flags; int 
retval; - rcu_read_lock(); - - retval = -ESRCH; + guard(rcu)(); p = find_process_by_pid(pid); if (!p) - goto out_unlock; + return -ESRCH; retval = security_task_getscheduler(p); if (retval) - goto out_unlock; + return retval; - raw_spin_lock_irqsave(&p->pi_lock, flags); + guard(raw_spinlock_irqsave)(&p->pi_lock); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -out_unlock: - rcu_read_unlock(); - - return retval; + return 0; } /** -- cgit v1.2.3 From 7a50f76674f8b6f4f30a1cec954179f10e20110c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 16:58:23 +0200 Subject: sched: Simplify yield_to() Use guards to reduce gotos and simplify control flow. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 67 +++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d5cbb305057..6c8c40a54560 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8888,55 +8888,46 @@ int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; - unsigned long flags; int yielded = 0; - local_irq_save(flags); - rq = this_rq(); + scoped_guard (irqsave) { + rq = this_rq(); again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. 
+ */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; - if (!curr->sched_class->yield_to_task) - goto out_unlock; + if (!curr->sched_class->yield_to_task) + return 0; - if (curr->sched_class != p->sched_class) - goto out_unlock; + if (curr->sched_class != p->sched_class) + return 0; - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - goto out_unlock; + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } } -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); - - if (yielded > 0) + if (yielded) schedule(); return yielded; -- cgit v1.2.3 From af7c5763f5e8bc1b3f827354a283ccaf6a8c8098 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 16:59:05 +0200 Subject: sched: Simplify sched_rr_get_interval() Use guards to reduce gotos and simplify control flow. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c8c40a54560..d298176367f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9030,38 +9030,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; + unsigned int time_slice = 0; int retval; if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } - rcu_read_unlock(); jiffies_to_timespec64(time_slice, t); return 0; - -out_unlock: - rcu_read_unlock(); - return retval; } /** -- cgit v1.2.3 From fa614b4feb5a246474ac71b45e520a8ddefc809c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 20:41:09 +0200 Subject: sched: Simplify sched_move_task() Use guards to reduce gotos and simplify control flow. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d298176367f7..a3f4fb8a6841 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10437,17 +10437,18 @@ void sched_move_task(struct task_struct *tsk) int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + /* * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous * group changes. */ group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) - goto unlock; + return; update_rq_clock(rq); @@ -10472,9 +10473,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - -unlock: - task_rq_unlock(rq, tsk, &rf); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 6fb45460615358157a6d3c990e74f9c1395247e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 20:45:16 +0200 Subject: sched: Simplify tg_set_cfs_bandwidth() Use guards to reduce gotos and simplify control flow. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 2 ++ kernel/sched/core.c | 38 +++++++++++++++++++------------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 0abd60a7987b..f19f56501809 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; } static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } #endif /* !CONFIG_HOTPLUG_CPU */ +DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) + #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a3f4fb8a6841..5d9f36359461 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10802,11 +10802,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). 
*/ - cpus_read_lock(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -10816,39 +10817,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - cfs_b->burst = burst; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - cpus_read_unlock(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) -- cgit v1.2.3 From 0e34600ac9317dbe5f0a7bfaa3d7187d757572ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 20:52:49 +0200 Subject: sched: Misc cleanups Random remaining guard use... 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 167 ++++++++++++++++++++-------------------------------- 1 file changed, 63 insertions(+), 104 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d9f36359461..76662d809183 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1480,16 +1480,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p) static void uclamp_update_util_min_rt_default(struct task_struct *p) { - struct rq_flags rf; - struct rq *rq; - if (!rt_task(p)) return; /* Protect updates to p->uclamp_* */ - rq = task_rq_lock(p, &rf); + guard(task_rq_lock)(p); __uclamp_update_util_min_rt_default(p); - task_rq_unlock(rq, p, &rf); } static inline struct uclamp_se @@ -1785,9 +1781,8 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } @@ -1814,10 +1809,9 @@ static void uclamp_sync_util_min_rt_default(void) smp_mb__after_spinlock(); read_unlock(&tasklist_lock); - rcu_read_lock(); + guard(rcu)(); for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); } static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, @@ -2250,20 +2244,13 @@ static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { #ifdef CONFIG_PREEMPT_RT - int match; - /* * Serialize against current_save_and_set_rtlock_wait_state() and * current_restore_rtlock_saved_state(). 
*/ - raw_spin_lock_irq(&p->pi_lock); - match = __task_state_match(p, state); - raw_spin_unlock_irq(&p->pi_lock); - - return match; -#else - return __task_state_match(p, state); + guard(raw_spinlock_irq)(&p->pi_lock); #endif + return __task_state_match(p, state); } /* @@ -2417,10 +2404,9 @@ void migrate_disable(void) return; } - preempt_disable(); + guard(preempt)(); this_rq()->nr_pinned++; p->migration_disabled = 1; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2444,7 +2430,7 @@ void migrate_enable(void) * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ - preempt_disable(); + guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); /* @@ -2455,7 +2441,6 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -3516,13 +3501,11 @@ out: */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -6368,8 +6351,9 @@ static void sched_core_balance(struct rq *rq) struct sched_domain *sd; int cpu = cpu_of(rq); - preempt_disable(); - rcu_read_lock(); + guard(preempt)(); + guard(rcu)(); + raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { if (need_resched()) @@ -6379,8 +6363,6 @@ static void sched_core_balance(struct rq *rq) break; } raw_spin_rq_lock_irq(rq); - rcu_read_unlock(); - preempt_enable(); } static DEFINE_PER_CPU(struct balance_callback, core_balance_head); @@ -8258,8 +8240,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { - int ret = 0; - /* * If the task isn't a deadline task or admission control is * 
disabled then we don't care about affinity changes. @@ -8273,11 +8253,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) * tasks allowed to run on all the CPUs in the task's * root_domain. */ - rcu_read_lock(); + guard(rcu)(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) - ret = -EBUSY; - rcu_read_unlock(); - return ret; + return -EBUSY; + + return 0; } #endif @@ -10509,11 +10489,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); #endif return 0; @@ -10664,8 +10642,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, static_branch_enable(&sched_uclamp_used); - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -10680,9 +10658,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -10708,10 +10683,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -11033,7 +11008,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -11045,11 
+11019,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -11654,14 +11625,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, * are not the last task to be migrated from this cpu for this mm, so * there is no need to move src_cid to the destination cpu. */ - rcu_read_lock(); + guard(rcu)(); src_task = rcu_dereference(src_rq->curr); if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); t->last_mm_cid = -1; return -1; } - rcu_read_unlock(); return src_cid; } @@ -11705,18 +11674,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, * the lazy-put flag, this task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } } - rcu_read_unlock(); /* * The src_cid is unused, so it can be unset. 
@@ -11789,7 +11757,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ { struct rq *rq = cpu_rq(cpu); struct task_struct *t; - unsigned long flags; int cid, lazy_cid; cid = READ_ONCE(pcpu_cid->cid); @@ -11824,23 +11791,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ * the lazy-put flag, that task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { - rcu_read_unlock(); - return; + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; } - rcu_read_unlock(); /* * The cid is unused, so it can be unset. * Disable interrupts to keep the window of cid ownership without rq * lock small. */ - local_irq_save(flags); - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - local_irq_restore(flags); + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } } static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) @@ -11862,14 +11827,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) * snapshot associated with this cid if an active task using the mm is * observed on this rq. 
*/ - rcu_read_lock(); - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - rcu_read_unlock(); - return; + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } } - rcu_read_unlock(); if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) return; @@ -11963,7 +11927,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -11971,7 +11934,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -11981,13 +11944,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -11995,7 +11956,7 @@ void sched_mm_cid_before_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12005,13 +11966,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12019,16 +11978,16 @@ void sched_mm_cid_after_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); - preempt_enable_no_resched(); /* holding spinlock */ - 
WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); - rq_unlock_irqrestore(rq, &rf); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + } rseq_set_notify_resume(t); } -- cgit v1.2.3 From 4de7b17fd05d03fa919e8c47fc66122bd24d7b6c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 14:44:28 +0100 Subject: sched: Assert for_each_thread() is properly locked list_for_each_entry_rcu() takes an optional fourth argument which allows RCU to assert that the correct lock is held. Several callers of for_each_thread() rely on their caller to be holding the appropriate lock, so this is a useful assertion to include. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20230821134428.2504912-1-willy@infradead.org --- include/linux/sched/signal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..9610bad018a3 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void); while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ + lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) -- cgit v1.2.3 From b1f099b1cf51d553c510c6c8141c27d9ba7ea1fe Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:33 -0700 Subject: numa: Generalize numa_map_to_online_node() The function in fact searches the nearest node for a given one, based on a N_ONLINE state. This is a common pattern to search for a nearest node. This patch converts numa_map_to_online_node() to numa_nearest_node() so that others won't need to opencode the logic. 
Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-2-yury.norov@gmail.com --- include/linux/numa.h | 7 +++++-- mm/mempolicy.c | 18 +++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/linux/numa.h b/include/linux/numa.h index 59df211d051f..fb30a42f0700 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -25,7 +25,7 @@ #include /* Generic implementation available */ -int numa_map_to_online_node(int node); +int numa_nearest_node(int node, unsigned int state); #ifndef memory_add_physaddr_to_nid static inline int memory_add_physaddr_to_nid(u64 start) @@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start) } #endif #else /* !CONFIG_NUMA */ -static inline int numa_map_to_online_node(int node) +static inline int numa_nearest_node(int node, unsigned int state) { return NUMA_NO_NODE; } + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; @@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start) } #endif +#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) + #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP extern const struct attribute_group arch_node_dev_group; #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 42b5567e3773..d4c0fff79758 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,22 +131,26 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** - * numa_map_to_online_node - Find closest online node + * numa_nearest_node - Find nearest node by state * @node: Node id to start the search + * @state: State to filter the search * - * Lookup the next closest node by distance if @nid is not online. + * Lookup the closest node by distance if @nid is not in state. 
* - * Return: this @node if it is online, otherwise the closest node by distance + * Return: this @node if it is in state, otherwise the closest node by distance */ -int numa_map_to_online_node(int node) +int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; - if (node == NUMA_NO_NODE || node_online(node)) + if (state >= NR_NODE_STATES) + return -EINVAL; + + if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; - for_each_online_node(n) { + for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; @@ -156,7 +160,7 @@ int numa_map_to_online_node(int node) return min_node; } -EXPORT_SYMBOL_GPL(numa_map_to_online_node); +EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { -- cgit v1.2.3 From d1db9fb432d50b0eecdfdd85d17cc15a59cc093b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:34 -0700 Subject: sched/fair: Fix open-coded numa_nearest_node() task_numa_placement() searches for a nearest node to migrate by calling for_each_node_state(). Now that we have numa_nearest_node(), switch to using it. 
Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-3-yury.norov@gmail.com --- kernel/sched/fair.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8dbff6e7ad4f..41cfd61b4d6b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2847,19 +2847,7 @@ static void task_numa_placement(struct task_struct *p) } /* Cannot migrate task to CPU-less node */ - if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { - int near_nid = max_nid; - int distance, near_distance = INT_MAX; - - for_each_node_state(nid, N_CPU) { - distance = node_distance(max_nid, nid); - if (distance < near_distance) { - near_nid = nid; - near_distance = distance; - } - } - max_nid = near_nid; - } + max_nid = numa_nearest_node(max_nid, N_CPU); if (ng) { numa_group_count_active_nodes(ng); -- cgit v1.2.3 From 617f2c38cb5ce60226042081c09e2ee3a90d03f8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:35 -0700 Subject: sched/topology: Fix sched_numa_find_nth_cpu() in CPU-less case When the node provided by user is CPU-less, corresponding record in sched_domains_numa_masks is not set. Trying to dereference it in the following code leads to kernel crash. To avoid it, start searching from the nearest node with CPUs. 
Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()") Reported-by: Yicong Yang Reported-by: Guenter Roeck Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Reviewed-by: Yicong Yang Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-4-yury.norov@gmail.com Closes: https://lore.kernel.org/lkml/CAAH8bW8C5humYnfpW3y5ypwx0E-09A3QxFE1JFzR66v+mO4XfA@mail.gmail.com/T/ Closes: https://lore.kernel.org/lkml/ZMHSNQfv39HN068m@yury-ThinkPad/T/#mf6431cb0b7f6f05193c41adeee444bc95bf2b1c4 --- kernel/sched/topology.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a5bc678c08..423d08947962 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2122,12 +2122,16 @@ static int hop_cmp(const void *a, const void *b) */ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; + struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids; rcu_read_lock(); + /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ + node = numa_nearest_node(node, N_CPU); + k.node = node; + k.masks = rcu_dereference(sched_domains_numa_masks); if (!k.masks) goto unlock; -- cgit v1.2.3 From 8ab63d418d4339d996f80d02a00dbce0aa3ff972 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:36 -0700 Subject: sched/topology: Fix sched_numa_find_nth_cpu() in non-NUMA case When CONFIG_NUMA is enabled, sched_numa_find_nth_cpu() searches for a CPU in sched_domains_numa_masks. The masks includes only online CPUs, so effectively offline CPUs are skipped. When CONFIG_NUMA is disabled, the fallback function should be consistent. 
Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()") Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-5-yury.norov@gmail.com --- include/linux/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index fea32377f7c7..52f5850730b3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - return cpumask_nth(cpu, cpus); + return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * -- cgit v1.2.3 From 9ecea9ae4d3127a09fb5dfcea87f248937a39ff5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:37 -0700 Subject: sched/topology: Handle NUMA_NO_NODE in sched_numa_find_nth_cpu() sched_numa_find_nth_cpu() doesn't handle NUMA_NO_NODE properly, and may crash the kernel if it is passed in. On the other hand, the only user of sched_numa_find_nth_cpu() has to check the NUMA_NO_NODE case explicitly. It would be easier for users if this logic were moved into sched_numa_find_nth_cpu(). 
Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-6-yury.norov@gmail.com --- kernel/sched/topology.c | 3 +++ lib/cpumask.c | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 423d08947962..a60ecf45360c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2126,6 +2126,9 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids; + if (node == NUMA_NO_NODE) + return cpumask_nth_and(cpu, cpus, cpu_online_mask); + rcu_read_lock(); /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ diff --git a/lib/cpumask.c b/lib/cpumask.c index a7fd02b5ae26..34335c1e7265 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - cpu = (node == NUMA_NO_NODE) ? - cpumask_nth(i, cpu_online_mask) : - sched_numa_find_nth_cpu(cpu_online_mask, i, node); + cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; -- cgit v1.2.3 From 6d08ad2166f7770341ea56afad45fa41cd16ae62 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:38 -0700 Subject: sched/topology: Fix sched_numa_find_nth_cpu() comment Reword sched_numa_find_nth_cpu() comment and make it kernel-doc compatible. 
Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-7-yury.norov@gmail.com --- kernel/sched/topology.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a60ecf45360c..a7b50bba7829 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2112,13 +2112,15 @@ static int hop_cmp(const void *a, const void *b) return -1; } -/* - * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu - * closest to @cpu from @cpumask. - * cpumask: cpumask to find a cpu from - * cpu: Nth cpu to find - * - * returns: cpu, or nr_cpu_ids when nothing found. +/** + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU + * from @cpus to @cpu, taking into account distance + * from a given @node. + * @cpus: cpumask to find a cpu from + * @cpu: CPU to start searching + * @node: NUMA node to order CPUs by distance + * + * Return: cpu, or nr_cpu_ids when nothing found. */ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { -- cgit v1.2.3 From c0490bc9bb62d9376f3dd4ec28e03ca0fef97152 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 13 Sep 2023 13:20:31 +0000 Subject: sched/fair: Fix cfs_rq_is_decayed() on !SMP We don't need to maintain per-queue leaf_cfs_rq_list on !SMP, since it's used for cfs_rq load tracking & balancing on SMP. But sched debug interface uses it to print per-cfs_rq stats. This patch fixes the !SMP version of cfs_rq_is_decayed(), so the per-queue leaf_cfs_rq_list is also maintained correctly on !SMP, to fix the warning in assert_list_leaf_cfs_rq(). 
Fixes: 0a00a354644e ("sched/fair: Delete useless condition in tg_unthrottle_up()") Reported-by: Leo Yu-Chi Liang Signed-off-by: Chengming Zhou Signed-off-by: Ingo Molnar Tested-by: Leo Yu-Chi Liang Reviewed-by: Vincent Guittot Closes: https://lore.kernel.org/all/ZN87UsqkWcFLDxea@swlinux02/ Link: https://lore.kernel.org/r/20230913132031.2242151-1-chengming.zhou@linux.dev --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41cfd61b4d6b..c893721ff5b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4866,7 +4866,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return true; + return !cfs_rq->nr_running; } #define UPDATE_TG 0x0 -- cgit v1.2.3 From 4ff34ad3d39377d9f6953f3606ccf611ce636767 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 28 Feb 2023 17:14:26 +0100 Subject: sched/core: Use do-while instead of for loop in set_nr_if_polling() Use equivalent do-while loop instead of infinite for loop. There are no asm code changes. 
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230228161426.4508-1-ubizjak@gmail.com --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 76662d809183..f39482d6a6e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) - break; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } -- cgit v1.2.3 From fbaa6a181a4b1886cbf4214abdf9a2df68471510 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Fri, 8 Sep 2023 15:49:15 -0700 Subject: sched/core: Remove ifdeffery for saved_state In preparation for freezer to also use saved_state, remove the CONFIG_PREEMPT_RT compilation guard around saved_state. On the arm64 platform I tested which did not have CONFIG_PREEMPT_RT, there was no statistically significant deviation by applying this patch. Test methodology: perf bench sched message -g 40 -l 40 Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/sched/core.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..dc37ae787e33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -750,10 +750,8 @@ struct task_struct { #endif unsigned int __state; -#ifdef CONFIG_PREEMPT_RT /* saved state for "spinlock sleepers" */ unsigned int saved_state; -#endif /* * This begins the randomizable portion of task_struct. 
Only diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f39482d6a6e6..49541e3c1295 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2232,23 +2232,20 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT /* * Serialize against current_save_and_set_rtlock_wait_state() and * current_restore_rtlock_saved_state(). */ guard(raw_spinlock_irq)(&p->pi_lock); -#endif return __task_state_match(p, state); } @@ -4038,7 +4035,6 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on * an RT lock. If the state matches, set p::saved_state to @@ -4054,7 +4050,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } -- cgit v1.2.3 From 8f0eed4a78a81668bc78923ea09f51a7a663c2b0 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Fri, 8 Sep 2023 15:49:16 -0700 Subject: freezer,sched: Use saved_state to reduce some spurious wakeups After commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), tasks that transition directly from TASK_FREEZABLE to TASK_FROZEN are always woken up on the thaw path. Prior to that commit, tasks could ask freezer to consider them "frozen enough" via freezer_do_not_count(). The commit replaced freezer_do_not_count() with a TASK_FREEZABLE state which allows freezer to immediately mark the task as TASK_FROZEN without waking up the task. 
This is efficient for the suspend path, but on the thaw path, the task is always woken up even if the task didn't need to wake up and goes back to its TASK_(UN)INTERRUPTIBLE state. Although these tasks are capable of handling the wakeup, we can observe a power/perf impact from the extra wakeup. We observed on Android many tasks wait in the TASK_FREEZABLE state (particularly due to many of them being binder clients). We observed nearly 4x the number of tasks and a corresponding linear increase in latency and power consumption when thawing the system. The latency increased from ~15ms to ~50ms. Avoid the spurious wakeups by saving the state of TASK_FREEZABLE tasks. If the task was running before entering TASK_FROZEN state (__refrigerator()) or if the task received a wake up for the saved state, then the task is woken on thaw. saved_state from PREEMPT_RT locks can be re-used because freezer would not stomp on the rtlock wait flow: TASK_RTLOCK_WAIT isn't considered freezable. Reported-by: Prakash Viswalingam Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/freezer.c | 41 +++++++++++++++++++---------------------- kernel/sched/core.c | 23 ++++++++++++++--------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index 4fad0e6fca64..c450fa8b8b5e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop) for (;;) { bool freeze; + raw_spin_lock_irq(&current->pi_lock); set_current_state(TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(&current->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); @@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif + p->saved_state = p->__state; 
WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } @@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p) } /* - * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical - * state in p->jobctl. If either of them got a wakeup that was missed because - * TASK_FROZEN, then their canonical state reflects that and the below will - * refuse to restore the special state and instead issue the wakeup. + * Restore the saved_state before the task entered freezer. For typical task + * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens + * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state + * is restored unless they got an expected wakeup (see ttwu_state_match()). + * Returns 1 if the task state was restored. */ -static int __set_task_special(struct task_struct *p, void *arg) +static int __restore_freezer_state(struct task_struct *p, void *arg) { - unsigned int state = 0; + unsigned int state = p->saved_state; - if (p->jobctl & JOBCTL_TRACED) - state = TASK_TRACED; - - else if (p->jobctl & JOBCTL_STOPPED) - state = TASK_STOPPED; - - if (state) + if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); + return 1; + } - return state; + return 0; } void __thaw_task(struct task_struct *p) { - unsigned long flags, flags2; + unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (WARN_ON_ONCE(freezing(p))) goto unlock; - if (lock_task_sighand(p, &flags2)) { - /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ - bool ret = task_call_func(p, __set_task_special, NULL); - unlock_task_sighand(p, &flags2); - if (ret) - goto unlock; - } + if (task_call_func(p, __restore_freezer_state, NULL)) + goto unlock; wake_up_state(p, TASK_FROZEN); unlock: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49541e3c1295..5a50c4e41be9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2242,8 +2242,8 @@ static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { /* - * 
Serialize against current_save_and_set_rtlock_wait_state() and - * current_restore_rtlock_saved_state(). + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). */ guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state); @@ -4015,13 +4015,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. * - * The rules of PREEMPT_RT saved_state: + * The rules of saved_state: * * The related locking code always holds p::pi_lock when updating * p::saved_state, which means the code is fully serialized in both cases. * - * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other - * bits set. This allows to distinguish all wakeup scenarios. + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) @@ -4037,10 +4041,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) /* * Saved state preserves the task state across blocking on - * an RT lock. If the state matches, set p::saved_state to - * TASK_RUNNING, but do not wake the task because it waits - * for a lock wakeup. Also indicate success because from - * the regular waker's point of view this has succeeded. + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. 
* * After acquiring the lock the task will restore p::__state * from p::saved_state which ensures that the regular -- cgit v1.2.3 From 1528c661c24b407e92194426b0adbb43de859ce0 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 12 Sep 2023 14:58:08 +0800 Subject: sched/fair: Ratelimit update to tg->load_avg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using sysbench to benchmark Postgres in a single docker instance with sysbench's nr_threads set to nr_cpu, it is observed there are times update_cfs_group() and update_load_avg() show noticeable overhead on a 2sockets/112core/224cpu Intel Sapphire Rapids (SPR): 13.75% 13.74% [kernel.vmlinux] [k] update_cfs_group 10.63% 10.04% [kernel.vmlinux] [k] update_load_avg Annotate shows the cycles are mostly spent on accessing tg->load_avg with update_load_avg() being the write side and update_cfs_group() being the read side. tg->load_avg is per task group and when different tasks of the same taskgroup running on different CPUs frequently access tg->load_avg, it can be heavily contended. E.g. when running postgres_sysbench on a 2sockets/112cores/224cpus Intel Sapphire Rapids, during a 5s window, the wakeup number is 14 million and migration number is 11 million and with each migration, the task's load will transfer from src cfs_rq to target cfs_rq and each change involves an update to tg->load_avg. Since the workload can trigger this many wakeups and migrations, the access (both read and write) to tg->load_avg can be unbounded. As a result, the two mentioned functions showed noticeable overhead. With netperf/nr_client=nr_cpu/UDP_RR, the problem is worse: during a 5s window, wakeup number is 21 million and migration number is 14 million; update_cfs_group() costs ~25% and update_load_avg() costs ~16%. Reduce the overhead by limiting updates to tg->load_avg to at most once per ms. The update frequency is a tradeoff between tracking accuracy and overhead. 
1ms is chosen because PELT window is roughly 1ms and it delivered good results for the tests that I've done. After this change, the cost of accessing tg->load_avg is greatly reduced and performance improved. Detailed test results below. ============================== postgres_sysbench on SPR: 25% base: 42382±19.8% patch: 50174±9.5% (noise) 50% base: 67626±1.3% patch: 67365±3.1% (noise) 75% base: 100216±1.2% patch: 112470±0.1% +12.2% 100% base: 93671±0.4% patch: 113563±0.2% +21.2% ============================== hackbench on ICL: group=1 base: 114912±5.2% patch: 117857±2.5% (noise) group=4 base: 359902±1.6% patch: 361685±2.7% (noise) group=8 base: 461070±0.8% patch: 491713±0.3% +6.6% group=16 base: 309032±5.0% patch: 378337±1.3% +22.4% ============================= hackbench on SPR: group=1 base: 100768±2.9% patch: 103134±2.9% (noise) group=4 base: 413830±12.5% patch: 378660±16.6% (noise) group=8 base: 436124±0.6% patch: 490787±3.2% +12.5% group=16 base: 457730±3.2% patch: 680452±1.3% +48.8% ============================ netperf/udp_rr on ICL 25% base: 114413±0.1% patch: 115111±0.0% +0.6% 50% base: 86803±0.5% patch: 86611±0.0% (noise) 75% base: 35959±5.3% patch: 49801±0.6% +38.5% 100% base: 61951±6.4% patch: 70224±0.8% +13.4% =========================== netperf/udp_rr on SPR 25% base: 104954±1.3% patch: 107312±2.8% (noise) 50% base: 55394±4.6% patch: 54940±7.4% (noise) 75% base: 13779±3.1% patch: 36105±1.1% +162% 100% base: 9703±3.7% patch: 28011±0.2% +189% ============================================== netperf/tcp_stream on ICL (all in noise range) 25% base: 43092±0.1% patch: 42891±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% =============================================== netperf/tcp_stream on SPR (all in noise range) 25% base: 34491±0.3% patch: 34886±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% Reported-by: 
Nitin Tekchandani Suggested-by: Vincent Guittot Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Mathieu Desnoyers Reviewed-by: David Vernet Tested-by: Mathieu Desnoyers Tested-by: Swapnil Sapkal Link: https://lkml.kernel.org/r/20230912065808.2530-2-aaron.lu@intel.com --- kernel/sched/fair.c | 13 ++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c893721ff5b1..d0877878bcdb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3876,7 +3876,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3884,9 +3885,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. 
+ */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68768f47ccb7..887468c48ff6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + u64 last_update_tg_load_avg; unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; -- cgit v1.2.3 From 7ad0354d18ae05e9c8885251e234cbcf141f8972 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Fri, 18 Aug 2023 09:56:33 +0800 Subject: sched/headers: Remove duplicated includes in kernel/sched/sched.h Remove duplicated includes of linux/cgroup.h and linux/psi.h. Both of these includes are included regardless of the config and they are all protected by ifndef, so no point including them again. Signed-off-by: GUO Zihua Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230818015633.18370-1-guozihua@huawei.com --- kernel/sched/sched.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 887468c48ff6..5f217b1e8f1c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -74,15 +74,6 @@ #include "../workqueue_internal.h" -#ifdef CONFIG_CGROUP_SCHED -#include -#include -#endif - -#ifdef CONFIG_SCHED_DEBUG -# include -#endif - #ifdef CONFIG_PARAVIRT # include # include -- cgit v1.2.3 From 82845683ca6a15fe8c7912c6264bb0e84ec6f5fb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Sep 2023 10:31:15 +0200 Subject: sched/fair: Rename check_preempt_wakeup() to check_preempt_wakeup_fair() Other scheduling classes already postfix their similar methods with the class name. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0877878bcdb..aeaf31e32c67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7994,7 +7994,7 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; @@ -12830,7 +12830,7 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .check_preempt_curr = check_preempt_wakeup_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, -- cgit v1.2.3 From e23edc86b09df655bf8963bbcb16647adc787395 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Sep 2023 10:38:21 +0200 Subject: sched/fair: Rename check_preempt_curr() to wakeup_preempt() The name is a bit opaque - make it clear that this is about wakeup preemption. Also rename the ->check_preempt_curr() methods similarly. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 14 +++++++------- kernel/sched/deadline.c | 10 +++++----- kernel/sched/fair.c | 10 +++++----- kernel/sched/idle.c | 4 ++-- kernel/sched/rt.c | 6 +++--- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5a50c4e41be9..52ceb85b6421 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2211,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->check_preempt_curr(rq, p, flags); + rq->curr->sched_class->wakeup_preempt(rq, p, flags); else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); @@ -2508,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } @@ -3390,7 +3390,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -3764,7 +3764,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } activate_task(rq, p, en_flags); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); ttwu_do_wakeup(p); @@ -3835,7 +3835,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) * it should preempt the task that is current now. 
*/ update_rq_clock(rq); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); ret = 1; @@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58b542bf2893..fb1996a674db 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, struct rq *rq) @@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. 
*/ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { @@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); } else { @@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = { .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aeaf31e32c67..fcf0c5bc8b47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8007,7 +8007,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_preempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. 
*/ @@ -8914,7 +8914,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -12369,7 +12369,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12471,7 +12471,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) if (task_current(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } @@ -12830,7 +12830,7 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup_fair, + .wakeup_preempt = check_preempt_wakeup_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 342f58a329f5..26f714003c1f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -400,7 +400,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) { resched_curr(rq); } @@ -481,7 +481,7 @@ DEFINE_SCHED_CLASS(idle) = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, - .check_preempt_curr = check_preempt_curr_idle, + .wakeup_preempt = wakeup_preempt_idle, .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..3e442fa3f6bc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -953,7 +953,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) /* * When 
we're idle and a woken (rt) task is - * throttled check_preempt_curr() will set + * throttled wakeup_preempt() will set * skip_update and the time between the wakeup * and this unthrottle will get accounted as * 'runtime'. @@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_curr(rq); @@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = { .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5f217b1e8f1c..7e070dcf7074 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2236,7 +2236,7 @@ struct sched_class { void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); struct task_struct *(*pick_next_task)(struct rq *rq); @@ -2510,7 +2510,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d6..6cf7304e6449 100644 --- a/kernel/sched/stop_task.c +++ 
b/kernel/sched/stop_task.c @@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SMP */ static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) { /* we're never preempted */ } @@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, - .check_preempt_curr = check_preempt_curr_stop, + .wakeup_preempt = wakeup_preempt_stop, .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, -- cgit v1.2.3 From 17e7170645e34c519443ba63895264bbdee7beee Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Sep 2023 15:00:24 +0200 Subject: sched/debug: Remove the /proc/sys/kernel/sched_child_runs_first sysctl The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: 5e963f2bd4654 ("sched/fair: Commit to EEVDF") Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de --- kernel/sched/debug.c | 1 - kernel/sched/fair.c | 13 ------------- kernel/sched/sched.h | 2 -- 3 files changed, 16 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6..132dfd1e6f47 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -864,7 +864,6 @@ static void sched_debug_header(struct seq_file *m) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fcf0c5bc8b47..75720008fdd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int 
normalized_sysctl_sched_base_slice = 750000ULL; -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7e070dcf7074..9260120ed2a5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -100,8 +100,6 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; -extern unsigned int sysctl_sched_child_runs_first; - extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); -- cgit v1.2.3 From 622f0a1d544fa88dda10d27727835e825c84ae0f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Sep 2023 15:00:25 +0200 Subject: sched/debug: Update stale reference to sched_debug.c Since commit: 8a99b6833c884 ("sched: Move SCHED_DEBUG sysctl to debugfs") The sched_debug interface moved from /proc to debugfs. The comment still mentions the outdated proc interface. Update the comment to point to the current location of the interface. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920130025.412071-3-bigeasy@linutronix.de --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 132dfd1e6f47..5e34a8cb2c76 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,7 +8,7 @@ */ /* - * This allows printing both to /proc/sched_debug and + * This allows printing both to /sys/kernel/debug/sched/debug and * to the console */ #define SEQ_printf(m, x...) \ -- cgit v1.2.3 From 87c3a5893e865739ce78aa7192d36011022e0af7 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Fri, 15 Sep 2023 15:47:11 +1000 Subject: sched/core: Optimize in_task() and in_interrupt() a bit Except on x86, preempt_count is always accessed with READ_ONCE(). Repeated invocations in macros like irq_count() produce repeated loads. These redundant instructions appear in various fast paths. In the one shown below, for example, irq_count() is evaluated during kernel entry if !tick_nohz_full_cpu(smp_processor_id()). 0001ed0a : 1ed0a: 4e56 0000 linkw %fp,#0 1ed0e: 200f movel %sp,%d0 1ed10: 0280 ffff e000 andil #-8192,%d0 1ed16: 2040 moveal %d0,%a0 1ed18: 2028 0008 movel %a0@(8),%d0 1ed1c: 0680 0001 0000 addil #65536,%d0 1ed22: 2140 0008 movel %d0,%a0@(8) 1ed26: 082a 0001 000f btst #1,%a2@(15) 1ed2c: 670c beqs 1ed3a 1ed2e: 2028 0008 movel %a0@(8),%d0 1ed32: 2028 0008 movel %a0@(8),%d0 1ed36: 2028 0008 movel %a0@(8),%d0 1ed3a: 4e5e unlk %fp 1ed3c: 4e75 rts This patch doesn't prevent the pointless btst and beqs instructions above, but it does eliminate 2 of the 3 pointless move instructions here and elsewhere. On x86, preempt_count is per-cpu data and the problem does not arise presumably because the compiler is free to optimize more effectively. This patch was tested on m68k and x86. I was expecting no changes to object code for x86 and mostly that's what I saw. 
However, there were a few places where code generation was perturbed for some reason. The performance issue addressed here is minor on uniprocessor m68k. I got a 0.01% improvement from this patch for a simple "find /sys -false" benchmark. For architectures and workloads susceptible to cache line bounce the improvement is expected to be larger. The only SMP architecture I have is x86, and as x86 is unaffected I have not done any further measurements. Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit") Signed-off-by: Finn Thain Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/0a403120a682a525e6db2d81d1a3ffcc137c3742.1694756831.git.fthain@linux-m68k.org --- include/linux/preempt.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1424670df161..9aa6358a1a16 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void) return level; } +/* + * These macro definitions avoid redundant invocations of preempt_count() + * because such invocations would result in redundant loads given that + * preempt_count() is commonly implemented with READ_ONCE(). 
+ */ + #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) +# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) +# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif -#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) /* * Macros to retrieve the current execution context: @@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void) #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) -#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) +#ifdef CONFIG_PREEMPT_RT +# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) +#else +# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) +#endif /* * The following macros are deprecated and should not be used in new code: -- cgit v1.2.3 From 3ba78da711940ce07c39c4cdd1f4ad284067a42d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 6 Jun 2021 13:27:15 +0200 Subject: sched/headers: Add header guard to It's the only non-trivial header in include/linux/sched/ missing a header guard. 
Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 7c83d4d5a971..df3aca89d4f5 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEADLINE_H +#define _LINUX_SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); #endif /* CONFIG_SMP */ + +#endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.2.3 From 6eddb116dd830436afbd922568292867de6c8b9e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:24:17 +0200 Subject: sched/headers: Standardize the header guard name Use the same _LINUX_SCHED_ prefix nomenclature as the other 29 header guards in include/linux/sched/ do. Signed-off-by: Ingo Molnar --- include/linux/sched/vhost_task.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index 837a23624a66..bc60243d43b3 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_VHOST_TASK_H -#define _LINUX_VHOST_TASK_H - +#ifndef _LINUX_SCHED_VHOST_TASK_H +#define _LINUX_SCHED_VHOST_TASK_H struct vhost_task; @@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk); void vhost_task_stop(struct vhost_task *vtsk); void vhost_task_wake(struct vhost_task *vtsk); -#endif +#endif /* _LINUX_SCHED_VHOST_TASK_H */ -- cgit v1.2.3 From 0f9a1a4d234c064d8dff69cf3f3755554dd479ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:27:37 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/types.h | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h index 3c3e049224ae..969aaf5ef9d6 100644 --- a/include/linux/sched/types.h +++ b/include/linux/sched/types.h @@ -20,4 +20,4 @@ struct task_cputime { unsigned long long sum_exec_runtime; }; -#endif +#endif /* _LINUX_SCHED_TYPES_H */ -- cgit v1.2.3 From 1632d47fae2f2d229dd432854c4443ebb0bb27a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:28:48 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 59d3736c454c..fb1e295e7e63 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; } void arch_smt_update(void); -#endif +#endif /* _LINUX_SCHED_SMT_H */ -- cgit v1.2.3 From 8bf0cdfac7f8aa3fa6151b5c5f5eebdb44a64e89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:32:58 +0200 Subject: : Introduce the list_for_each_reverse() method The list_head counterpart of list_for_each_entry_reverse() was missing, add it to complete the list handling APIs in . [ This new API is also relied on by a WIP scheduler patch, so this variant is not a theoretical possibility only. 
] Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- include/linux/list.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/list.h b/include/linux/list.h index 164b4d0e9d2a..1837caedf723 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) +/** + * list_for_each_reverse - iterate backwards over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_reverse(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From dc461c48deda8a2d243fbaf49e276d555eb833d8 Mon Sep 17 00:00:00 2001 From: Liming Wu Date: Fri, 25 Aug 2023 10:35:00 +0800 Subject: sched/debug: Avoid checking in_atomic_preempt_off() twice in schedule_debug() in_atomic_preempt_off() already gets called in schedule_debug() once, which is the only caller of __schedule_bug(). Skip the second call within __schedule_bug(), it should always be true at this point. [ mingo: Clarified the changelog. 
] Signed-off-by: Liming Wu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230825023501.1848-1-liming.wu@jaguarmicro.com --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 52ceb85b6421..107493469b4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5899,8 +5899,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } -- cgit v1.2.3 From 30797bce8ef0c73f0c388148ffac92458533b10e Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 22 Sep 2023 16:05:34 -0700 Subject: sched/fair: Make cfs_rq->throttled_csd_list available on !SMP This makes the following patch cleaner by avoiding extra CONFIG_SMP conditionals on the availability of rq->throttled_csd_list. 
Signed-off-by: Josh Don Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230922230535.296350-1-joshdon@google.com --- kernel/sched/fair.c | 4 ---- kernel/sched/sched.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75720008fdd2..41c960eca792 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5763,11 +5763,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) if (!cfs_rq_throttled(cfs_rq)) goto next; -#ifdef CONFIG_SMP /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) goto next; -#endif /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); @@ -6134,9 +6132,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); -#ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9260120ed2a5..96f8ab7a0702 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -634,9 +634,7 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; -#ifdef CONFIG_SMP struct list_head throttled_csd_list; -#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; -- cgit v1.2.3 From 2f8c62296b6f656bbfd17e9f1fadd7478003a9d9 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 22 Sep 2023 16:05:35 -0700 Subject: sched/fair: Fix warning in bandwidth distribution We've observed the following warning being hit in distribute_cfs_runtime(): SCHED_WARN_ON(cfs_rq->runtime_remaining > 0) We have the following race: - CPU 0: running bandwidth distribution (distribute_cfs_runtime). Inspects the local cfs_rq and makes its runtime_remaining positive. However, we defer unthrottling the local cfs_rq until after considering all remote cfs_rq's. 
- CPU 1: starts running bandwidth distribution from the slack timer. When it finds the cfs_rq for CPU 0 on the throttled list, it observes that the cfs_rq is throttled, yet is not on the CSD list, and has a positive runtime_remaining, thus triggering the warning in distribute_cfs_runtime. To fix this, we can rework the local unthrottling logic to put the local cfs_rq on a local list, so that any future bandwidth distributions will realize that the cfs_rq is about to be unthrottled. Signed-off-by: Josh Don Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230922230535.296350-2-joshdon@google.com --- kernel/sched/fair.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41c960eca792..2973173ad850 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5741,13 +5741,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { - struct cfs_rq *local_unthrottle = NULL; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; bool throttled = false; - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *tmp; struct rq_flags rf; struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -5782,11 +5782,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu || - SCHED_WARN_ON(local_unthrottle)) + if (cpu_of(rq) != this_cpu) { unthrottle_cfs_rq_async(cfs_rq); - else - local_unthrottle = cfs_rq; + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. 
+ */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } } else { throttled = true; } @@ -5794,15 +5800,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) next: rq_unlock_irqrestore(rq, &rf); } - rcu_read_unlock(); - if (local_unthrottle) { - rq = cpu_rq(this_cpu); + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + rq_lock_irqsave(rq, &rf); - if (cfs_rq_throttled(local_unthrottle)) - unthrottle_cfs_rq(local_unthrottle); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + + rcu_read_unlock(); return throttled; } -- cgit v1.2.3 From 3eafe225995c67f8c179011ec2d6e4c12b32a53d Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Sun, 20 Aug 2023 20:53:17 +0800 Subject: sched/core: Refactor the task_flags check for worker sleeping in sched_submit_work() Simplify the conditional logic for checking worker flags by splitting the original compound `if` statement into separate `if` and `else if` clauses. This modification not only retains the previous functionality, but also reduces a single `if` check, improving code clarity and potentially enhancing performance. Signed-off-by: Wang Jinchao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ZOIMvURE99ZRAYEj@fedora --- kernel/sched/core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 107493469b4e..84881a582847 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6711,12 +6711,10 @@ static inline void sched_submit_work(struct task_struct *tsk) * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. 
*/ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - if (task_flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - } + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); /* * spinlock and rwlock must not flush block requests. This will -- cgit v1.2.3 From 612f769edd06a6e42f7cd72425488e68ddaeef0a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 11 Aug 2023 12:20:44 +0100 Subject: sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask Sebastian noted that the rto_push_work IRQ work can be queued for a CPU that has an empty pushable_tasks list, which means nothing useful will be done in the IPI other than queue the work for the next CPU on the rto_mask. rto_push_irq_work_func() only operates on tasks in the pushable_tasks list, but the conditions for that irq_work to be queued (and for a CPU to be added to the rto_mask) rely on rt_rq->rt_nr_migratory instead. rt_nr_migratory is increased whenever an RT task entity is enqueued and it has nr_cpus_allowed > 1. Unlike the pushable_tasks list, rt_nr_migratory includes a rt_rq's current task. This means a rt_rq can have a migratable current, N non-migratable queued tasks, and be flagged as overloaded / have its CPU set in the rto_mask, despite having an empty pushable_tasks list. Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task(). Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them. Note that the case where the current task is pushed away to make way for a migration-disabled task remains unchanged: the migration-disabled task has to be in the pushable_tasks list in the first place, which means it has nr_cpus_allowed > 1. 
Reported-by: Sebastian Andrzej Siewior Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Tested-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20230811112044.3302588-1-vschneid@redhat.com --- kernel/sched/debug.c | 3 --- kernel/sched/rt.c | 70 ++++++++-------------------------------------------- kernel/sched/sched.h | 2 -- 3 files changed, 10 insertions(+), 65 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5e34a8cb2c76..c4253bd2dfb0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); -#ifdef CONFIG_SMP - PU(rt_nr_migratory); -#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3e442fa3f6bc..3b627ab586fb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -143,7 +143,6 @@ void init_rt_rq(struct rt_rq *rt_rq) #if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif /* CONFIG_SMP */ @@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; - - 
update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - static inline int has_pushable_tasks(struct rq *rq) { return !plist_head_empty(&rq->rt.pushable_tasks); @@ -438,6 +390,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -451,6 +408,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) rq->rt.highest_prio.next = p->prio; } else { rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } } } @@ -464,16 +426,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { } -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -1281,7 +1233,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1294,7 +1245,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h index 96f8ab7a0702..41d760df458f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -663,8 +663,6 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned int rt_nr_migratory; - unsigned int rt_nr_total; int overloaded; struct plist_head pushable_tasks; -- cgit v1.2.3 From 5fe7765997b139e2d922b58359dea181efe618f9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 28 Sep 2023 17:02:51 +0200 Subject: sched/deadline: Make dl_rq->pushable_dl_tasks update drive dl_rq->overloaded dl_rq->dl_nr_migratory is increased whenever a DL entity is enqueued and it has nr_cpus_allowed > 1. Unlike the pushable_dl_tasks tree, dl_rq->dl_nr_migratory includes a dl_rq's current task. This means a dl_rq can have a migratable current, N non-migratable queued tasks, and be flagged as overloaded and have its CPU set in the dlo_mask, despite having an empty pushable_dl_tasks tree. Make a dl_rq's overload logic be driven by {enqueue,dequeue}_pushable_dl_task(), in other words make DL RQs only be flagged as overloaded if they have at least one runnable-but-not-current migratable task. o push_dl_task() is unaffected, as it is a no-op if there are no pushable tasks. o pull_dl_task() now no longer scans runqueues whose sole migratable task is their current one, which it can't do anything about anyway. It may also now pull tasks to a DL RQ with dl_nr_running > 1 if only its current task is migratable. Since dl_rq->dl_nr_migratory becomes unused, remove it. 
RT had the exact same mechanism (rt_rq->rt_nr_migratory) which was dropped in favour of relying on rt_rq->pushable_tasks, see: 612f769edd06 ("sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask") Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20230928150251.463109-1-vschneid@redhat.com --- kernel/sched/deadline.c | 57 ++++++++++++------------------------------------- kernel/sched/debug.c | 1 - kernel/sched/sched.h | 1 - 3 files changed, 14 insertions(+), 45 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb1996a674db..d98408a274e5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} - -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); -} - -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); -} - #define __node_2_pdl(node) \ rb_entry((node), struct task_struct, pushable_dl_tasks) @@ -594,6 +560,11 @@ static inline bool 
__pushable_less(struct rb_node *a, const struct rb_node *b) return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. @@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) __pushable_less); if (leftmost) rq->dl.earliest_dl.next = p->dl.deadline; + + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); @@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline @@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); } static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) @@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c4253bd2dfb0..4580a450700e 100644 --- 
a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -745,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) PU(dl_nr_running); #ifdef CONFIG_SMP - PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 41d760df458f..649eb9ec0657 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -707,7 +707,6 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned int dl_nr_migratory; int overloaded; /* -- cgit v1.2.3 From 6b00a40147653c8ea748e8f4396510f252763364 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 17 Sep 2023 00:29:53 +0100 Subject: sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0 When uclamp_max is being used, the util of the task could be higher than the spare capacity of the CPU, but due to uclamp_max value we force-fit it there. The way the condition for checking for max_spare_cap in find_energy_efficient_cpu() was constructed; it ignored any CPU that has its spare_cap less than or _equal_ to max_spare_cap. Since we initialize max_spare_cap to 0; this lead to never setting max_spare_cap_cpu and hence ending up never performing compute_energy() for this cluster and missing an opportunity for a better energy efficient placement to honour uclamp_max setting. max_spare_cap = 0; cpu_cap = capacity_of(cpu) - cpu_util(p); // 0 if cpu_util(p) is high ... util_fits_cpu(...); // will return true if uclamp_max forces it to fit ... // this logic will fail to update max_spare_cap_cpu if cpu_cap is 0 if (cpu_cap > max_spare_cap) { max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } prev_spare_cap suffers from a similar problem. Fix the logic by converting the variables into long and treating -1 value as 'not populated' instead of 0 which is a viable and correct spare capacity value. We need to be careful signed comparison is used when comparing with cpu_cap in one of the conditions. 
Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions") Signed-off-by: Qais Yousef (Google) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230916232955.2099394-2-qyousef@layalina.io --- kernel/sched/fair.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2973173ad850..4ce949bb0213 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7703,11 +7703,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; - unsigned long cur_delta, max_spare_cap = 0; + long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; - unsigned long prev_spare_cap = 0; + unsigned long cur_delta, base_energy; int max_spare_cap_cpu = -1; - unsigned long base_energy; int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7770,7 +7769,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_spare_cap = cpu_cap; prev_fits = fits; } else if ((fits > max_fits) || - ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7782,7 +7781,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } - if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -7790,7 +7789,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. 
*/ - if (prev_spare_cap > 0) { + if (prev_spare_cap > -1) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ -- cgit v1.2.3 From 23c9519def98ee0fa97ea5871535e9b136f522fc Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 17 Sep 2023 00:29:54 +0100 Subject: sched/uclamp: Ignore (util == 0) optimization in feec() when p_util_max = 0 find_energy_efficient_cpu() bails out early if effective util of the task is 0 as the delta at this point will be zero and there's nothing for EAS to do. When uclamp is being used, this could lead to wrong decisions when uclamp_max is set to 0. In this case the task is capped to performance point 0, but it is actually running and consuming energy and we can benefit from EAS energy calculations. Rework the condition so that it bails out when both util and uclamp_min are 0. We can do that without needing to use uclamp_task_util(); remove it. Fixes: d81304bc6193 ("sched/uclamp: Cater for uclamp in find_energy_efficient_cpu()'s early exit condition") Signed-off-by: Qais Yousef (Google) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230916232955.2099394-3-qyousef@layalina.io --- kernel/sched/fair.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ce949bb0213..284b0abe7f9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4558,22 +4558,6 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return clamp(task_util_est(p), uclamp_min, uclamp_max); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - 
return task_util_est(p); -} -#endif - static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -7695,7 +7679,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!uclamp_task_util(p, p_util_min, p_util_max)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); -- cgit v1.2.3 From 15874a3d27e6405e9d17595f83bd3ca1b6cab16d Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 17 Sep 2023 00:29:55 +0100 Subject: sched/debug: Add new tracepoint to track compute energy computation It was useful to track feec() placement decision and debug the spare capacity and optimization issues vs uclamp_max. Signed-off-by: Qais Yousef (Google) Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230916232955.2099394-4-qyousef@layalina.io --- include/trace/events/sched.h | 5 +++++ kernel/sched/core.c | 1 + kernel/sched/fair.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbb99a61f714..a13d5d06be9d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -735,6 +735,11 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); +DECLARE_TRACE(sched_compute_energy_tp, + TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, + unsigned long max_util, unsigned long busy_time), + TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84881a582847..324980e3d2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -114,6 +114,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); 
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 284b0abe7f9b..e2a69af8be36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7600,11 +7600,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, { unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; if (dst_cpu >= 0) busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); - return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* -- cgit v1.2.3 From 079be8fc630943d9fc70a97807feb73d169ee3fc Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Mon, 2 Oct 2023 13:55:51 +0200 Subject: sched/rt: Disallow writing invalid values to sched_rt_period_us The validation of the value written to sched_rt_period_us was broken because: - the sysctl_sched_rt_period is declared as unsigned int - parsed by proc_dointvec() - the range is asserted after the value parsed by proc_dointvec() Because of this, negative values written to the file were stored into an unsigned integer and later on interpreted as large positive integers, which passed the check: if (sysctl_sched_rt_period <= 0) return -EINVAL; This commit fixes the parsing by setting an explicit range for both period_us and runtime_us in the sched_rt_sysctls table and processing the values with proc_dointvec_minmax() instead.
Alternatively, if we wanted to use the full range of unsigned int for the period value, we would have to split the proc_handler and use proc_douintvec() for it; however, even Documentation/scheduler/sched-rt-group.rst describes the range as 1 to INT_MAX. As far as I can tell the only problem this causes is that the sysctl file allows writing negative values which when read back may confuse userspace. There is also an LTP test being submitted for these sysctl files at: http://patchwork.ozlabs.org/project/ltp/patch/20230901144433.2526-1-chrubis@suse.cz/ Signed-off-by: Cyril Hrubis Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231002115553.3007-2-chrubis@suse.cz --- kernel/sched/rt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3b627ab586fb..88fc98601413 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -37,6 +37,8 @@ static struct ctl_table sched_rt_sysctls[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sched_rt_runtime_us", @@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sched_rr_timeslice_ms", @@ -2935,9 +2939,6 @@ static int sched_rt_global_constraints(void) #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || ((u64)sysctl_sched_rt_runtime * @@ -2968,7 +2969,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if
(!ret && write) { ret = sched_rt_global_validate(); -- cgit v1.2.3 From e6dbdd8fb75526b01787050087b65d12c76b3666 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Mon, 2 Oct 2023 13:55:52 +0200 Subject: sched/rt/docs: Clarify & fix sched_rt_* sysctl docs - Describe explicitly that sched_rt_runtime_us is allocated from sched_rt_period_us and hence always less or equal to that value. - The limit for sched_rt_runtime_us is not INT_MAX-1, but rather it's limited by the value of sched_rt_period_us. If sched_rt_period_us is INT_MAX then sched_rt_runtime_us can be set to INT_MAX as well. Signed-off-by: Cyril Hrubis Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231002115553.3007-3-chrubis@suse.cz --- Documentation/scheduler/sched-rt-group.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index 655a096ec8fb..a16bee8f74c2 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -87,18 +87,20 @@ lack an EDF scheduler to make non-uniform periods usable. The system wide settings are configured under the /proc virtual file system: /proc/sys/kernel/sched_rt_period_us: - The scheduling period that is equivalent to 100% CPU bandwidth + The scheduling period that is equivalent to 100% CPU bandwidth. /proc/sys/kernel/sched_rt_runtime_us: - A global limit on how much time realtime scheduling may use. Even without - CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime - processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth - available to all realtime groups. + A global limit on how much time realtime scheduling may use. This is always + less or equal to the period_us, as it denotes the time allocated from the + period_us for the realtime tasks. Even without CONFIG_RT_GROUP_SCHED enabled, + this will limit time reserved to realtime processes. 
With + CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all + realtime groups. * Time is specified in us because the interface is s32. This gives an operating range from 1us to about 35 minutes. * sched_rt_period_us takes values from 1 to INT_MAX. - * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1). + * sched_rt_runtime_us takes values from -1 to sched_rt_period_us. * A run time of -1 specifies runtime == period, ie. no limit. -- cgit v1.2.3 From 83494dc51033506eb60c5e11a335461b2dc42111 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Mon, 2 Oct 2023 13:55:53 +0200 Subject: sched/rt/docs: Use 'real-time' instead of 'realtime' Standardize on a single variant. Signed-off-by: Cyril Hrubis Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231002115553.3007-4-chrubis@suse.cz --- Documentation/scheduler/sched-rt-group.rst | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index a16bee8f74c2..d685609ed3d7 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -39,10 +39,10 @@ Most notable: 1.1 The problem --------------- -Realtime scheduling is all about determinism, a group has to be able to rely on +Real-time scheduling is all about determinism, a group has to be able to rely on the amount of bandwidth (eg. CPU time) being constant. In order to schedule -multiple groups of realtime tasks, each group must be assigned a fixed portion -of the CPU time available. Without a minimum guarantee a realtime group can +multiple groups of real-time tasks, each group must be assigned a fixed portion +of the CPU time available. Without a minimum guarantee a real-time group can obviously fall short. A fuzzy upper limit is of no use since it cannot be relied upon. Which leaves us with just the single fixed portion. @@ -50,14 +50,14 @@ relied upon. 
Which leaves us with just the single fixed portion. ---------------- CPU time is divided by means of specifying how much time can be spent running -in a given period. We allocate this "run time" for each realtime group which -the other realtime groups will not be permitted to use. +in a given period. We allocate this "run time" for each real-time group which +the other real-time groups will not be permitted to use. -Any time not allocated to a realtime group will be used to run normal priority +Any time not allocated to a real-time group will be used to run normal priority tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by SCHED_OTHER. -Let's consider an example: a frame fixed realtime renderer must deliver 25 +Let's consider an example: a frame fixed real-time renderer must deliver 25 frames a second, which yields a period of 0.04s per frame. Now say it will also have to play some music and respond to input, leaving it with around 80% CPU time dedicated for the graphics. We can then give this group a run time of 0.8 @@ -70,7 +70,7 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s = of 0.00015s. The remaining CPU time will be used for user input and other tasks. Because -realtime tasks have explicitly allocated the CPU time they need to perform +real-time tasks have explicitly allocated the CPU time they need to perform their tasks, buffer underruns in the graphics or audio can be eliminated. NOTE: the above example is not fully implemented yet. We still @@ -90,12 +90,12 @@ The system wide settings are configured under the /proc virtual file system: The scheduling period that is equivalent to 100% CPU bandwidth. /proc/sys/kernel/sched_rt_runtime_us: - A global limit on how much time realtime scheduling may use. This is always + A global limit on how much time real-time scheduling may use. This is always less or equal to the period_us, as it denotes the time allocated from the - period_us for the realtime tasks. 
Even without CONFIG_RT_GROUP_SCHED enabled, - this will limit time reserved to realtime processes. With + period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled, + this will limit time reserved to real-time processes. With CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all - realtime groups. + real-time groups. * Time is specified in us because the interface is s32. This gives an operating range from 1us to about 35 minutes. @@ -110,7 +110,7 @@ The system wide settings are configured under the /proc virtual file system: The default values for sched_rt_period_us (1000000 or 1s) and sched_rt_runtime_us (950000 or 0.95s). This gives 0.05s to be used by SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away -realtime tasks will not lock up the machine but leave a little time to recover +real-time tasks will not lock up the machine but leave a little time to recover it. By setting runtime to -1 you'd get the old behaviour back. By default all bandwidth is assigned to the root group and new groups get the @@ -118,10 +118,10 @@ period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you want to assign bandwidth to another group, reduce the root group's bandwidth and assign some or all of the difference to another group. -Realtime group scheduling means you have to assign a portion of total CPU -bandwidth to the group before it will accept realtime tasks. Therefore you will -not be able to run realtime tasks as any user other than root until you have -done that, even if the user has the rights to run processes with realtime +Real-time group scheduling means you have to assign a portion of total CPU +bandwidth to the group before it will accept real-time tasks. Therefore you will +not be able to run real-time tasks as any user other than root until you have +done that, even if the user has the rights to run processes with real-time priority! 
-- cgit v1.2.3 From d844fe65f0957024c3e1b0bf2a0615246184d9bc Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 7 Aug 2023 20:03:57 -0700 Subject: sched/headers: Move 'struct sched_param' out of uapi, to work around glibc/musl breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both glibc and musl define 'struct sched_param' in sched.h, while kernel has it in uapi/linux/sched/types.h, making it cumbersome to use sched_getattr(2) or sched_setattr(2) from userspace. For example, something like this: #include <sched.h> #include <linux/sched/types.h> struct sched_attr sa; will result in "error: redefinition of ‘struct sched_param’" (note the code doesn't need sched_param at all -- it needs struct sched_attr plus some stuff from sched.h). The situation is, glibc is not going to provide a wrapper for sched_{get,set}attr, thus the need to include linux/sched/types.h directly, which leads to the above problem. Thus, the userspace is left with a few sub-par choices when it wants to use e.g. sched_setattr(2), such as maintaining a copy of struct sched_attr definition, or using some other ugly tricks. OTOH, 'struct sched_param' is well known, defined in POSIX, and it won't be ever changed (as that would break backward compatibility). So, while 'struct sched_param' is indeed part of the kernel uapi, exposing it the way it's done now creates an issue, and hiding it (like this patch does) fixes that issue, hopefully without creating another one: common userspace software rely on libc headers, and as for "special" software (like libc), it looks like glibc and musl do not rely on kernel headers for 'struct sched_param' definition (but let's Cc their mailing lists in case it's otherwise). The alternative to this patch would be to move struct sched_attr to, say, linux/sched.h, or linux/sched/attr.h (the new file).
Oh, and here is the previous attempt to fix the issue: https://lore.kernel.org/all/20200528135552.GA87103@google.com/ While I support Linus arguments, the issue is still here and needs to be fixed. [ mingo: Linus is right, this shouldn't be needed - but on the other hand I agree that this header is not really helpful to user-space as-is. So let's pretend that is only about sched_attr, and call this commit a workaround for user-space breakage that it in reality is ... Also, remove the Fixes tag. ] Signed-off-by: Kir Kolyshkin Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230808030357.1213829-1-kolyshkin@gmail.com --- include/linux/sched.h | 5 ++++- include/uapi/linux/sched/types.h | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dc37ae787e33..e4235bbfad77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,7 +63,6 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; -struct sched_param; struct seq_file; struct sighand_struct; struct signal_struct; @@ -370,6 +369,10 @@ extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif +struct sched_param { + int sched_priority; +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index f2c4589d4dbf..90662385689b 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -4,10 +4,6 @@ #include -struct sched_param { - int sched_priority; -}; - #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ -- cgit v1.2.3 From d4d6596b43868a1e05fe5b047e73c3aff96444c6 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Wed, 2 Aug 2023 10:15:01 +0800 Subject: sched/headers: Remove duplicate header inclusions and "autogroup.h" are included twice, 
remove the duplicate header inclusion. Signed-off-by: Yu Liao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230802021501.2511569-1-liaoyu15@huawei.com --- kernel/sched/build_utility.c | 1 - kernel/sched/core.c | 1 - 2 files changed, 2 deletions(-) diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f..80a3df49ab47 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 324980e3d2e5..27aff98645e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,7 +85,6 @@ #include "sched.h" #include "stats.h" -#include "autogroup.h" #include "autogroup.h" #include "pelt.h" -- cgit v1.2.3 From ea41bb514fe286bf50498b3c6d7f7a5dc2b6c5e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Oct 2023 11:33:36 +0200 Subject: sched/core: Update stale comment in try_to_wake_up() The following commit: 9b3c4ab3045e ("sched,rcu: Rework try_invoke_on_locked_down_task()") ... renamed try_invoke_on_locked_down_task() to task_call_func(), but forgot to update the comment in try_to_wake_up(). But it turns out that the smp_rmb() doesn't live in task_call_func() either, it was moved to __task_needs_rq_lock() in: 91dabf33ae5d ("sched: Fix race in task_call_func()") Fix that now. Also fix the s/smb/smp typo while at it. Reported-by: Zhang Qiao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230731085759.11443-1-zhangqiao22@huawei.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 65e10ac34660..f5783cb16791 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4237,7 +4237,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). 
See the comment for smp_mb__after_spinlock(). * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + * A similar smp_rmb() lives in __task_needs_rq_lock(). */ smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -- cgit v1.2.3 From e3e3bab1844d448a239cd57ebf618839e26b4157 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 27 Jul 2023 14:45:57 -0400 Subject: x86/speculation: Add __update_spec_ctrl() helper Add a new __update_spec_ctrl() helper which is a variant of update_spec_ctrl() that can be used in a noinstr function. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20230727184600.26768-2-longman@redhat.com --- arch/x86/include/asm/spec-ctrl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index cb0386fc4dc3..c648502e4535 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include #include +#include /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. + */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else -- cgit v1.2.3 From 2743fe89d4d41616ffbe1e7e96e443ae7a4b1cc6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 27 Jul 2023 14:45:58 -0400 Subject: x86/idle: Disable IBRS when CPU is offline to improve single-threaded performance Commit bf5835bcdb96 ("intel_idle: Disable IBRS during long idle") disables IBRS when the CPU enters long idle. 
However, when a CPU becomes offline, the IBRS bit is still set when X86_FEATURE_KERNEL_IBRS is enabled. That will impact the performance of a sibling CPU. Mitigate this performance impact by clearing all the mitigation bits in SPEC_CTRL MSR when offline. When the CPU is online again, it will be re-initialized and so restoring the SPEC_CTRL value isn't needed. Add a comment to say that native_play_dead() is a __noreturn function, but it can't be marked as such to avoid confusion about the missing MSR restoration code. When DPDK is running on an isolated CPU thread processing network packets in user space while its sibling thread is idle. The performance of the busy DPDK thread with IBRS on and off in the sibling idle thread are: IBRS on IBRS off ------- -------- packets/second: 7.8M 10.4M avg tsc cycles/packet: 282.26 209.86 This is a 25% performance degradation. The test system is a Intel Xeon 4114 CPU @ 2.20GHz. [ mingo: Extended the changelog with performance data from the 0/4 mail. ] Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20230727184600.26768-3-longman@redhat.com --- arch/x86/kernel/smpboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 48e040618731..02765d9da682 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -87,6 +87,7 @@ #include #include #include +#include /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -1623,8 +1624,15 @@ void __noreturn hlt_play_dead(void) native_halt(); } +/* + * native_play_dead() is essentially a __noreturn function, but it can't + * be marked as such as the compiler may complain about it. 
+ */ void native_play_dead(void) { + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + __update_spec_ctrl(0); + play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); -- cgit v1.2.3 From 7506203089dceb1d9e1f35d37ad2e46d44798a6d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 27 Jul 2023 14:45:59 -0400 Subject: intel_idle: Use __update_spec_ctrl() in intel_idle_ibrs() When intel_idle_ibrs() is called, it modifies the SPEC_CTRL MSR to 0 in order to disable IBRS. However, the new MSR value isn't reflected in x86_spec_ctrl_current which is at odds with the other code that keeps track of its state in that percpu variable. Use the new __update_spec_ctrl() to have the x86_spec_ctrl_current percpu value properly updated. Since spec-ctrl.h includes both msr.h and nospec-branch.h, we can remove those from the include file list. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20230727184600.26768-4-longman@redhat.com --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ea5a6a14c553..86ac9a441f85 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -53,9 +53,8 @@ #include #include #include -#include #include -#include +#include #include #define INTEL_IDLE_VERSION "0.5.1" @@ -182,12 +181,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, int ret; if (smt_active) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + __update_spec_ctrl(0); ret = __intel_idle(dev, drv, index); if (smt_active) - native_wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); + __update_spec_ctrl(spec_ctrl); return ret; } -- cgit v1.2.3 From aa1567a7e6440b8c3af4b0d8a8219d8fc5028c5f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 27 Jul 2023 14:46:00 -0400 Subject: intel_idle: Add ibrs_off module parameter to force-disable IBRS Commit bf5835bcdb96 ("intel_idle: Disable IBRS during long 
idle") disables IBRS when the cstate is 6 or lower. However, there are some use cases where a customer may want to use max_cstate=1 to lower latency. Such use cases will suffer from the performance degradation caused by the enabling of IBRS in the sibling idle thread. Add a "ibrs_off" module parameter to force disable IBRS and the CPUIDLE_FLAG_IRQ_ENABLE flag if set. In the case of a Skylake server with max_cstate=1, this new ibrs_off option will likely increase the IRQ response latency as IRQ will now be disabled. When running SPECjbb2015 with cstates set to C1 on a Skylake system. First test when the kernel is booted with: "intel_idle.ibrs_off": max-jOPS = 117828, critical-jOPS = 66047 Then retest when the kernel is booted without the "intel_idle.ibrs_off" added: max-jOPS = 116408, critical-jOPS = 58958 That means booting with "intel_idle.ibrs_off" improves performance by: max-jOPS: +1.2%, which could be considered noise range. critical-jOPS: +12%, which is definitely a solid improvement. The admin-guide/pm/intel_idle.rst file is updated to add a description about the new "ibrs_off" module parameter. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20230727184600.26768-5-longman@redhat.com --- Documentation/admin-guide/pm/intel_idle.rst | 17 ++++++++++++++++- drivers/idle/intel_idle.c | 11 ++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index b799a43da62e..39bd6ecce7de 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -170,7 +170,7 @@ and ``idle=nomwait``. If any of them is present in the kernel command line, the ``MWAIT`` instruction is not allowed to be used, so the initialization of ``intel_idle`` will fail. 
-Apart from that there are four module parameters recognized by ``intel_idle`` +Apart from that there are five module parameters recognized by ``intel_idle`` itself that can be set via the kernel command line (they cannot be updated via sysfs, so that is the only way to change their values). @@ -216,6 +216,21 @@ are ignored). The idle states disabled this way can be enabled (on a per-CPU basis) from user space via ``sysfs``. +The ``ibrs_off`` module parameter is a boolean flag (defaults to +false). If set, it is used to control if IBRS (Indirect Branch Restricted +Speculation) should be turned off when the CPU enters an idle state. +This flag does not affect CPUs that use Enhanced IBRS which can remain +on with little performance impact. + +For some CPUs, IBRS will be selected as mitigation for Spectre v2 and Retbleed +security vulnerabilities by default. Leaving the IBRS mode on while idling may +have a performance impact on its sibling CPU. The IBRS mode will be turned off +by default when the CPU enters into a deep idle state, but not in some +shallower ones. Setting the ``ibrs_off`` module parameter will force the IBRS +mode to off when the CPU is in any one of the available idle states. This may +help performance of a sibling CPU at the expense of a slightly higher wakeup +latency for the idle CPU. + .. 
_intel-idle-core-and-package-idle-states: diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 86ac9a441f85..dcda0afecfc5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -68,6 +68,7 @@ static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask __read_mostly; static unsigned int preferred_states_mask __read_mostly; static bool force_irq_on __read_mostly; +static bool ibrs_off __read_mostly; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -1852,11 +1853,13 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate) } if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && - state->flags & CPUIDLE_FLAG_IBRS) { + ((state->flags & CPUIDLE_FLAG_IBRS) || ibrs_off)) { /* * IBRS mitigation requires that C-states are entered * with interrupts disabled. */ + if (ibrs_off && (state->flags & CPUIDLE_FLAG_IRQ_ENABLE)) + state->flags &= ~CPUIDLE_FLAG_IRQ_ENABLE; WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); state->enter = intel_idle_ibrs; return; @@ -2175,3 +2178,9 @@ MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags. */ module_param(force_irq_on, bool, 0444); +/* + * Force the disabling of IBRS when X86_FEATURE_KERNEL_IBRS is on and + * CPUIDLE_FLAG_IRQ_ENABLE isn't set. + */ +module_param(ibrs_off, bool, 0444); +MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle"); -- cgit v1.2.3 From bc87127a45928de5fdf0ec39d7a86e1edd0e179e Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 20 Jul 2023 16:05:16 +0800 Subject: sched/debug: Print 'tgid' in sched_show_task() Multiple blocked tasks are printed when the system hangs. They may have the same parent pid, but belong to different task groups. Printing tgid lets users better know whether these tasks are from the same task group or not. 
Signed-off-by: Yajun Deng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230720080516.1515297-1-yajun.deng@linux.dev --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5783cb16791..cf6d3fdd4eb5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9089,9 +9089,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", - free, task_pid_nr(p), ppid, - read_task_thread_flags(p)); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); -- cgit v1.2.3 From 7ef7145a2b26b172ac6885c4cf3272a38bc0979a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Oct 2023 12:25:16 +0200 Subject: sched/nohz: Update idle load-balancing (ILB) comments - Fix incorrect/misleading comments, - clarify some others, - fix typos & grammar, - and use more consistent style throughout. 
Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20231006102518.2452758-2-mingo@kernel.org --- kernel/sched/fair.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52c498fd6c46..2b63a14cd05e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11503,14 +11503,15 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set + * + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set * anywhere yet. */ - static inline int find_new_ilb(void) { int ilb; @@ -11531,8 +11532,10 @@ static inline int find_new_ilb(void) } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). + * + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -11560,7 +11563,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. 
*/ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -11589,7 +11592,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -11611,9 +11614,8 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; @@ -11665,11 +11667,11 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); -- cgit v1.2.3 From b6dd6984832a2868f78879fce30d6965ae899d02 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Oct 2023 12:25:17 +0200 Subject: sched/nohz: Use consistent variable names in find_new_ilb() and kick_ilb() Use 'ilb_cpu' consistently in both functions. 
Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20231006102518.2452758-3-mingo@kernel.org --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b63a14cd05e..f82b301740ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11514,18 +11514,18 @@ static inline int on_null_domain(struct rq *rq) */ static inline int find_new_ilb(void) { - int ilb; const struct cpumask *hk_mask; + int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - if (ilb == smp_processor_id()) + if (ilb_cpu == smp_processor_id()) continue; - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } return nr_cpu_ids; -- cgit v1.2.3 From f4bb5705114530cd775a5a649b666755b3efe7aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Oct 2023 12:25:18 +0200 Subject: sched/nohz: Remove unnecessarily complex error handling pattern from find_new_ilb() find_new_ilb() returns nr_cpu_ids on failure - which is the usual cpumask bitops return pattern, but is weird & unnecessary in this context: not only is it a global variable, it is a +1 out of bounds CPU index and also has different signedness ... Its only user, kick_ilb(), then checks the return against nr_cpu_ids to decide to return. There's no other use. So instead of this, use a standard -1 return on failure to find an idle CPU, as the argument is signed already. 
Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20231006102518.2452758-4-mingo@kernel.org --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f82b301740ec..19bb4ac94146 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11528,7 +11528,7 @@ static inline int find_new_ilb(void) return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* @@ -11549,8 +11549,7 @@ static void kick_ilb(unsigned int flags) nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) + if (ilb_cpu < 0) return; /* -- cgit v1.2.3 From 089768dfeb3ab294f9ab6a1f2462001f0f879fbb Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Sun, 8 Oct 2023 10:15:38 +0800 Subject: sched/rt: Change the type of 'sysctl_sched_rt_period' from 'unsigned int' to 'int' Doing this matches the natural type of 'int' based calculus in sched_rt_handler(), and also enables the adding in of a correct upper bounds check on the sysctl interface. [ mingo: Rewrote the changelog. ] Signed-off-by: Yajun Deng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231008021538.3063250-1-yajun.deng@linux.dev --- kernel/sched/rt.c | 6 +++--- kernel/sched/sched.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 88fc98601413..76d82a096e03 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth; * period over which we measure -rt task CPU usage in us. * default: 1s */ -unsigned int sysctl_sched_rt_period = 1000000; +int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. 
@@ -34,7 +34,7 @@ static struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, .extra1 = SYSCTL_ONE, @@ -47,7 +47,7 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rt_handler, .extra1 = SYSCTL_NEG_ONE, - .extra2 = SYSCTL_INT_MAX, + .extra2 = (void *)&sysctl_sched_rt_period, }, { .procname = "sched_rr_timeslice_ms", diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 649eb9ec0657..515eb4cffd5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -105,7 +105,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; -- cgit v1.2.3 From 7bc263840bc3377186cb06b003ac287bb2f18ce2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 9 Oct 2023 12:36:16 +0200 Subject: sched/topology: Consolidate and clean up access to a CPU's max compute capacity Remove the rq::cpu_capacity_orig field and use arch_scale_cpu_capacity() instead. The scheduler uses 3 methods to get access to a CPU's max compute capacity: - arch_scale_cpu_capacity(cpu) which is the default way to get a CPU's capacity. - cpu_capacity_orig field which is periodically updated with arch_scale_cpu_capacity(). - capacity_orig_of(cpu) which encapsulates rq->cpu_capacity_orig. There is no real need to save the value returned by arch_scale_cpu_capacity() in struct rq. arch_scale_cpu_capacity() returns: - either a per_cpu variable. - or a const value for systems which have only one capacity. Remove rq::cpu_capacity_orig and use arch_scale_cpu_capacity() everywhere. No functional changes. 
Some performance tests on Arm64: - small SMP device (hikey): no noticeable changes - HMP device (RB5): hackbench shows minor improvement (1-2%) - large smp (thx2): hackbench and tbench shows minor improvement (1%) Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20231009103621.374412-2-vincent.guittot@linaro.org --- Documentation/scheduler/sched-capacity.rst | 13 +++++++------ kernel/sched/core.c | 2 +- kernel/sched/cpudeadline.c | 2 +- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 18 ++++++++---------- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 6 ------ kernel/sched/topology.c | 7 +++++-- 8 files changed, 25 insertions(+), 29 deletions(-) diff --git a/Documentation/scheduler/sched-capacity.rst b/Documentation/scheduler/sched-capacity.rst index e2c1cf743158..de414b33dd2a 100644 --- a/Documentation/scheduler/sched-capacity.rst +++ b/Documentation/scheduler/sched-capacity.rst @@ -39,14 +39,15 @@ per Hz, leading to:: ------------------- Two different capacity values are used within the scheduler. A CPU's -``capacity_orig`` is its maximum attainable capacity, i.e. its maximum -attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to -which some loss of available performance (e.g. time spent handling IRQs) is -subtracted. +``original capacity`` is its maximum attainable capacity, i.e. its maximum +attainable performance level. This original capacity is returned by +the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original +capacity`` to which some loss of available performance (e.g. time spent +handling IRQs) is subtracted. Note that a CPU's ``capacity`` is solely intended to be used by the CFS class, -while ``capacity_orig`` is class-agnostic. The rest of this document will use -the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of +while ``original capacity`` is class-agnostic. 
The rest of this document will use +the term ``capacity`` interchangeably with ``original capacity`` for the sake of brevity. 1.3 Platform examples diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf6d3fdd4eb5..a3f9cd52eec5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9929,7 +9929,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 57c92d751bcd..95baa12a1029 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); - cap = capacity_orig_of(cpu); + cap = arch_scale_cpu_capacity(cpu); if (cap > max_cap || (cpu == task_cpu(p) && cap == max_cap)) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d98408a274e5..7039a8d5ae9b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) int i; for_each_cpu_and(i, mask, cpu_active_mask) - cap += capacity_orig_of(i); + cap += arch_scale_cpu_capacity(i); return cap; } @@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) static inline unsigned long dl_bw_capacity(int i) { if (!sched_asym_cpucap_active() && - capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19bb4ac94146..e7c1bafc0460 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4669,7 +4669,7 
@@ static inline void util_est_update(struct cfs_rq *cfs_rq, * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4717,14 +4717,14 @@ static inline int util_fits_cpu(unsigned long util, return fits; /* - * We must use capacity_orig_of() for comparing against uclamp_min and + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and * uclamp_max. We only care about capacity pressure (by using * capacity_of()) for comparing against the real util. * * If a task is boosted to 1024 for example, we don't want a tiny * pressure to skew the check whether it fits a CPU or not. * - * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * * Only exception is for thermal pressure since it has a direct impact @@ -4736,7 +4736,7 @@ static inline int util_fits_cpu(unsigned long util, * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. */ - capacity_orig = capacity_orig_of(cpu); + capacity_orig = arch_scale_cpu_capacity(cpu); capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* @@ -7217,7 +7217,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); /* * First, select CPU which fits better (-1 being better than 0). 
@@ -7459,7 +7459,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util = max(util, util_est); } - return min(util, capacity_orig_of(cpu)); + return min(util, arch_scale_cpu_capacity(cpu)); } unsigned long cpu_util_cfs(int cpu) @@ -9250,8 +9250,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; @@ -9327,7 +9325,7 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } /* @@ -9338,7 +9336,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) { return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || check_cpu_capacity(rq, sd)); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 76d82a096e03..e93b69ef919b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -471,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) min_cap = uclamp_eff_value(p, UCLAMP_MIN); max_cap = uclamp_eff_value(p, UCLAMP_MAX); - cpu_cap = capacity_orig_of(cpu); + cpu_cap = arch_scale_cpu_capacity(cpu); return cpu_cap >= min(min_cap, max_cap); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 515eb4cffd5e..7e7fedcfc580 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1033,7 +1033,6 @@ struct rq { struct sched_domain __rcu *sd; unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; struct balance_callback *balance_callback; @@ -2967,11 +2966,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -static inline 
unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a7b50bba7829..1cc595907363 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2488,12 +2488,15 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + unsigned long capacity; + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + capacity = arch_scale_cpu_capacity(i); /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); } -- cgit v1.2.3 From 5b77261c5510f1e6f4d359e97dd3e39ee7259c3d Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 9 Oct 2023 11:30:36 +0530 Subject: sched/topology: Remove the EM_MAX_COMPLEXITY limit The Energy Aware Scheduler (EAS) estimates the energy consumption of placing a task on different CPUs. The goal is to minimize this energy consumption. Estimating the energy of different task placements is increasingly complex with the size of the platform. To avoid having a slow wake-up path, EAS is only enabled if this complexity is low enough. The current complexity limit was set in: b68a4c0dba3b1 ("sched/topology: Disable EAS on inappropriate platforms") ... based on the first implementation of EAS, which was re-computing the power of the whole platform for each task placement scenario, see: 390031e4c309 ("sched/fair: Introduce an energy estimation helper function") ... but the complexity of EAS was reduced in: eb92692b2544d ("sched/fair: Speed-up energy-aware wake-ups") ... 
and find_energy_efficient_cpu() (feec) algorithm was updated in: 3e8c6c9aac42 ("sched/fair: Remove task_util from effective utilization in feec()") find_energy_efficient_cpu() (feec) is now doing: feec() \_ for_each_pd(pd) [0] // get max_spare_cap_cpu and compute_prev_delta \_ for_each_cpu(pd) [1] \_ eenv_pd_busy_time(pd) [2] \_ for_each_cpu(pd) // compute_energy(pd) without the task \_ eenv_pd_max_util(pd, -1) [3.0] \_ for_each_cpu(pd) \_ em_cpu_energy(pd, -1) \_ for_each_ps(pd) // compute_energy(pd) with the task on prev_cpu \_ eenv_pd_max_util(pd, prev_cpu) [3.1] \_ for_each_cpu(pd) \_ em_cpu_energy(pd, prev_cpu) \_ for_each_ps(pd) // compute_energy(pd) with the task on max_spare_cap_cpu \_ eenv_pd_max_util(pd, max_spare_cap_cpu) [3.2] \_ for_each_cpu(pd) \_ em_cpu_energy(pd, max_spare_cap_cpu) \_ for_each_ps(pd) [3.1] happens only once since prev_cpu is unique. With the same definitions for nr_pd, nr_cpus and nr_ps, the complexity is of: nr_pd * (2 * [nr_cpus in pd] + 2 * ([nr_cpus in pd] + [nr_ps in pd])) + ([nr_cpus in pd] + [nr_ps in pd]) [0] * ( [1] + [2] + [3.0] + [3.2] ) + [3.1] = nr_pd * (4 * [nr_cpus in pd] + 2 * [nr_ps in pd]) + [nr_cpus in prev pd] + nr_ps The complexity limit was set to 2048 in: b68a4c0dba3b1 ("sched/topology: Disable EAS on inappropriate platforms") ... to make "EAS usable up to 16 CPUs with per-CPU DVFS and less than 8 performance states each". For the same platform, the complexity would actually be of: 16 * (4 + 2 * 7) + 1 + 7 = 296 Since the EAS complexity was greatly reduced since the limit was introduced, bigger platforms can handle EAS. For instance, a platform with 112 CPUs with 7 performance states each would not reach it: 112 * (4 + 2 * 7) + 1 + 7 = 2024 To reflect this improvement in the underlying EAS code, remove the EAS complexity check. Note that a limit on the number of CPUs still holds against EM_MAX_NUM_CPUS to avoid overflows during the energy estimation. [ mingo: Updates to the changelog. 
] Signed-off-by: Pierre Gondois Signed-off-by: Ingo Molnar Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20231009060037.170765-2-sshegde@linux.vnet.ibm.com --- Documentation/scheduler/sched-energy.rst | 29 +++--------------------- kernel/sched/topology.c | 39 +++----------------------------- 2 files changed, 6 insertions(+), 62 deletions(-) diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst index fc853c8cc346..70e2921ef725 100644 --- a/Documentation/scheduler/sched-energy.rst +++ b/Documentation/scheduler/sched-energy.rst @@ -359,32 +359,9 @@ in milli-Watts or in an 'abstract scale'. 6.3 - Energy Model complexity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The task wake-up path is very latency-sensitive. When the EM of a platform is -too complex (too many CPUs, too many performance domains, too many performance -states, ...), the cost of using it in the wake-up path can become prohibitive. -The energy-aware wake-up algorithm has a complexity of: - - C = Nd * (Nc + Ns) - -with: Nd the number of performance domains; Nc the number of CPUs; and Ns the -total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8). - -A complexity check is performed at the root domain level, when scheduling -domains are built. EAS will not start on a root domain if its C happens to be -higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the -time of writing). - -If you really want to use EAS but the complexity of your platform's Energy -Model is too high to be used with a single root domain, you're left with only -two possible options: - - 1. split your system into separate, smaller, root domains using exclusive - cpusets and enable EAS locally on each of them. This option has the - benefit to work out of the box but the drawback of preventing load - balance between root domains, which can result in an unbalanced system - overall; - 2. 
submit patches to reduce the complexity of the EAS wake-up algorithm, - hence enabling it to cope with larger EMs in reasonable time. +EAS does not impose any complexity limit on the number of PDs/OPPs/CPUs but +restricts the number of CPUs to EM_MAX_NUM_CPUS to prevent overflows during +the energy estimation. 6.4 - Schedutil governor diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1cc595907363..fcda3f066eec 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -348,32 +348,13 @@ static void sched_energy_set(bool has_eas) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. - * 4. the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * 6. frequency invariance support is present; - * - * The complexity of the Energy Model is defined as: - * - * C = nr_pd * (nr_cpus + nr_ps) - * - * with parameters defined as: - * - nr_pd: the number of performance domains - * - nr_cpus: the number of CPUs - * - nr_ps: the sum of the number of performance states of all performance - * domains (for example, on a system with 2 performance domains, - * with 10 performance states each, nr_ps = 2 * 10 = 20). - * - * It is generally not a good idea to use such a model in the wake-up path on - * very complex platforms because of the associated scheduling overheads. The - * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 performance states each, for example. + * 4. schedutil is driving the frequency of all CPUs of the rd; + * 5. 
frequency invariance support is present; */ -#define EM_MAX_COMPLEXITY 2048 - extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); + int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -431,20 +412,6 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; tmp->next = pd; pd = tmp; - - /* - * Count performance domains and performance states for the - * complexity check. - */ - nr_pd++; - nr_ps += em_pd_nr_perf_states(pd->em_pd); - } - - /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { - WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", - cpumask_pr_args(cpu_map)); - goto free; } perf_domain_debug(cpu_map, pd); -- cgit v1.2.3 From e03dc9fa0663bc303383170e961561462ff00c93 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 9 Oct 2023 20:24:28 +0800 Subject: sched/psi: Change update_triggers() to a 'void' function Update_triggers() always returns now + group->rtpoll_min_period, and the return value is only used by psi_rtpoll_work(), so change update_triggers() to a void function, let group->rtpoll_next_update = now + group->rtpoll_min_period directly. This will avoid unnecessary function return value passing & simplifies the function. 
[ mingo: Updated changelog ] Suggested-by: Suren Baghdasaryan Signed-off-by: Yang Yang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/202310092024289721617@zte.com.cn --- kernel/sched/psi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..be853f227e40 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -434,7 +434,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, +static void update_triggers(struct psi_group *group, u64 now, bool *update_total, enum psi_aggregators aggregator) { struct psi_trigger *t; @@ -503,8 +503,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, /* Reset threshold breach flag once event got generated */ t->pending_event = false; } - - return now + group->rtpoll_min_period; } static u64 update_averages(struct psi_group *group, u64 now) @@ -706,7 +704,8 @@ static void psi_rtpoll_work(struct psi_group *group) } if (now >= group->rtpoll_next_update) { - group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); + update_triggers(group, now, &update_total, PSI_POLL); + group->rtpoll_next_update = now + group->rtpoll_min_period; if (update_total) memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); -- cgit v1.2.3 From 8f833c82cdab7b4049bcfe88311d35fa5f24e422 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 9 Oct 2023 11:30:37 +0530 Subject: sched/topology: Change behaviour of the 'sched_energy_aware' sysctl, based on the platform The 'sched_energy_aware' sysctl is available for the admin to disable/enable energy aware scheduling(EAS). EAS is enabled only if few conditions are met by the platform. They are, asymmetric CPU capacity, no SMT, schedutil CPUfreq governor, frequency invariant load tracking etc. 
A platform may boot without EAS capability, but could gain such capability at runtime. For example, changing/registering the cpufreq governor to schedutil. At present, even though the platform doesn't support EAS, this sysctl returns 1 and it ends up calling build_perf_domains on write to 1 and NOP when writing to 0. That is confusing and unnecessary. Desired behavior would be to have this sysctl enable/disable EAS on supported platforms. On non-supported platforms, a write to the sysctl would return a not-supported error and a read of the sysctl would return empty. So sched_energy_aware returns empty - EAS is not possible at this moment. This will include EAS capable platforms which have at least one EAS condition false during startup, e.g. not using the schedutil cpufreq governor. sched_energy_aware returns 0 - EAS is supported but disabled by admin. sched_energy_aware returns 1 - EAS is supported and enabled. User can find out the reason why EAS is not possible by checking info messages. sched_is_eas_possible returns true if the platform can do EAS at this moment. Signed-off-by: Shrikanth Hegde Signed-off-by: Ingo Molnar Tested-by: Pierre Gondois Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20231009060037.170765-3-sshegde@linux.vnet.ibm.com --- Documentation/admin-guide/sysctl/kernel.rst | 3 +- kernel/sched/topology.c | 112 ++++++++++++++++++---------- 2 files changed, 76 insertions(+), 39 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index cf33de56da27..d89ac2bd8dc4 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is, platforms with asymmetric CPU topologies and having an Energy Model available). If your platform happens to meet the requirements for EAS but you do not want to use it, change -this value to 0. 
On Non-EAS platforms, write operation fails and +read doesn't return anything. task_delayacct =============== diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index fcda3f066eec..4cbbdacafe9e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,6 +212,70 @@ static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update; +extern struct cpufreq_governor schedutil_gov; +static bool sched_is_eas_possible(const struct cpumask *cpu_mask) +{ + bool any_asym_capacity = false; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + for_each_cpu(i, cpu_mask) { + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { + any_asym_capacity = true; + break; + } + } + if (!any_asym_capacity) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* Do not attempt EAS if schedutil is not being used. 
*/ + for_each_cpu(i, cpu_mask) { + policy = cpufreq_cpu_get(i); + if (!policy) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", + cpumask_pr_args(cpu_mask), i); + } + return false; + } + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + } + + return true; +} + void rebuild_sched_domains_energy(void) { mutex_lock(&sched_energy_mutex); @@ -230,6 +294,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!sched_is_eas_possible(cpu_active_mask)) { + if (write) { + return -EOPNOTSUPP; + } else { + *lenp = 0; + return 0; + } + } + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); @@ -351,61 +424,24 @@ static void sched_energy_set(bool has_eas) * 4. schedutil is driving the frequency of all CPUs of the rd; * 5. frequency invariance support is present; */ -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. 
*/ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { - if (sched_debug()) { - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", - cpumask_pr_args(cpu_map)); - } - goto free; - } - - /* EAS definitely does *not* handle SMT */ - if (sched_smt_active()) { - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", - cpumask_pr_args(cpu_map)); - goto free; - } - - if (!arch_scale_freq_invariant()) { - if (sched_debug()) { - pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", - cpumask_pr_args(cpu_map)); - } + if (!sched_is_eas_possible(cpu_map)) goto free; - } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. */ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) -- cgit v1.2.3 From f2273f4e19e29f7d0be6a2393f18369cd1b496c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Oct 2023 17:31:26 +0200 Subject: sched/topology: Move the declaration of 'schedutil_gov' to kernel/sched/sched.h Move it out of the .c file into the shared scheduler-internal header file, to gain type-checking. 
Signed-off-by: Ingo Molnar Cc: Shrikanth Hegde Cc: Valentin Schneider Link: https://lore.kernel.org/r/20231009060037.170765-3-sshegde@linux.vnet.ibm.com --- kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7e7fedcfc580..faf9031422e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3203,6 +3203,8 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } +extern struct cpufreq_governor schedutil_gov; + #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4cbbdacafe9e..d9508617f7f8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,7 +212,6 @@ static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update; -extern struct cpufreq_governor schedutil_gov; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; -- cgit v1.2.3 From 9ae5c00ea2e600a8b823f9b95606dd244f3096bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:38 +0100 Subject: sched/numa: Document vma_numab_state fields Document the intended usage of the fields. [ mingo: Reformatted to take less vertical space & tidied it up. ] Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-2-mgorman@techsingularity.net --- include/linux/mm_types.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36c5b43999e6..d7f042ec1f33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -551,8 +551,29 @@ struct vma_lock { }; struct vma_numab_state { + /* + * Initialised as time in 'jiffies' after which VMA + * should be scanned. 
Delays first scan of new VMA by at + * least sysctl_numa_balancing_scan_delay: + */ unsigned long next_scan; + + /* + * Time in jiffies when access_pids[] is reset to + * detect phase change behaviour: + */ unsigned long next_pid_reset; + + /* + * Approximate tracking of PIDs that trapped a NUMA hinting + * fault. May produce false positives due to hash collisions. + * + * [0] Previous PID tracking + * [1] Current PID tracking + * + * Window moves after next_pid_reset has expired approximately + * every VMA_PID_RESET_PERIOD jiffies: + */ unsigned long access_pids[2]; }; -- cgit v1.2.3 From f3a6c97940fbd25d6c84c2d5642338fc99a9b35b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:39 +0100 Subject: sched/numa: Rename vma_numab_state::access_pids[] => ::pids_active[], ::next_pid_reset => ::pids_active_reset The access_pids[] field name is somewhat ambiguous as no PIDs are accessed. Similarly, it's not clear that next_pid_reset is related to access_pids[]. Rename the fields to more accurately reflect their purpose. [ mingo: Rename in the comments too. 
] Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-3-mgorman@techsingularity.net --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 6 +++--- kernel/sched/fair.c | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bf5d0b1b16f4..19fc73b02c9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); - if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) { - __set_bit(pid_bit, &vma->numab_state->access_pids[1]); + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { + __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } #else /* !CONFIG_NUMA_BALANCING */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d7f042ec1f33..e7571eca1131 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -559,10 +559,10 @@ struct vma_numab_state { unsigned long next_scan; /* - * Time in jiffies when access_pids[] is reset to + * Time in jiffies when pids_active[] is reset to * detect phase change behaviour: */ - unsigned long next_pid_reset; + unsigned long pids_active_reset; /* * Approximate tracking of PIDs that trapped a NUMA hinting @@ -574,7 +574,7 @@ struct vma_numab_state { * Window moves after next_pid_reset has expired approximately * every VMA_PID_RESET_PERIOD jiffies: */ - unsigned long access_pids[2]; + unsigned long pids_active[2]; }; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c1bafc0460..6b47edcbe834 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3125,7 +3125,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma) if (READ_ONCE(current->mm->numa_scan_seq) < 2) return true; - pids = vma->numab_state->access_pids[0] | 
vma->numab_state->access_pids[1]; + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); } @@ -3241,7 +3241,7 @@ static void task_numa_work(struct callback_head *work) msecs_to_jiffies(sysctl_numa_balancing_scan_delay); /* Reset happens after 4 times scan delay of scan start */ - vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); } @@ -3262,11 +3262,11 @@ static void task_numa_work(struct callback_head *work) * vma for recent access to avoid clearing PID info before access.. */ if (mm->numa_scan_seq && - time_after(jiffies, vma->numab_state->next_pid_reset)) { - vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + msecs_to_jiffies(VMA_PID_RESET_PERIOD); - vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); - vma->numab_state->access_pids[1] = 0; + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; } do { -- cgit v1.2.3 From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:40 +0100 Subject: sched/numa: Trace decisions related to skipping VMAs NUMA balancing skips or scans VMAs for a variety of reasons. In preparation for completing scans of VMAs regardless of PID access, trace the reasons why a VMA was skipped. In a later patch, the tracing will be used to track if a VMA was forcibly scanned. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 8 ++++++ include/trace/events/sched.h | 50 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++--- 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..c127a1509e2f 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,6 +15,14 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +enum numa_vmaskip_reason { + NUMAB_SKIP_UNSUITABLE, + NUMAB_SKIP_SHARED_RO, + NUMAB_SKIP_INACCESSIBLE, + NUMAB_SKIP_SCAN_DELAY, + NUMAB_SKIP_PID_INACTIVE, +}; + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index a13d5d06be9d..d82a04d6a1bc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); +#ifdef CONFIG_NUMA_BALANCING +#define NUMAB_SKIP_REASON \ + EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ + EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ + EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ + EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ + EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + +/* Redefine for export. */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +NUMAB_SKIP_REASON + +/* Redefine for symbolic printing. 
*/ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(sched_skip_vma_numa, + + TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, + enum numa_vmaskip_reason reason), + + TP_ARGS(mm, vma, reason), + + TP_STRUCT__entry( + __field(unsigned long, numa_scan_offset) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(enum numa_vmaskip_reason, reason) + ), + + TP_fast_assign( + __entry->numa_scan_offset = mm->numa_scan_offset; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->reason = reason; + ), + + TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", + __entry->numa_scan_offset, + __entry->vm_start, + __entry->vm_end, + __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) +); +#endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b47edcbe834..31cfdb0794fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. 
*/ if (!vma->numab_state) { @@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + if (!vma_is_accessed(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; + } /* * RESET access PIDs regularly for old VMAs. Resetting after checking -- cgit v1.2.3 From 2e2675db1906ac04809f5399bf1f5e30d56a6f3e Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 10 Oct 2023 09:31:41 +0100 Subject: sched/numa: Move up the access pid reset logic Recent NUMA hinting faulting activity is reset approximately every VMA_PID_RESET_PERIOD milliseconds. However, if the current task has not accessed a VMA then the reset check is missed and the reset is potentially deferred forever. Check if the PID activity information should be reset before checking if the current task recently trapped a NUMA hinting fault. [ mgorman@techsingularity.net: Rewrite changelog ] Suggested-by: Mel Gorman Signed-off-by: Raghavendra K T Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-5-mgorman@techsingularity.net --- kernel/sched/fair.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31cfdb0794fb..ce36969625bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3260,16 +3260,7 @@ static void task_numa_work(struct callback_head *work) continue; } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) { - trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); - continue; - } - - /* - * RESET access PIDs regularly for old VMAs. 
Resetting after checking - * vma for recent access to avoid clearing PID info before access.. - */ + /* RESET access PIDs regularly for old VMAs. */ if (mm->numa_scan_seq && time_after(jiffies, vma->numab_state->pids_active_reset)) { vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + @@ -3278,6 +3269,12 @@ static void task_numa_work(struct callback_head *work) vma->numab_state->pids_active[1] = 0; } + /* Do not scan the VMA if task has not accessed */ + if (!vma_is_accessed(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; + } + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); -- cgit v1.2.3 From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:42 +0100 Subject: sched/numa: Complete scanning of partial VMAs regardless of PID activity NUMA Balancing skips VMAs when the current task has not trapped a NUMA fault within the VMA. If the VMA is skipped then mm->numa_scan_offset advances and a task that is trapping faults within the VMA may never fully update PTEs within the VMA. Force tasks to update PTEs for partially scanned PTEs. The VMA will be tagged for NUMA hints by some task but this removes some of the benefit of tracking PID activity within a VMA. A follow-on patch will mitigate this problem. The test cases and machines evaluated did not trigger the corner case so the performance results are neutral with only small changes within the noise from normal test-to-test variance. However, the next patch makes the corner case easier to trigger. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 ++- kernel/sched/fair.c | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index c127a1509e2f..7dcc0bdfddbb 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -21,6 +21,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_INACCESSIBLE, NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, + NUMAB_SKIP_IGNORE_PID, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d82a04d6a1bc..bfc07c10541a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ - EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ + EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) /* Redefine for export. 
*/ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce36969625bd..ab79013f6e91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long pids; /* @@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work) } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) { + if (!vma_is_accessed(mm, vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } -- cgit v1.2.3 From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:43 +0100 Subject: sched/numa: Complete scanning of inactive VMAs when there is no alternative VMAs are skipped if there is no recent fault activity but this represents a chicken-and-egg problem as there may be no fault activity if the PTEs are never updated to trap NUMA hints. There is an indirect reliance on scanning to be forced early in the lifetime of a task but this may fail to detect changes in phase behaviour. 
Force inactive VMAs to be scanned when all other eligible VMAs have been updated within the same scan sequence. Test results in general look good with some changes in performance, both negative and positive, depending on whether the additional scanning and faulting was beneficial or not to the workload. The autonuma benchmark workload NUMA01_THREADLOCAL was picked for closer examination. The workload creates two processes with numerous threads and thread-local storage that is zero-filled in a loop. It exercises the corner case where unrelated threads may skip VMAs that are thread-local to another thread and still has some VMAs that are inactive while the workload executes. The VMA skipping activity frequency with and without the patch: 6.6.0-rc2-sched-numabtrace-v1 ============================= 649 reason=scan_delay 9,094 reason=unsuitable 48,915 reason=shared_ro 143,919 reason=inaccessible 193,050 reason=pid_inactive 6.6.0-rc2-sched-numabselective-v1 ============================= 146 reason=seq_completed 622 reason=ignore_pid_inactive 624 reason=scan_delay 6,570 reason=unsuitable 16,101 reason=shared_ro 27,608 reason=inaccessible 41,939 reason=pid_inactive Note that with the patch applied, the PID activity is ignored (ignore_pid_inactive) to ensure a VMA with some activity is completely scanned. In addition, a small number of VMAs are scanned when no other eligible VMA is available during a single scan window (seq_completed). The number of times a VMA is skipped due to no PID activity from the scanning task (pid_inactive) drops dramatically. It is expected that this will increase the number of PTEs updated for NUMA hinting faults as well as hinting faults but these represent PTEs that would otherwise have been missed. The tradeoff is scan+fault overhead versus improving locality due to migration. 
On a 2-socket Cascade Lake test machine, the time to complete the workload is as follows; 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 Min elsp-NUMA01_THREADLOCAL 174.22 ( 0.00%) 117.64 ( 32.48%) Amean elsp-NUMA01_THREADLOCAL 175.68 ( 0.00%) 123.34 * 29.79%* Stddev elsp-NUMA01_THREADLOCAL 1.20 ( 0.00%) 4.06 (-238.20%) CoeffVar elsp-NUMA01_THREADLOCAL 0.68 ( 0.00%) 3.29 (-381.70%) Max elsp-NUMA01_THREADLOCAL 177.18 ( 0.00%) 128.03 ( 27.74%) The time to complete the workload is reduced by almost 30%: 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 / Duration User 91201.80 63506.64 Duration System 2015.53 1819.78 Duration Elapsed 1234.77 868.37 In this specific case, system CPU time was not increased but it's not universally true. From vmstat, the NUMA scanning and fault activity is as follows; 6.6.0-rc2 6.6.0-rc2 sched-numabtrace-v1 sched-numabselective-v1 Ops NUMA base-page range updates 64272.00 26374386.00 Ops NUMA PTE updates 36624.00 55538.00 Ops NUMA PMD updates 54.00 51404.00 Ops NUMA hint faults 15504.00 75786.00 Ops NUMA hint local faults % 14860.00 56763.00 Ops NUMA hint local percent 95.85 74.90 Ops NUMA pages migrated 1629.00 6469222.00 Both the number of PTE updates and hint faults is dramatically increased. While this is superficially unfortunate, it represents ranges that were simply skipped without the patch. As a result of the scanning and hinting faults, many more pages were also migrated but as the time to completion is reduced, the overhead is offset by the gain. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net --- include/linux/mm_types.h | 6 ++++ include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 +- kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++-- 4 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e7571eca1131..589f31ef2e84 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -575,6 +575,12 @@ struct vma_numab_state { * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; + + /* + * MM scan sequence ID when the VMA was last completely scanned. + * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq + */ + int prev_scan_seq; }; /* diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 7dcc0bdfddbb..b69afb8630db 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -22,6 +22,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, NUMAB_SKIP_IGNORE_PID, + NUMAB_SKIP_SEQ_COMPLETED, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index bfc07c10541a..6188ad0d9e0d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ - EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) + EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ + EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. 
*/ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab79013f6e91..922905194c0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work) /* Reset happens after 4 times scan delay of scan start */ vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work) vma->numab_state->pids_active[1] = 0; } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(mm, vma)) { + /* Do not rescan VMAs twice within the same sequence. 
*/ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); + continue; + } + + /* + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. + */ + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } @@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few -- cgit v1.2.3 From b19fdb16fb2167c6bc9ee8fbc0c1d2d4fd3e2eb8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2023 16:57:44 +0100 Subject: sched/headers: Remove comment referring to rq::cpu_load, since this has been removed There is a comment that refers to cpu_load, however, this cpu_load was removed with: 55627e3cd22c ("sched/core: Remove rq->cpu_load[]") ... back in 2019. The comment does not make sense with respect to this removed array, so remove the comment. 
Signed-off-by: Colin Ian King Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010155744.1381065-1-colin.i.king@gmail.com --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index faf9031422e1..65cad0e5729e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -948,10 +948,6 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; -- cgit v1.2.3 From 80cc1d1d5ee35701daf11725ce06d8a240588973 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2023 16:41:07 +0800 Subject: sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes When psimon wakes up and there are no state changes for ->rtpoll_states, it's unnecessary to update triggers and ->rtpoll_total because the pressures being monitored by the user have not changed. This will help to slightly reduce unnecessary computations of PSI. 
[ mingo: Changelog updates ] Signed-off-by: Yang Yang Signed-off-by: Ingo Molnar Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Peter Ziljstra Link: https://lore.kernel.org/r/202310101641075436843@zte.com.cn --- kernel/sched/psi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index be853f227e40..79f8db0c6150 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -704,11 +704,12 @@ static void psi_rtpoll_work(struct psi_group *group) } if (now >= group->rtpoll_next_update) { - update_triggers(group, now, &update_total, PSI_POLL); - group->rtpoll_next_update = now + group->rtpoll_min_period; - if (update_total) + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, &update_total, PSI_POLL); memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; } psi_schedule_rtpoll_work(group, -- cgit v1.2.3 From 3657680f38cd7df413d665f2b2f38e9a78130d8b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2023 16:45:43 +0800 Subject: sched/psi: Delete the 'update_total' function parameter from update_triggers() The 'update_total' parameter of update_triggers() is always true after the previous commit: 80cc1d1d5ee3 ("sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes") If the 'changed_states & group->rtpoll_states' condition is true, 'new_stall' in update_triggers() will be true, and then 'update_total' should also be true. So update_total is redundant - remove it. 
[ mingo: Changelog updates ] Signed-off-by: Yang Yang Signed-off-by: Ingo Molnar Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Peter Ziljstra Link: https://lore.kernel.org/r/202310101645437859599@zte.com.cn --- kernel/sched/psi.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 79f8db0c6150..44a78774ae87 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static void update_triggers(struct psi_group *group, u64 now, bool *update_total, +static void update_triggers(struct psi_group *group, u64 now, enum psi_aggregators aggregator) { struct psi_trigger *t; u64 *total = group->total[aggregator]; struct list_head *triggers; u64 *aggregator_total; - *update_total = false; if (aggregator == PSI_AVGS) { triggers = &group->avg_triggers; @@ -471,14 +470,6 @@ static void update_triggers(struct psi_group *group, u64 now, bool *update_total * events without dropping any). */ if (new_stall) { - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - *update_total = true; - /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); if (!t->pending_event) { @@ -563,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool update_total; u64 now; dwork = to_delayed_work(work); @@ -582,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work) * go - see calc_avgs() and missed_periods. 
*/ if (now >= group->avg_next_update) { - update_triggers(group, now, &update_total, PSI_AVGS); + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); } @@ -638,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group) { bool force_reschedule = false; u32 changed_states; - bool update_total; u64 now; mutex_lock(&group->rtpoll_trigger_lock); @@ -705,7 +694,7 @@ static void psi_rtpoll_work(struct psi_group *group) if (now >= group->rtpoll_next_update) { if (changed_states & group->rtpoll_states) { - update_triggers(group, now, &update_total, PSI_POLL); + update_triggers(group, now, PSI_POLL); memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); } -- cgit v1.2.3 From f577cd57bfaa889cf0718e30e92c08c7f78c9d85 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Jul 2023 16:10:56 +0200 Subject: sched/topology: Rename 'DIE' domain to 'PKG' While reworking the x86 topology code Thomas tripped over creating a 'DIE' domain for the package mask. :-) Since these names are CONFIG_SCHED_DEBUG=y only, rename them to make the name less ambiguous. [ Shrikanth Hegde: rename on s390 as well. ] [ Valentin Schneider: also rename it in the comments. ] [ mingo: port to recent kernels & find all remaining occurances. ] Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Valentin Schneider Acked-by: Mel Gorman Acked-by: Heiko Carstens Acked-by: Gautham R. 
Shenoy Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/20230712141056.GI3100107@hirez.programming.kicks-ass.net --- arch/powerpc/kernel/smp.c | 4 ++-- arch/s390/kernel/topology.c | 2 +- arch/x86/kernel/smpboot.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/topology.c | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5826f5108a12..4e4870031265 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1051,7 +1051,7 @@ static struct sched_domain_topology_level powerpc_topology[] = { #endif { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, { cpu_mc_mask, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -1595,7 +1595,7 @@ static void add_cpu_to_masks(int cpu) /* Skip all CPUs already part of current CPU core mask */ cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); - /* If chip_id is -1; limit the cpu_core_mask to within DIE*/ + /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) cpumask_and(mask, mask, cpu_cpu_mask(cpu)); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 68adf1de8888..66bda6a8f918 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02765d9da682..e3b3e806bf61 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -641,13 +641,13 @@ static void __init build_sched_topology(void) }; #endif /* - * When there is NUMA topology inside the package skip the DIE domain + * 
When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) }; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 922905194c0c..a751e552f253 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9555,7 +9555,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * can only do it if @group is an SMT group and has exactly on busy CPU. Larger * imbalances in the number of CPUS are dealt with in find_busiest_group(). * - * If we are balancing load within an SMT core, or at DIE domain level, always + * If we are balancing load within an SMT core, or at PKG domain level, always * proceed. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d9508617f7f8..a63729f87c21 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1119,7 +1119,7 @@ fail: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) - * - Package (DIE) + * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node. 
* @@ -1141,13 +1141,13 @@ fail: * * CPU 0 1 2 3 4 5 6 7 * - * DIE [ ] + * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * - * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * @@ -1681,7 +1681,7 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; -- cgit v1.2.3 From 0c2924079f5a83ed715630680e338b3685a0bf7d Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Tue, 26 Sep 2023 11:57:22 +0000 Subject: sched/psi: Bail out early from irq time accounting We could bail out early when psi was disabled. Signed-off-by: Haifeng Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Link: https://lore.kernel.org/r/20230926115722.467833-1-haifeng.xu@shopee.com --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 44a78774ae87..519bc922a960 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -998,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; -- cgit v1.2.3 From f0498d2a54e7966ce23cd7c7ff42c64fa0059b07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Oct 2023 20:57:39 +0200 Subject: sched: Fix stop_one_cpu_nowait() vs hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kuyo reported sporadic failures on a sched_setaffinity() vs CPU hotplug stress-test -- notably affine_move_task() remains stuck in wait_for_completion(), leading to a hung-task detector warning. 
Specifically, it was reported that stop_one_cpu_nowait(.fn = migration_cpu_stop) returns false -- this stopper is responsible for the matching complete(). The race scenario is: CPU0 CPU1 // doing _cpu_down() __set_cpus_allowed_ptr() task_rq_lock(); takedown_cpu() stop_machine_cpuslocked(take_cpu_down..) ack_state() MULTI_STOP_RUN take_cpu_down() __cpu_disable(); stop_machine_park(); stopper->enabled = false; /> /> stop_one_cpu_nowait(.fn = migration_cpu_stop); if (stopper->enabled) // false!!! That is, by doing stop_one_cpu_nowait() after dropping rq-lock, the stopper thread gets a chance to preempt and allows the cpu-down for the target CPU to complete. OTOH, since stop_one_cpu_nowait() / cpu_stop_queue_work() needs to issue a wakeup, it must not be run under the scheduler locks. Solve this apparent contradiction by keeping preemption disabled over the unlock + queue_stopper combination: preempt_disable(); task_rq_unlock(...); if (!stop_pending) stop_one_cpu_nowait(...) preempt_enable(); This respects the lock ordering constraints while still avoiding the above race. That is, if we find the CPU is online under rq-lock, the targeted stop_one_cpu_nowait() must succeed. Apply this pattern to all similar stop_one_cpu_nowait() invocations. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Reported-by: "Kuyo Chang (張建文)" Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Kuyo Chang (張建文)" Link: https://lkml.kernel.org/r/20231010200442.GA16515@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 10 ++++++++-- kernel/sched/deadline.c | 2 ++ kernel/sched/fair.c | 4 +++- kernel/sched/rt.c | 4 ++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a3f9cd52eec5..264c2eb380d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2645,9 +2645,11 @@ static int migration_cpu_stop(void *data) * it.
*/ WARN_ON_ONCE(!pending->stop_pending); + preempt_disable(); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); + preempt_enable(); return 0; } out: @@ -2967,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag complete = true; } + preempt_disable(); task_rq_unlock(rq, p, rf); - if (push_task) { stop_one_cpu_nowait(rq->cpu, push_cpu_stop, p, &rq->push_work); } + preempt_enable(); if (complete) complete_all(&pending->done); @@ -3038,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; + preempt_disable(); task_rq_unlock(rq, p, rf); - if (!stop_pending) { stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); } + preempt_enable(); if (flags & SCA_MIGRATE_ENABLE) return 0; @@ -9421,9 +9425,11 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); + preempt_enable(); /* * At this point need_resched() is true and we'll take the loop in * schedule(). 
The next pick is obviously going to be the stop task diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7039a8d5ae9b..b28114478b82 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2420,9 +2420,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a751e552f253..38d757c35004 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11254,13 +11254,15 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_rq_unlock_irqrestore(busiest, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } + preempt_enable(); } } else { sd->nr_balance_failed = 0; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e93b69ef919b..6aaf0a3d6081 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2063,9 +2063,11 @@ retry: */ push_task = get_push_task(rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, push_cpu_stop, push_task, &rq->push_work); + preempt_enable(); raw_spin_rq_lock(rq); } @@ -2402,9 +2404,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } -- cgit v1.2.3 From 1b8a955dd338dfbf39831d4687c25263e885a9cb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 12 Oct 2023 08:58:24 -0400 Subject: sched: Make PELT acronym definition searchable The PELT acronym definition can be found right at the top of kernel/sched/pelt.c (of course), but it cannot be found 
through use of grep -r PELT kernel/sched/ Add the acronym "(PELT)" after "Per Entity Load Tracking" at the top of the source file. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231012125824.1260774-1-mathieu.desnoyers@efficios.com --- kernel/sched/pelt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 0f310768260c..63b6cf898220 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Per Entity Load Tracking + * Per Entity Load Tracking (PELT) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar * -- cgit v1.2.3 From 7b3d8df549390e797f883efa16224fa0dfe35e55 Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Mon, 16 Oct 2023 19:20:39 +0800 Subject: sched/psi: Update poll => rtpoll in relevant comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PSI trigger code is now making a distinction between privileged and unprivileged triggers, after the following commit: 65457b74aa94 ("sched/psi: Rename existing poll members in preparation") But some comments have not been modified along with the code, so they need to be updated. This will help readers better understand the code. Signed-off-by: Fan Yu Signed-off-by: Ingo Molnar Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Peter Ziljstra Link: https://lore.kernel.org/r/202310161920399921184@zte.com.cn --- kernel/sched/psi.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 519bc922a960..7b4aa5809c0f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -596,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now) group->rtpoll_next_update = now + group->rtpoll_min_period; } -/* Schedule polling if it's not already scheduled or forced. 
*/ +/* Schedule rtpolling if it's not already scheduled or forced. */ static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, bool force) { @@ -636,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group) if (now > group->rtpoll_until) { /* - * We are either about to start or might stop polling if no - * state change was recorded. Resetting poll_scheduled leaves + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves * a small window for psi_group_change to sneak in and schedule - * an immediate poll_work before we get to rescheduling. One - * potential extra wakeup at the end of the polling window - * should be negligible and polling_next_update still keeps + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps * updates correctly on schedule. */ atomic_set(&group->rtpoll_scheduled, 0); /* - * A task change can race with the poll worker that is supposed to + * A task change can race with the rtpoll worker that is supposed to * report on it. To avoid missing events, ensure ordering between - * poll_scheduled and the task state accesses, such that if the poll - * worker misses the state update, the task change is guaranteed to - * reschedule the poll worker: + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: * - * poll worker: - * atomic_set(poll_scheduled, 0) + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) * smp_mb() * LOAD states * * task change: * STORE states - * if atomic_xchg(poll_scheduled, 1) == 0: - * schedule poll worker + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker * * The atomic_xchg() implies a full barrier. 
*/ smp_mb(); } else { - /* Polling window is not over, keep rescheduling */ + /* The rtpolling window is not over, keep rescheduling */ force_reschedule = true; } @@ -674,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group) collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->rtpoll_states) { - /* Initialize trigger windows when entering polling mode */ + /* Initialize trigger windows when entering rtpolling mode */ if (now > group->rtpoll_until) init_rtpoll_triggers(group, now); -- cgit v1.2.3 From 1b7ef2d94ff4cb0b1186a224a97349864820c606 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 18 Oct 2023 14:27:59 +0800 Subject: sched/fair: Remove duplicate #include ./kernel/sched/fair.c: linux/sched/cond_resched.h is included more than once. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231018062759.44375-1-jiapeng.chong@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6907 --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 38d757c35004..9ae2208089e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,8 +51,6 @@ #include -#include - #include "sched.h" #include "stats.h" #include "autogroup.h" -- cgit v1.2.3 From fb064e5ae1657595c090ebbc5b15787a3ef603e9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 20 Oct 2023 01:40:27 +0000 Subject: sched/nohz: Update comments about NEWILB_KICK How ILB is triggered without IPIs is cryptic. Out of mercy for future code readers, document it in code comments. The comments are derived from a discussion with Vincent in a past review. 
Signed-off-by: Joel Fernandes (Google) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020014031.919742-2-joel@joelfernandes.org --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ae2208089e4..8c486ffcb779 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12005,8 +12005,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } /* - * Check if we need to run the ILB for updating blocked load before entering - * idle state. + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / irq off phase of the local + * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) { -- cgit v1.2.3 From 4c456c9ad334a940e354da1002184bc19f4493ef Mon Sep 17 00:00:00 2001 From: Yiwei Lin Date: Fri, 20 Oct 2023 13:56:17 +0800 Subject: sched/fair: Remove unused 'curr' argument from pick_next_entity() The 'curr' argument of pick_next_entity() has become unused after the EEVDF changes. [ mingo: Updated the changelog. 
] Signed-off-by: Yiwei Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c486ffcb779..4b70b0d14698 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5256,7 +5256,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. @@ -8160,7 +8160,7 @@ again: goto again; } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8223,7 +8223,7 @@ again: } } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8262,7 +8262,7 @@ simple: put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq, NULL); + se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From 5ebde09d91707a4a9bec1e3d213e3c12ffde348f Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Thu, 12 Oct 2023 17:00:03 +0800 Subject: sched/core: Fix RQCF_ACT_SKIP leak Igor Raits and Bagas Sanjaya report a RQCF_ACT_SKIP leak warning. 
This warning may be triggered in the following situations: CPU0 CPU1 __schedule() *rq->clock_update_flags <<= 1;* unregister_fair_sched_group() pick_next_task_fair+0x4a/0x410 destroy_cfs_bandwidth() newidle_balance+0x115/0x3e0 for_each_possible_cpu(i) *i=0* rq_unpin_lock(this_rq, rf) __cfsb_csd_unthrottle() raw_spin_rq_unlock(this_rq) rq_lock(*CPU0_rq*, &rf) rq_clock_start_loop_update() rq->clock_update_flags & RQCF_ACT_SKIP <-- raw_spin_rq_lock(this_rq) The purpose of RQCF_ACT_SKIP is to skip the rq clock update. That update happens very early in __schedule(), but we clear RQCF_*_SKIP very late, so the flag spans the gap above and triggers this warning. In __schedule() we can clear the RQCF_*_SKIP flag immediately after update_rq_clock() to avoid this RQCF_ACT_SKIP leak warning. Also set rq->clock_update_flags to RQCF_UPDATED to avoid the rq->clock_update_flags < RQCF_ACT_SKIP warning that may be triggered later. Fixes: ebb83d84e49b ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()") Closes: https://lore.kernel.org/all/20230913082424.73252-1-jiahao.os@bytedance.com Reported-by: Igor Raits Reported-by: Bagas Sanjaya Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/a5dd536d-041a-2ce9-f4b7-64d8d85c86dc@gmail.com --- kernel/sched/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 264c2eb380d7..dc724f59e495 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5361,8 +5361,6 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. */ switch_mm_cid(rq, prev, next); - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack.
*/ @@ -6600,6 +6598,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); + rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; @@ -6679,8 +6678,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); -- cgit v1.2.3 From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 19 Oct 2023 11:33:21 +0800 Subject: sched: Add cpus_share_resources API Add cpus_share_resources() API. This is preparation for the optimization of select_idle_cpu() on platforms with a cluster scheduler level. On a machine with clusters cpus_share_resources() will test whether two cpus are within the same cluster. On a non-cluster machine it will behave the same as cpus_share_cache(). So we use "resources" here for cache resources. Signed-off-by: Barry Song Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R.
Shenoy Reviewed-by: Tim Chen Reviewed-by: Vincent Guittot Tested-and-reviewed-by: Chen Yu Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com --- include/linux/sched/sd_flags.h | 7 +++++++ include/linux/sched/topology.h | 8 +++++++- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index fad77b5172e2..a8b28647aafc 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. 
caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 67b573d5bf28..4c14fe127223 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_resources(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc724f59e495..5e1fb8a63b2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. 
+ */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65cad0e5729e..998f03d02de0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a63729f87c21..dbb8c328e8ad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. 
+ */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1550,6 +1562,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) -- cgit v1.2.3 From 8881e1639f1f899b64e9bccf6cc14d51c1d3c822 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 19 Oct 2023 11:33:22 +0800 Subject: sched/fair: Scan cluster before scanning LLC in wake-up path For platforms having clusters like Kunpeng920, CPUs within the same cluster have lower latency when synchronizing and accessing shared resources like cache. Thus, this patch tries to find an idle CPU within the cluster of the target CPU before scanning the whole LLC to gain lower latency. This will be implemented in 2 steps in select_idle_sibling(): 1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them if they share a cluster with the target CPU. Otherwise try to scan for an idle CPU in the target's cluster. 2. Scan the cluster prior to the LLC of the target CPU for an idle CPU to wake up. Testing has been done on Kunpeng920 by pinning tasks to one NUMA node and to two NUMA nodes. On Kunpeng920, each NUMA node has 8 clusters and each cluster has 4 CPUs. 
With this patch, We noticed enhancement on tbench and netperf within one numa or cross two numa on top of tip-sched-core commit 9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()") tbench results (node 0): baseline patched 1: 327.2833 372.4623 ( 13.80%) 4: 1320.5933 1479.8833 ( 12.06%) 8: 2638.4867 2921.5267 ( 10.73%) 16: 5282.7133 5891.5633 ( 11.53%) 32: 9810.6733 9877.3400 ( 0.68%) 64: 7408.9367 7447.9900 ( 0.53%) 128: 6203.2600 6191.6500 ( -0.19%) tbench results (node 0-1): baseline patched 1: 332.0433 372.7223 ( 12.25%) 4: 1325.4667 1477.6733 ( 11.48%) 8: 2622.9433 2897.9967 ( 10.49%) 16: 5218.6100 5878.2967 ( 12.64%) 32: 10211.7000 11494.4000 ( 12.56%) 64: 13313.7333 16740.0333 ( 25.74%) 128: 13959.1000 14533.9000 ( 4.12%) netperf results TCP_RR (node 0): baseline patched 1: 76546.5033 90649.9867 ( 18.42%) 4: 77292.4450 90932.7175 ( 17.65%) 8: 77367.7254 90882.3467 ( 17.47%) 16: 78519.9048 90938.8344 ( 15.82%) 32: 72169.5035 72851.6730 ( 0.95%) 64: 25911.2457 25882.2315 ( -0.11%) 128: 10752.6572 10768.6038 ( 0.15%) netperf results TCP_RR (node 0-1): baseline patched 1: 76857.6667 90892.2767 ( 18.26%) 4: 78236.6475 90767.3017 ( 16.02%) 8: 77929.6096 90684.1633 ( 16.37%) 16: 77438.5873 90502.5787 ( 16.87%) 32: 74205.6635 88301.5612 ( 19.00%) 64: 69827.8535 71787.6706 ( 2.81%) 128: 25281.4366 25771.3023 ( 1.94%) netperf results UDP_RR (node 0): baseline patched 1: 96869.8400 110800.8467 ( 14.38%) 4: 97744.9750 109680.5425 ( 12.21%) 8: 98783.9863 110409.9637 ( 11.77%) 16: 99575.0235 110636.2435 ( 11.11%) 32: 95044.7250 97622.8887 ( 2.71%) 64: 32925.2146 32644.4991 ( -0.85%) 128: 12859.2343 12824.0051 ( -0.27%) netperf results UDP_RR (node 0-1): baseline patched 1: 97202.4733 110190.1200 ( 13.36%) 4: 95954.0558 106245.7258 ( 10.73%) 8: 96277.1958 105206.5304 ( 9.27%) 16: 97692.7810 107927.2125 ( 10.48%) 32: 79999.6702 103550.2999 ( 29.44%) 64: 80592.7413 87284.0856 ( 8.30%) 128: 27701.5770 29914.5820 ( 7.99%) Note neither Kunpeng920 nor x86 Jacobsville 
supports SMT, so the SMT branch in the code has not been tested but it supposed to work. Chen Yu also noticed this will improve the performance of tbench and netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one cluster sharing L2 Cache. [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] Suggested-by: Peter Zijlstra Signed-off-by: Barry Song Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tim Chen Reviewed-by: Chen Yu Reviewed-by: Gautham R. Shenoy Reviewed-by: Vincent Guittot Tested-and-reviewed-by: Chen Yu Tested-by: Yicong Yang Link: https://lkml.kernel.org/r/20231019033323.54147-3-yangyicong@huawei.com --- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 12 ++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa2091a08034..c47b38eded19 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = 
__select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7395,8 +7419,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7423,7 +7451,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 998f03d02de0..ef4fe7bcf740 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dbb8c328e8ad..10d1391e7416 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, 
sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); -- cgit v1.2.3 From 22165f61d0c4092adf40f967c899e5d8b8a0d703 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 19 Oct 2023 11:33:23 +0800 Subject: sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup Chen Yu reports a hackbench regression of cluster wakeup when hackbench threads equal to the CPU number [1]. Analysis shows it's because we wake up more on the target CPU even if the prev_cpu is a good wakeup candidate and leads to the decrease of the CPU utilization. Generally if the task's prev_cpu is idle we'll wake up the task on it without scanning. 
On cluster machines we'll try to wake up the task in the same cluster of the target for better cache affinity, so if the prev_cpu is idle but not sharing the same cluster with the target we'll still try to find an idle CPU within the cluster. This will improve the performance at low loads on cluster machines. But in the issue above, if the prev_cpu is idle but not in the cluster with the target CPU, we'll try to scan for an idle one in the cluster. But since the system is busy, we're likely to fail the scan and use the target instead, even if the prev_cpu is idle. This leads to the regression. This patch solves this in 2 steps: o record the prev_cpu/recent_used_cpu if they're good wakeup candidates but not sharing the cluster with the target. o on scanning failure use the prev_cpu/recent_used_cpu if they're recorded as idle [1] https://lore.kernel.org/all/ZGzDLuVaHR1PAYDt@chenyu5-mobl1/ Closes: https://lore.kernel.org/all/ZGsLy83wPIpamy6x@chenyu5-mobl1/ Reported-by: Chen Yu Signed-off-by: Yicong Yang Tested-and-reviewed-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231019033323.54147-4-yangyicong@huawei.com --- kernel/sched/fair.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c47b38eded19..523b5aee2d6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7392,7 +7392,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7424,6 +7424,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!static_branch_unlikely(&sched_cluster_active) || cpus_share_resources(prev, target)) return prev; + + prev_aff = prev; } /* @@ 
-7456,6 +7458,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpus_share_resources(recent_used_cpu, target)) return recent_used_cpu; + } else { + recent_used_cpu = -1; } /* @@ -7496,6 +7500,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } -- cgit v1.2.3 From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Oct 2023 12:35:33 +0200 Subject: sched/fair: Remove SIS_PROP SIS_UTIL seems to work well, let's remove the old thing. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 2 -- kernel/sched/core.c | 5 ----- kernel/sched/fair.c | 48 ------------------------------------------ kernel/sched/features.h | 1 - kernel/sched/sched.h | 3 --- 5 files changed, 59 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4c14fe127223..de545ba85218 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -109,8 +109,6 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; - u64 avg_scan_cost; /* select_idle_sibling */ - #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e1fb8a63b2e..7a0c16115b79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -9953,8 +9950,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 523b5aee2d6a..8767988242ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 
time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. - */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. - */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..a3ddf84de430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef4fe7bcf740..2e5a95486a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1059,9 +1059,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -- cgit v1.2.3