24 files changed, 361 insertions(+), 188 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 30cc8f683dc6..74cf4b8964a7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14741,7 +14741,7 @@ F:	include/uapi/linux/nitro_enclaves.h
 F:	samples/nitro_enclaves/
 
 NOHZ, DYNTICKS SUPPORT
-M:	Frederic Weisbecker <fweisbec@gmail.com>
+M:	Frederic Weisbecker <frederic@kernel.org>
 M:	Thomas Gleixner <tglx@linutronix.de>
 M:	Ingo Molnar <mingo@kernel.org>
 L:	linux-kernel@vger.kernel.org
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index a7ec06ce3785..515ca33b854c 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 hostprogs := vdsomunge
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index beaf9586338f..fe7a53c6781f 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -6,9 +6,7 @@
 # Heavily based on the vDSO Makefiles for other archs.
 #
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso := vgettimeofday.o note.o sigreturn.o
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index f59bd1a4ead6..d014162c5c71 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -3,9 +3,6 @@
 # Makefile for vdso32
 #
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
 include $(srctree)/lib/vdso/Makefile
 
 # Same as cc-*option, but using CC_COMPAT instead of CC
diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile
index 0b6909f10667..299e4e41ebc5 100644
--- a/arch/csky/kernel/vdso/Makefile
+++ b/arch/csky/kernel/vdso/Makefile
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_CKCORE_ADDR32|R_CKCORE_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 # Symbols present in the vdso
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d89e2ac75f7b..461240ab4436 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # Objects to go into the VDSO.
 
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_LARCH_32|R_LARCH_64|R_LARCH_MARK_LA|R_LARCH_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 18af9474ed0e..eb56581f6d73 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -4,9 +4,7 @@
 # Sanitizer runtimes are unavailable and cannot be linked here.
 KCSAN_SANITIZE := n
 
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_MIPS_JUMP_SLOT|R_MIPS_GLOB_DAT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso-y := elf.o vgettimeofday.o sigreturn.o
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index 66f723f53be2..4c3f34485f08 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -2,7 +2,7 @@
 
 # List of files in the vdso, has to be asm only for now
 
-ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 06e6b27f3bcc..a04b3bc35ca2 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 # Copied from arch/tile/kernel/vdso/Makefile
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_RISCV_32|R_RISCV_64|R_RISCV_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 # Symbols present in the vdso
 vdso-syms  = rt_sigreturn
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index 245bddfe9bc0..bafd3147eb4e 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -2,9 +2,8 @@
 # List of files in the vdso
 
 KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso32 = vdso_user_wrapper-32.o note-32.o
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 34f9542636e9..a766d286e15f 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -2,9 +2,8 @@
 # List of files in the vdso
 
 KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso64 = vdso_user_wrapper.o note.o
 obj-cvdso64 = vdso64_generic.o getcpu.o
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 1506a22a4fb6..6a1821bd7d5e 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,10 +3,7 @@
 # Building vDSO images for x86.
 #
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
-ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+# Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
 # Sanitizer runtimes are unavailable and cannot be linked here.
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 2c6e99ca48af..d607f51404fc 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
 
@@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk)
  * cpu_timer - Posix CPU timer representation for k_itimer
  * @node:	timerqueue node to queue in the task/sig
  * @head:	timerqueue head on which this timer is queued
- * @task:	Pointer to target task
+ * @pid:	Pointer to target task PID
  * @elist:	List head for the expiry list
  * @firing:	Timer is currently firing
+ * @handling:	Pointer to the task which handles expiry
  */
 struct cpu_timer {
-	struct timerqueue_node	node;
-	struct timerqueue_head	*head;
-	struct pid		*pid;
-	struct list_head	elist;
-	int			firing;
+	struct timerqueue_node	node;
+	struct timerqueue_head	*head;
+	struct pid		*pid;
+	struct list_head	elist;
+	int			firing;
+	struct task_struct __rcu *handling;
 };
 
 static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
@@ -135,10 +138,12 @@ struct posix_cputimers {
 /**
  * posix_cputimers_work - Container for task work based posix CPU timer expiry
  * @work:	The task work to be scheduled
+ * @mutex:	Mutex held around expiry in context of this task work
 * @scheduled:	@work has been scheduled already, no further processing
 */
 struct posix_cputimers_work {
	struct callback_head	work;
+	struct mutex		mutex;
	unsigned int		scheduled;
 };
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df294..8f6330f0e9ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
	/*
	 * Now find a thread we can wake up to take the signal off the queue.
	 *
-	 * If the main thread wants the signal, it gets first crack.
-	 * Probably the least surprising to the average bear.
+	 * Try the suggested task first (may or may not be the main thread).
	 */
	if (wants_signal(sig, p))
		t = p;
@@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 
	ret = -1;
	rcu_read_lock();
+
+	/*
+	 * This function is used by POSIX timers to deliver a timer signal.
+	 * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+	 * set), the signal must be delivered to the specific thread (queues
+	 * into t->pending).
+	 *
+	 * Where type is not PIDTYPE_PID, signals must be delivered to the
+	 * process. In this case, prefer to deliver to current if it is in
+	 * the same thread group as the target process, which avoids
+	 * unnecessarily waking up a potentially idle task.
+	 */
	t = pid_task(pid, type);
-	if (!t || !likely(lock_task_sighand(t, &flags)))
+	if (!t)
+		goto ret;
+	if (type != PIDTYPE_PID && same_thread_group(t, current))
+		t = current;
+	if (!likely(lock_task_sighand(t, &flags)))
		goto ret;
 
	ret = 1; /* the signal is ignored */
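For context on the PIDTYPE_PID case the new comment describes: a timer created with SIGEV_THREAD_ID pins its signal to one thread, which is exactly the delivery mode the same_thread_group() shortcut above must not reroute. Below is a minimal userspace sketch of such a timer; it is illustrative only and not part of the patch. Note that glibc does not declare sigev_notify_thread_id, so the _sigev_un._tid fallback is the commonly used idiom from the timer_create(2) man page (older glibc may also need -lrt).

/*
 * Hypothetical example: a POSIX timer whose SIGALRM is aimed at one
 * specific thread via SIGEV_THREAD_ID, i.e. the PIDTYPE_PID delivery
 * path that send_sigqueue() must leave untouched.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef sigev_notify_thread_id
#define sigev_notify_thread_id _sigev_un._tid	/* glibc workaround, see timer_create(2) */
#endif

static timer_t create_thread_timer(void)
{
	struct sigevent sev = { 0 };
	timer_t id;

	sev.sigev_notify = SIGEV_THREAD_ID;
	sev.sigev_signo = SIGALRM;
	/* Deliver to this thread only, not to the process at large. */
	sev.sigev_notify_thread_id = syscall(SYS_gettid);

	if (timer_create(CLOCK_MONOTONIC, &sev, &id))
		return (timer_t)-1;
	return id;
}

static void on_alarm(int sig) { (void)sig; }

int main(void)
{
	struct itimerspec its = { .it_value.tv_nsec = 1000 * 1000 };
	timer_t id;

	signal(SIGALRM, on_alarm);
	id = create_thread_timer();
	if (id == (timer_t)-1 || timer_settime(id, 0, &its, NULL))
		return 1;
	pause();	/* woken by the SIGALRM aimed at this thread */
	return 0;
}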
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
			return expires;
 
		ctmr->firing = 1;
+		/* See posix_cpu_timer_wait_running() */
+		rcu_assign_pointer(ctmr->handling, current);
		cpu_timer_dequeue(ctmr);
		list_add_tail(&ctmr->elist, firing);
	}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
 static void posix_cpu_timers_work(struct callback_head *work)
 {
+	struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+	mutex_lock(&cw->mutex);
	handle_posix_cpu_timers(current);
+	mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+	/* Has the handling task completed expiry already? */
+	if (!tsk)
+		return;
+
+	/* Ensure that the task cannot go away */
+	get_task_struct(tsk);
+	/* Now drop the RCU protection so the mutex can be locked */
+	rcu_read_unlock();
+	/* Wait on the expiry mutex */
+	mutex_lock(&tsk->posix_cputimers_work.mutex);
+	/* Release it immediately again. */
+	mutex_unlock(&tsk->posix_cputimers_work.mutex);
+	/* Drop the task reference. */
+	put_task_struct(tsk);
+	/* Relock RCU so the callsite is balanced */
+	rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+	/* Ensure that timr->it.cpu.handling task cannot go away */
+	rcu_read_lock();
+	spin_unlock_irq(&timr->it_lock);
+	posix_cpu_timer_wait_running(timr);
+	rcu_read_unlock();
+	/* @timr is on stack and is valid */
+	spin_lock_irq(&timr->it_lock);
 }
 
 /*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
	       sizeof(p->posix_cputimers_work.work));
	init_task_work(&p->posix_cputimers_work.work,
		       posix_cpu_timers_work);
+	mutex_init(&p->posix_cputimers_work.mutex);
	p->posix_cputimers_work.scheduled = false;
 }
 
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
	lockdep_posixtimer_exit();
 }
 
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+	cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+	spin_unlock_irq(&timr->it_lock);
+	cpu_relax();
+	spin_lock_irq(&timr->it_lock);
+}
+
 static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
 {
	return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
		 */
		if (likely(cpu_firing >= 0))
			cpu_timer_fire(timer);
+		/* See posix_cpu_timer_wait_running() */
+		rcu_assign_pointer(timer->it.cpu.handling, NULL);
		spin_unlock(&timer->it_lock);
	}
 }
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
		expires = cpu_timer_getexpires(&timer.it.cpu);
		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
		if (!error) {
-			/*
-			 * Timer is now unarmed, deletion can not fail.
-			 */
+			/* Timer is now unarmed, deletion can not fail. */
			posix_cpu_timer_del(&timer);
+		} else {
+			while (error == TIMER_RETRY) {
+				posix_cpu_timer_wait_running_nsleep(&timer);
+				error = posix_cpu_timer_del(&timer);
+			}
		}
-		spin_unlock_irq(&timer.it_lock);
 
-		while (error == TIMER_RETRY) {
-			/*
-			 * We need to handle case when timer was or is in the
-			 * middle of firing. In other cases we already freed
-			 * resources.
-			 */
-			spin_lock_irq(&timer.it_lock);
-			error = posix_cpu_timer_del(&timer);
-			spin_unlock_irq(&timer.it_lock);
-		}
+		spin_unlock_irq(&timer.it_lock);
 
		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
			/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
	.timer_del		= posix_cpu_timer_del,
	.timer_get		= posix_cpu_timer_get,
	.timer_rearm		= posix_cpu_timer_rearm,
+	.timer_wait_running	= posix_cpu_timer_wait_running,
 };
 
 const struct k_clock clock_process = {
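The posix_cpu_timer_wait_running() scheme above is a plain mutex handoff: the task-work handler holds posix_cputimers_work.mutex across the whole of handle_posix_cpu_timers(), so a waiter can sleep until expiry completes simply by locking and immediately unlocking that mutex. A stripped-down userspace analogy follows (illustrative only; it deliberately elides the RCU-protected handling pointer that tells the kernel whether there is anything to wait for, so in this toy the waiter may return before the handler has even started):

/*
 * Userspace analogy of the expiry-mutex handoff: the waiter sleeps on
 * the mutex the handler holds for the whole expiry, instead of
 * spinning on a "firing" flag.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t expiry_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *handler(void *arg)
{
	pthread_mutex_lock(&expiry_mutex);
	/* ... expire timers: the slow work other threads must wait out ... */
	pthread_mutex_unlock(&expiry_mutex);
	return NULL;
}

static void wait_for_handler(void)
{
	/* Block until the handler is done, then release immediately. */
	pthread_mutex_lock(&expiry_mutex);
	pthread_mutex_unlock(&expiry_mutex);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, handler, NULL);
	wait_for_handler();
	pthread_join(t, NULL);
	puts("handler finished");
	return 0;
}

The payoff over the old spin-on-TIMER_RETRY loop is that a waiter sleeps instead of busy-waiting, which also removes a livelock window on PREEMPT_RT.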
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
	rcu_read_lock();
	unlock_timer(timer, *flags);
 
+	/*
+	 * kc->timer_wait_running() might drop RCU lock. So @timer
+	 * cannot be touched anymore after the function returns!
+	 */
	if (!WARN_ON_ONCE(!kc->timer_wait_running))
		kc->timer_wait_running(timer);
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..65b8658da829 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td,
	 * this cpu:
	 */
	if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+		ktime_t next_p;
+		u32 rem;
+
		tick_do_timer_cpu = cpu;
 
-		tick_next_period = ktime_get();
+		next_p = ktime_get();
+		div_u64_rem(next_p, TICK_NSEC, &rem);
+		if (rem) {
+			next_p -= rem;
+			next_p += TICK_NSEC;
+		}
+
+		tick_next_period = next_p;
 #ifdef CONFIG_NO_HZ_FULL
		/*
		 * The boot CPU may be nohz_full, in which case set
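The tick-common.c hunk rounds the boot CPU's first tick up to the next TICK_NSEC boundary, so tick_next_period starts phase-aligned rather than at an arbitrary ktime_get() instant. The computation is an ordinary round-up to a multiple; here is a standalone sketch of the same arithmetic with an assumed HZ=250 (the real TICK_NSEC comes from the kernel configuration):

/*
 * Standalone sketch of the round-up done in tick_setup_device() above:
 * align a nanosecond timestamp to the next TICK_NSEC boundary.
 */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define TICK_NSEC 4000000ULL	/* assumed: 1s/HZ with HZ=250 */

static uint64_t align_to_next_tick(uint64_t now_ns)
{
	uint64_t rem = now_ns % TICK_NSEC;

	/* Already on a tick boundary? Keep it. Otherwise round up. */
	return rem ? now_ns - rem + TICK_NSEC : now_ns;
}

int main(void)
{
	/* e.g. 10.123456789s since boot -> aligned to 10.124s */
	printf("%" PRIu64 "\n", align_to_next_tick(10123456789ULL));
	return 0;
}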
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a46506f7ec6d..52254679ec48 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -647,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
	touch_softlockup_watchdog_sched();
 }
 
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
	ktime_t delta;
 
-	if (ts->idle_active) {
-		delta = ktime_sub(now, ts->idle_entrytime);
-		if (nr_iowait_cpu(cpu) > 0)
-			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
-		else
-			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-		ts->idle_entrytime = now;
-	}
+	if (WARN_ON_ONCE(!ts->idle_active))
+		return;
 
-	if (last_update_time)
-		*last_update_time = ktime_to_us(now);
+	delta = ktime_sub(now, ts->idle_entrytime);
 
-}
+	write_seqcount_begin(&ts->idle_sleeptime_seq);
+	if (nr_iowait_cpu(smp_processor_id()) > 0)
+		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+	else
+		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
-	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+	ts->idle_entrytime = now;
	ts->idle_active = 0;
+	write_seqcount_end(&ts->idle_sleeptime_seq);
 
	sched_clock_idle_wakeup_event();
 }
 
 static void tick_nohz_start_idle(struct tick_sched *ts)
 {
+	write_seqcount_begin(&ts->idle_sleeptime_seq);
	ts->idle_entrytime = ktime_get();
	ts->idle_active = 1;
+	write_seqcount_end(&ts->idle_sleeptime_seq);
+
	sched_clock_idle_sleep_event();
 }
 
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+				 bool compute_delta, u64 *last_update_time)
+{
+	ktime_t now, idle;
+	unsigned int seq;
+
+	if (!tick_nohz_active)
+		return -1;
+
+	now = ktime_get();
+	if (last_update_time)
+		*last_update_time = ktime_to_us(now);
+
+	do {
+		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+		if (ts->idle_active && compute_delta) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			idle = ktime_add(*sleeptime, delta);
+		} else {
+			idle = *sleeptime;
+		}
+	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+	return ktime_to_us(idle);
+
+}
+
 /**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
@@ -691,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
@@ -701,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, idle;
-
-	if (!tick_nohz_active)
-		return -1;
-
-	now = ktime_get();
-	if (last_update_time) {
-		update_ts_time_stats(cpu, ts, now, last_update_time);
-		idle = ts->idle_sleeptime;
-	} else {
-		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
-			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
-			idle = ktime_add(ts->idle_sleeptime, delta);
-		} else {
-			idle = ts->idle_sleeptime;
-		}
-	}
-
-	return ktime_to_us(idle);
 
+	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+				     !nr_iowait_cpu(cpu), last_update_time);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
@@ -732,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
@@ -742,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us)
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, iowait;
 
-	if (!tick_nohz_active)
-		return -1;
-
-	now = ktime_get();
-	if (last_update_time) {
-		update_ts_time_stats(cpu, ts, now, last_update_time);
-		iowait = ts->iowait_sleeptime;
-	} else {
-		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
-			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
-			iowait = ktime_add(ts->iowait_sleeptime, delta);
-		} else {
-			iowait = ts->iowait_sleeptime;
-		}
-	}
-
-	return ktime_to_us(iowait);
+	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+				     nr_iowait_cpu(cpu), last_update_time);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
@@ -1094,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
	return true;
 }
 
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
 {
-	ktime_t expires;
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
+	ktime_t expires;
 
	/*
	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1129,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
	}
 }
 
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
-	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
 void tick_nohz_idle_retain_tick(void)
 {
	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
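The new get_cpu_sleep_time_us() uses a seqcount so remote readers get a consistent snapshot of idle_entrytime/idle_sleeptime without taking a lock on the idle path. A simplified single-writer seqlock in portable C11 atomics is sketched below; it is illustrative only (the kernel's seqcount_t additionally integrates with lockdep and uses its own barriers), and all names here are hypothetical:

/*
 * Toy single-writer seqlock mirroring the read-retry pattern of
 * get_cpu_sleep_time_us(): the writer makes the sequence odd around an
 * update, readers retry until they see a stable, even sequence.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct idle_stats {
	atomic_uint seq;		/* odd while an update is in flight */
	_Atomic uint64_t sleeptime;	/* accumulated idle time */
	_Atomic uint64_t entrytime;	/* when the current idle period began */
	atomic_bool idle_active;
};

static void idle_exit(struct idle_stats *s, uint64_t now)	/* writer */
{
	unsigned int seq = atomic_load_explicit(&s->seq, memory_order_relaxed);
	uint64_t slept;

	atomic_store_explicit(&s->seq, seq + 1, memory_order_relaxed); /* odd */
	atomic_thread_fence(memory_order_release);

	slept = now - atomic_load_explicit(&s->entrytime, memory_order_relaxed);
	atomic_fetch_add_explicit(&s->sleeptime, slept, memory_order_relaxed);
	atomic_store_explicit(&s->idle_active, false, memory_order_relaxed);

	atomic_store_explicit(&s->seq, seq + 2, memory_order_release); /* even */
}

static uint64_t idle_time(struct idle_stats *s, uint64_t now)	/* reader */
{
	unsigned int seq;
	uint64_t val;

	do {
		seq = atomic_load_explicit(&s->seq, memory_order_acquire);
		val = atomic_load_explicit(&s->sleeptime, memory_order_relaxed);
		if (atomic_load_explicit(&s->idle_active, memory_order_relaxed))
			val += now - atomic_load_explicit(&s->entrytime,
							  memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
	} while ((seq & 1) ||
		 atomic_load_explicit(&s->seq, memory_order_relaxed) != seq);

	return val;
}

int main(void)
{
	struct idle_stats s = { .idle_active = true, .entrytime = 100 };

	printf("%llu\n", (unsigned long long)idle_time(&s, 150)); /* 50: delta added */
	idle_exit(&s, 160);
	printf("%llu\n", (unsigned long long)idle_time(&s, 170)); /* 60: no delta */
	return 0;
}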
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,82 @@ enum tick_nohz_mode {
 /**
 * struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer:	hrtimer to schedule the periodic tick in high
- *			resolution mode
- * @check_clocks:	Notification mechanism about clocksource changes
- * @nohz_mode:		Mode - one state of tick_nohz_mode
+ *
 * @inidle:		Indicator that the CPU is in the tick idle mode
 * @tick_stopped:	Indicator that the idle tick has been stopped
 * @idle_active:	Indicator that the CPU is actively in the tick idle mode;
 *			it is reset during irq handling phases.
- * @do_timer_lst:	CPU was the last one doing do_timer before going idle
+ * @do_timer_last:	CPU was the last one doing do_timer before going idle
 * @got_idle_tick:	Tick timer function has run with @inidle set
+ * @stalled_jiffies:	Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies:	Value of jiffies seen on last tick
+ * @sched_timer:	hrtimer to schedule the periodic tick in high
+ *			resolution mode
 * @last_tick:		Store the last tick expiry time when the tick
 *			timer is modified for nohz sleeps. This is necessary
 *			to resume the tick timer operation in the timeline
 *			when the CPU returns from nohz sleep.
 * @next_tick:		Next tick to be fired when in dynticks mode.
 * @idle_jiffies:	jiffies at the entry to idle for idle time accounting
+ * @idle_waketime:	Time when the idle was interrupted
+ * @idle_entrytime:	Time when the idle call was entered
+ * @nohz_mode:		Mode - one state of tick_nohz_mode
+ * @last_jiffies:	Base jiffies snapshot when next event was last computed
+ * @timer_expires_base:	Base time clock monotonic for @timer_expires
+ * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer:		Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires:	Next tick in idle, for debugging purpose only
 * @idle_calls:		Total number of idle calls
 * @idle_sleeps:	Number of idle calls, where the sched tick was stopped
- * @idle_entrytime:	Time when the idle call was entered
- * @idle_waketime:	Time when the idle was interrupted
 * @idle_exittime:	Time when the idle state was left
 * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
 * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base:	Base time clock monotonic for @timer_expires
- * @next_timer:		Expiry time of next expiring timer for debugging purpose only
 * @tick_dep_mask:	Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies:	Value of jiffies seen on last tick
- * @stalled_jiffies:	Number of stalled jiffies detected across ticks
+ * @check_clocks:	Notification mechanism about clocksource changes
 */
 struct tick_sched {
-	struct hrtimer			sched_timer;
-	unsigned long			check_clocks;
-	enum tick_nohz_mode		nohz_mode;
-
+	/* Common flags */
	unsigned int			inidle		: 1;
	unsigned int			tick_stopped	: 1;
	unsigned int			idle_active	: 1;
	unsigned int			do_timer_last	: 1;
	unsigned int			got_idle_tick	: 1;
 
+	/* Tick handling: jiffies stall check */
+	unsigned int			stalled_jiffies;
+	unsigned long			last_tick_jiffies;
+
+	/* Tick handling */
+	struct hrtimer			sched_timer;
	ktime_t				last_tick;
	ktime_t				next_tick;
	unsigned long			idle_jiffies;
-	unsigned long			idle_calls;
-	unsigned long			idle_sleeps;
-	ktime_t				idle_entrytime;
	ktime_t				idle_waketime;
-	ktime_t				idle_exittime;
-	ktime_t				idle_sleeptime;
-	ktime_t				iowait_sleeptime;
+
+	/* Idle entry */
+	seqcount_t			idle_sleeptime_seq;
+	ktime_t				idle_entrytime;
+
+	/* Tick stop */
+	enum tick_nohz_mode		nohz_mode;
	unsigned long			last_jiffies;
-	u64				timer_expires;
	u64				timer_expires_base;
+	u64				timer_expires;
	u64				next_timer;
	ktime_t				idle_expires;
+	unsigned long			idle_calls;
+	unsigned long			idle_sleeps;
+
+	/* Idle exit */
+	ktime_t				idle_exittime;
+	ktime_t				idle_sleeptime;
+	ktime_t				iowait_sleeptime;
+
+	/* Full dynticks handling */
	atomic_t			tick_dep_mask;
-	unsigned long			last_tick_jiffies;
-	unsigned int			stalled_jiffies;
+
+	/* Clocksource changes */
+	unsigned long			check_clocks;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile
index e814061d6aa0..9f031eafc465 100644
--- a/lib/vdso/Makefile
+++ b/lib/vdso/Makefile
@@ -5,18 +5,13 @@ GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH))
 
 c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c)
 
-# This cmd checks that the vdso library does not contain absolute relocation
+# This cmd checks that the vdso library does not contain dynamic relocations.
 # It has to be called after the linking of the vdso library and requires it
 # as a parameter.
 #
-# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds
-# to the absolute relocation types printed by "objdump -R" and accepted by the
-# dynamic linker.
-ifndef ARCH_REL_TYPE_ABS
-$(error ARCH_REL_TYPE_ABS is not set)
-endif
-
+# As a workaround for some GNU ld ports which produce unneeded R_*_NONE
+# dynamic relocations, ignore R_*_NONE.
 quiet_cmd_vdso_check = VDSOCHK $@
-      cmd_vdso_check = if $(OBJDUMP) -R $@ | grep -E -h "$(ARCH_REL_TYPE_ABS)"; \
+      cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \
		       then (echo >&2 "$@: dynamic relocations are not supported"; \
			     rm -f $@; /bin/false); fi
diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c
index 781f7a50fc3f..f335eec5067e 100644
--- a/tools/testing/selftests/proc/proc-uptime-001.c
+++ b/tools/testing/selftests/proc/proc-uptime-001.c
@@ -13,7 +13,9 @@
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
-// Test that values in /proc/uptime increment monotonically.
+// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment
+// monotonically. We don't test idle time monotonicity due to broken iowait
+// task counting, cf: comment above get_cpu_idle_time_us()
 #undef NDEBUG
 #include <assert.h>
 #include <stdint.h>
@@ -25,20 +27,31 @@
 
 int main(void)
 {
-	uint64_t start, u0, u1, i0, i1;
+	uint64_t start, u0, u1, c0, c1;
	int fd;
 
	fd = open("/proc/uptime", O_RDONLY);
	assert(fd >= 0);
 
-	proc_uptime(fd, &u0, &i0);
+	u0 = proc_uptime(fd);
	start = u0;
+	c0 = clock_boottime();
+
	do {
-		proc_uptime(fd, &u1, &i1);
+		u1 = proc_uptime(fd);
+		c1 = clock_boottime();
+
+		/* Is /proc/uptime monotonic ? */
		assert(u1 >= u0);
-		assert(i1 >= i0);
+
+		/* Is CLOCK_BOOTTIME monotonic ? */
+		assert(c1 >= c0);
+
+		/* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */
+		assert(c0 >= u0);
+
		u0 = u1;
-		i0 = i1;
+		c0 = c1;
	} while (u1 - start < 100);
 
	return 0;
diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c
index 7d0aa22bdc12..ae453daa96c1 100644
--- a/tools/testing/selftests/proc/proc-uptime-002.c
+++ b/tools/testing/selftests/proc/proc-uptime-002.c
@@ -13,8 +13,10 @@
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
-// Test that values in /proc/uptime increment monotonically
-// while shifting across CPUs.
+// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment
+// monotonically while shifting across CPUs. We don't test idle time
+// monotonicity due to broken iowait task counting, cf: comment above
+// get_cpu_idle_time_us()
 #undef NDEBUG
 #include <assert.h>
 #include <errno.h>
@@ -42,10 +44,10 @@ static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned lo
 
 int main(void)
 {
+	uint64_t u0, u1, c0, c1;
	unsigned int len;
	unsigned long *m;
	unsigned int cpu;
-	uint64_t u0, u1, i0, i1;
	int fd;
 
	/* find out "nr_cpu_ids" */
@@ -60,7 +62,9 @@ int main(void)
	fd = open("/proc/uptime", O_RDONLY);
	assert(fd >= 0);
 
-	proc_uptime(fd, &u0, &i0);
+	u0 = proc_uptime(fd);
+	c0 = clock_boottime();
+
	for (cpu = 0; cpu < len * 8; cpu++) {
		memset(m, 0, len);
		m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long)));
@@ -68,11 +72,20 @@ int main(void)
		/* CPU might not exist, ignore error */
		sys_sched_setaffinity(0, len, m);
 
-		proc_uptime(fd, &u1, &i1);
+		u1 = proc_uptime(fd);
+		c1 = clock_boottime();
+
+		/* Is /proc/uptime monotonic ? */
		assert(u1 >= u0);
-		assert(i1 >= i0);
+
+		/* Is CLOCK_BOOTTIME monotonic ? */
+		assert(c1 >= c0);
+
+		/* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */
+		assert(c0 >= u0);
+
		u0 = u1;
-		i0 = i1;
+		c0 = c1;
	}
 
	return 0;
diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h
index dc6a42b1d6b0..730cce4a3d73 100644
--- a/tools/testing/selftests/proc/proc-uptime.h
+++ b/tools/testing/selftests/proc/proc-uptime.h
@@ -19,10 +19,22 @@
 #include <string.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <time.h>
 
 #include "proc.h"
 
-static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle)
+static uint64_t clock_boottime(void)
+{
+	struct timespec ts;
+	int err;
+
+	err = clock_gettime(CLOCK_BOOTTIME, &ts);
+	assert(err >= 0);
+
+	return (ts.tv_sec * 100) + (ts.tv_nsec / 10000000);
+}
+
+static uint64_t proc_uptime(int fd)
 {
	uint64_t val1, val2;
	char buf[64], *p;
@@ -43,18 +55,6 @@ static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle)
	assert(p[3] == ' ');
 
	val2 = (p[1] - '0') * 10 + p[2] - '0';
-	*uptime = val1 * 100 + val2;
-
-	p += 4;
-
-	val1 = xstrtoull(p, &p);
-	assert(p[0] == '.');
-	assert('0' <= p[1] && p[1] <= '9');
-	assert('0' <= p[2] && p[2] <= '9');
-	assert(p[3] == '\n');
-
-	val2 = (p[1] - '0') * 10 + p[2] - '0';
-	*idle = val1 * 100 + val2;
-
-	assert(p + 4 == buf + rv);
+	return val1 * 100 + val2;
 }
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 0ba500056e63..8a17c0e8d82b 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -188,6 +188,80 @@ static int check_timer_create(int which)
	return 0;
 }
 
+int remain;
+__thread int got_signal;
+
+static void *distribution_thread(void *arg)
+{
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+	return NULL;
+}
+
+static void distribution_handler(int nr)
+{
+	if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
+		__atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
+}
+
+/*
+ * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID
+ * timer signals. This primarily tests that the kernel does not favour any one.
+ */
+static int check_timer_distribution(void)
+{
+	int err, i;
+	timer_t id;
+	const int nthreads = 10;
+	pthread_t threads[nthreads];
+	struct itimerspec val = {
+		.it_value.tv_sec = 0,
+		.it_value.tv_nsec = 1000 * 1000,
+		.it_interval.tv_sec = 0,
+		.it_interval.tv_nsec = 1000 * 1000,
+	};
+
+	printf("Check timer_create() per process signal distribution... ");
+	fflush(stdout);
+
+	remain = nthreads + 1;  /* worker threads + this thread */
+	signal(SIGALRM, distribution_handler);
+	err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
+	if (err < 0) {
+		perror("Can't create timer\n");
+		return -1;
+	}
+	err = timer_settime(id, 0, &val, NULL);
+	if (err < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL, distribution_thread, NULL)) {
+			perror("Can't create thread\n");
+			return -1;
+		}
+	}
+
+	/* Wait for all threads to receive the signal. */
+	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_join(threads[i], NULL)) {
+			perror("Can't join thread\n");
+			return -1;
+		}
+	}
+
+	if (timer_delete(id)) {
+		perror("Can't delete timer\n");
+		return -1;
+	}
+
+	printf("[OK]\n");
+	return 0;
+}
+
 int main(int argc, char **argv)
 {
	printf("Testing posix timers. False negative may happen on CPU execution \n");
@@ -217,5 +291,8 @@ int main(int argc, char **argv)
	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
		return ksft_exit_fail();
 
+	if (check_timer_distribution() < 0)
+		return ksft_exit_fail();
+
	return ksft_exit_pass();
 }