author      Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
committer   Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
commit      1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch)
tree        a5dabaa924d50867cbe347e20a7643b2850f11c0 /include
parent      a2f0e7eee1344eb9f91b22bc72d9eb0a52b849c9 (diff)
parent      7c4a5b89a0b5a57a64b601775b296abf77a9fe97 (diff)
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Improve the scalability of the CFS bandwidth unthrottling logic with a
  large number of CPUs.
- Fix & rework various cpuidle routines, simplify interaction with the
generic scheduler code. Add __cpuidle methods as noinstr to objtool's
noinstr detection and fix boatloads of cpuidle bugs & quirks.
- Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
  previously issued registrations (see the usage sketch after this list).
- Limit scheduler slice duration to the sysctl_sched_latency period, to
improve scheduling granularity with a large number of SCHED_IDLE
tasks.
- Debuggability enhancement on sys_exit(): warn about disabled IRQs,
but also enable them to prevent a cascade of followup problems and
repeat warnings.
- Fix the rescheduling logic in prio_changed_dl().
- Micro-optimize cpufreq and sched-util methods.
- Micro-optimize ttwu_runnable()
- Micro-optimize the idle-scanning in update_numa_stats(),
select_idle_capacity() and steal_cookie_task().
- Update the RSEQ code & self-tests
- Constify various scheduler methods
- Remove unused methods
- Refine __init tags
- Documentation updates
- Misc other cleanups, fixes
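As a usage sketch for the new membarrier ABI mentioned above: the following user-space example issues one registration and then reads back the registration bitmask with MEMBARRIER_CMD_GET_REGISTRATIONS. This is a hedged illustration, not part of the merge; it assumes a kernel carrying this series, and it defines the command value locally (matching the uapi change in the diff below) in case the installed <linux/membarrier.h> predates it.

/*
 * Hedged sketch (not part of this merge): query previously issued
 * membarrier registrations from user space.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

#ifndef MEMBARRIER_CMD_GET_REGISTRATIONS
#define MEMBARRIER_CMD_GET_REGISTRATIONS	(1 << 9)	/* value from the uapi diff below */
#endif

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* Issue one registration, then read back the registration bitmask. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
		perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");

	int regs = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);
	if (regs < 0)
		perror("MEMBARRIER_CMD_GET_REGISTRATIONS");	/* older kernels return -EINVAL */
	else
		printf("registration bitmask: 0x%x\n", regs);
	return 0;
}

The syscall's return value is the bitmask of registration commands previously issued by the process, so the REGISTER_PRIVATE_EXPEDITED bit should be set in the printed value.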
* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
sched/rt: pick_next_rt_entity(): check list_entry
sched/deadline: Add more reschedule cases to prio_changed_dl()
sched/fair: sanitize vruntime of entity being placed
sched/fair: Remove capacity inversion detection
sched/fair: unlink misfit task from cpu overutilized
objtool: mem*() are not uaccess safe
cpuidle: Fix poll_idle() noinstr annotation
sched/clock: Make local_clock() noinstr
sched/clock/x86: Mark sched_clock() noinstr
x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
x86/atomics: Always inline arch_atomic64*()
cpuidle: tracing, preempt: Squash _rcuidle tracing
cpuidle: tracing: Warn about !rcu_is_watching()
cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
cpuidle: drivers: firmware: psci: Dont instrument suspend code
KVM: selftests: Fix build of rseq test
exit: Detect and fix irq disabled state in oops
cpuidle, arm64: Fix the ARM64 cpuidle logic
cpuidle: mvebu: Fix duplicate flags assignment
sched/fair: Limit sched slice duration
...
Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 9
-rw-r--r--  include/linux/auxvec.h | 2
-rw-r--r--  include/linux/clockchips.h | 4
-rw-r--r--  include/linux/compiler_types.h | 18
-rw-r--r--  include/linux/context_tracking.h | 27
-rw-r--r--  include/linux/cpu.h | 3
-rw-r--r--  include/linux/cpuidle.h | 50
-rw-r--r--  include/linux/cpumask.h | 4
-rw-r--r--  include/linux/math64.h | 4
-rw-r--r--  include/linux/mm.h | 25
-rw-r--r--  include/linux/mm_types.h | 43
-rw-r--r--  include/linux/percpu-defs.h | 2
-rw-r--r--  include/linux/sched.h | 9
-rw-r--r--  include/linux/sched/clock.h | 8
-rw-r--r--  include/linux/sched/cputime.h | 9
-rw-r--r--  include/linux/sched/idle.h | 40
-rw-r--r--  include/linux/thread_info.h | 18
-rw-r--r--  include/linux/trace_recursion.h | 18
-rw-r--r--  include/linux/tracepoint.h | 15
-rw-r--r--  include/trace/events/rseq.h | 7
-rw-r--r--  include/uapi/linux/auxvec.h | 2
-rw-r--r--  include/uapi/linux/membarrier.h | 4
-rw-r--r--  include/uapi/linux/rseq.h | 22
23 files changed, 290 insertions, 53 deletions
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 659bf3b31c91..d1f57e4868ed 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -558,6 +558,9 @@ ALIGN_FUNCTION(); \ __noinstr_text_start = .; \ *(.noinstr.text) \ + __cpuidle_text_start = .; \ + *(.cpuidle.text) \ + __cpuidle_text_end = .; \ __noinstr_text_end = .; /* @@ -598,12 +601,6 @@ *(.spinlock.text) \ __lock_text_end = .; -#define CPUIDLE_TEXT \ - ALIGN_FUNCTION(); \ - __cpuidle_text_start = .; \ - *(.cpuidle.text) \ - __cpuidle_text_end = .; - #define KPROBES_TEXT \ ALIGN_FUNCTION(); \ __kprobes_text_start = .; \ diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index f68d0ec2d740..407f7005e6d6 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -4,6 +4,6 @@ #include <uapi/linux/auxvec.h> -#define AT_VECTOR_SIZE_BASE 20 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 22 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif /* _LINUX_AUXVEC_H */ diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 8ae9a95ebf5b..9aac31d856f3 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -211,7 +211,7 @@ extern int tick_receive_broadcast(void); extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); # else -static inline int tick_check_broadcast_expired(void) { return 0; } +static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } # endif @@ -219,7 +219,7 @@ static inline void tick_setup_hrtimer_broadcast(void) { } static inline void clockevents_suspend(void) { } static inline void clockevents_resume(void) { } -static inline int tick_check_broadcast_expired(void) { return 0; } +static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 7c1afe0f4129..dea5bf5bd09c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -232,11 +232,25 @@ struct ftrace_likely_data { #endif /* Section for code which can't be instrumented at all */ -#define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) \ +#define __noinstr_section(section) \ + noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ __no_sanitize_memory +#define noinstr __noinstr_section(".noinstr.text") + +/* + * The __cpuidle section is used twofold: + * + * 1) the original use -- identifying if a CPU is 'stuck' in idle state based + * on it's instruction pointer. See cpu_in_idle(). + * + * 2) supressing instrumentation around where cpuidle disables RCU; where the + * function isn't strictly required for #1, this is interchangeable with + * noinstr. 
+ */ +#define __cpuidle __noinstr_section(".cpuidle.text") + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index dcef4a9e4d63..d4afa8508a80 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -130,9 +130,36 @@ static __always_inline unsigned long ct_state_inc(int incby) return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); } +static __always_inline bool warn_rcu_enter(void) +{ + bool ret = false; + + /* + * Horrible hack to shut up recursive RCU isn't watching fail since + * lots of the actual reporting also relies on RCU. + */ + preempt_disable_notrace(); + if (rcu_dynticks_curr_cpu_in_eqs()) { + ret = true; + ct_state_inc(RCU_DYNTICKS_IDX); + } + + return ret; +} + +static __always_inline void warn_rcu_exit(bool rcu) +{ + if (rcu) + ct_state_inc(RCU_DYNTICKS_IDX); + preempt_enable_notrace(); +} + #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } + +static __always_inline bool warn_rcu_enter(void) { return false; } +static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */ #endif diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 314802f98b9d..f83e4519c5f0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -176,9 +176,6 @@ void __noreturn cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); -/* Attach to any functions which should be considered cpuidle. */ -#define __cpuidle __section(".cpuidle.text") - bool cpu_in_idle(unsigned long pc); void arch_cpu_idle(void); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index fce476275e16..3183aeb7f5b4 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/list.h> #include <linux/hrtimer.h> +#include <linux/context_tracking.h> #define CPUIDLE_STATE_MAX 10 #define CPUIDLE_NAME_LEN 16 @@ -115,6 +116,35 @@ struct cpuidle_device { DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); +static __always_inline void ct_cpuidle_enter(void) +{ + lockdep_assert_irqs_disabled(); + /* + * Idle is allowed to (temporary) enable IRQs. It + * will return with IRQs disabled. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * ct_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_idle_enter(); + lockdep_hardirqs_on(_RET_IP_); +} + +static __always_inline void ct_cpuidle_exit(void) +{ + /* + * Carefully undo the above. 
+ */ + lockdep_hardirqs_off(_RET_IP_); + ct_idle_exit(); + instrumentation_begin(); +} + /**************************** * CPUIDLE DRIVER INTERFACE * ****************************/ @@ -277,7 +307,7 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ state, \ - is_retention) \ + is_retention, is_rcu) \ ({ \ int __ret = 0; \ \ @@ -289,7 +319,11 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); if (!is_retention) \ __ret = cpu_pm_enter(); \ if (!__ret) { \ + if (!is_rcu) \ + ct_cpuidle_enter(); \ __ret = low_level_idle_enter(state); \ + if (!is_rcu) \ + ct_cpuidle_exit(); \ if (!is_retention) \ cpu_pm_exit(); \ } \ @@ -298,15 +332,21 @@ extern s64 cpuidle_governor_latency_req(unsigned int cpu); }) #define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 0, 0) #define CPU_PM_CPU_IDLE_ENTER_RETENTION(low_level_idle_enter, idx) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, idx, 1, 0) #define CPU_PM_CPU_IDLE_ENTER_PARAM(low_level_idle_enter, idx, state) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 0) + +#define CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(low_level_idle_enter, idx, state) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 0, 1) #define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(low_level_idle_enter, idx, state) \ - __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1) + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 0) + +#define CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(low_level_idle_enter, idx, state) \ + __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx, state, 1, 1) #endif /* _LINUX_CPUIDLE_H */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c2aa0aa26b45..d45e5de13721 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1017,9 +1017,9 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. 
*/ -static inline unsigned int num_online_cpus(void) +static __always_inline unsigned int num_online_cpus(void) { - return atomic_read(&__num_online_cpus); + return arch_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) diff --git a/include/linux/math64.h b/include/linux/math64.h index 8958f4c005c1..8b9191a2849e 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } @@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) #else #ifndef mul_u64_u32_shr -static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index bd3197748562..716d30d93616 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1982,6 +1982,31 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return raw_smp_processor_id(); +} +#endif + #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9757067c3053..af8119776ab1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -645,7 +645,18 @@ struct mm_struct { * &struct mm_struct is freed. */ atomic_t mm_count; - +#ifdef CONFIG_SCHED_MM_CID + /** + * @cid_lock: Protect cid bitmap updates vs lookups. + * + * Prevent situations where updates to the cid bitmap happen + * concurrently with lookups. Those can lead to situations + * where a lookup cannot find a free bit simply because it was + * unlucky enough to load, non-atomically, bitmap words as they + * were being concurrently updated by the updaters. + */ + raw_spinlock_t cid_lock; +#endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif @@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi, vmi->mas.node = MAS_START; } +#ifdef CONFIG_SCHED_MM_CID +/* Accessor for struct mm_struct's cidmask. 
*/ +static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +{ + unsigned long cid_bitmap = (unsigned long)mm; + + cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); + /* Skip cpu_bitmap */ + cid_bitmap += cpumask_size(); + return (struct cpumask *)cid_bitmap; +} + +static inline void mm_init_cid(struct mm_struct *mm) +{ + raw_spin_lock_init(&mm->cid_lock); + cpumask_clear(mm_cidmask(mm)); +} + +static inline unsigned int mm_cid_size(void) +{ + return cpumask_size(); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_init_cid(struct mm_struct *mm) { } +static inline unsigned int mm_cid_size(void) +{ + return 0; +} +#endif /* CONFIG_SCHED_MM_CID */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index af1071535de8..e60727be79c4 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -310,7 +310,7 @@ extern void __bad_size_call_parameter(void); #ifdef CONFIG_DEBUG_PREEMPT extern void __this_cpu_preempt_check(const char *op); #else -static inline void __this_cpu_preempt_check(const char *op) { } +static __always_inline void __this_cpu_preempt_check(const char *op) { } #endif #define __pcpu_size_call_return(stem, variable) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f6ce9ca7097..63d242164b1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1302,6 +1302,7 @@ struct task_struct { #ifdef CONFIG_RSEQ struct rseq __user *rseq; + u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically @@ -1310,6 +1311,11 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_SCHED_MM_CID + int mm_cid; /* Current cid in mm */ + int mm_cid_active; /* Whether cid bitmap is active */ +#endif + struct tlbflush_unmap_batch tlb_ubc; union { @@ -2352,10 +2358,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; + t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } @@ -2364,6 +2372,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; + t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 867d588314e0..ca008f7d3615 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu) return sched_clock(); } -static inline u64 local_clock(void) +static __always_inline u64 local_clock(void) { return sched_clock(); } @@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu) return sched_clock_cpu(cpu); } -static inline u64 local_clock(void) -{ - return sched_clock_cpu(raw_smp_processor_id()); -} +extern u64 local_clock(void); + #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index ce3c58286062..5f8fd5b24a2e 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -8,15 +8,6 @@ * cputime accounting APIs: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -#include <asm/cputime.h> - -#ifndef cputime_to_nsecs -# define cputime_to_nsecs(__ct) \ - 
(cputime_to_usecs(__ct) * NSEC_PER_USEC) -#endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index d73d314d59c6..478084f9105e 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -23,12 +23,37 @@ static inline void wake_up_if_idle(int cpu) { } */ #ifdef TIF_POLLING_NRFLAG -static inline void __current_set_polling(void) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H + +static __always_inline void __current_set_polling(void) { - set_thread_flag(TIF_POLLING_NRFLAG); + arch_set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); } -static inline bool __must_check current_set_polling_and_test(void) +static __always_inline void __current_clr_polling(void) +{ + arch_clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline void __current_set_polling(void) +{ + set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */ + +static __always_inline bool __must_check current_set_polling_and_test(void) { __current_set_polling(); @@ -41,12 +66,7 @@ static inline bool __must_check current_set_polling_and_test(void) return unlikely(tif_need_resched()); } -static inline void __current_clr_polling(void) -{ - clear_thread_flag(TIF_POLLING_NRFLAG); -} - -static inline bool __must_check current_clr_polling_and_test(void) +static __always_inline bool __must_check current_clr_polling_and_test(void) { __current_clr_polling(); @@ -73,7 +93,7 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif -static inline void current_clr_polling(void) +static __always_inline void current_clr_polling(void) { __current_clr_polling(); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 9f392ec76f2b..c02646884fa8 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -177,7 +177,23 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + +static __always_inline bool tif_need_resched(void) +{ + return arch_test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline bool tif_need_resched(void) +{ + return test_bit(TIF_NEED_RESCHED, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index c303f7a114e9..d48cd92d2364 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +# define trace_warn_on_no_rcu(ip) \ + ({ \ + bool __ret = !rcu_is_watching(); \ + if (__ret && 
!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + __ret; \ + }) +#else +# define trace_warn_on_no_rcu(ip) false +#endif + /* * Preemption is promised to be disabled when return bit >= 0. */ @@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign unsigned int val = READ_ONCE(current->trace_recursion); int bit; + if (trace_warn_on_no_rcu(ip)) + return -1; + bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4b33b95eb8be..552f80b8362f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,6 +178,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif /* CONFIG_HAVE_STATIC_CALL */ /* + * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle + * code that disallow any/all tracing/instrumentation when RCU isn't watching. + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define RCUIDLE_COND(rcuidle) (rcuidle) +#else +/* srcu can't be used from NMI */ +#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) +#endif + +/* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ @@ -188,8 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - /* srcu can't be used from NMI */ \ - WARN_ON_ONCE(rcuidle && in_nmi()); \ + if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + return; \ \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h index a04a64bc1a00..823b47d1ba1e 100644 --- a/include/trace/events/rseq.h +++ b/include/trace/events/rseq.h @@ -16,13 +16,18 @@ TRACE_EVENT(rseq_update, TP_STRUCT__entry( __field(s32, cpu_id) + __field(s32, node_id) + __field(s32, mm_cid) ), TP_fast_assign( __entry->cpu_id = raw_smp_processor_id(); + __entry->node_id = cpu_to_node(__entry->cpu_id); + __entry->mm_cid = task_mm_cid(t); ), - TP_printk("cpu_id=%d", __entry->cpu_id) + TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, + __entry->node_id, __entry->mm_cid) ); TRACE_EVENT(rseq_ip_fixup, diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h index c7e502bf5a6f..6991c4b8ab18 100644 --- a/include/uapi/linux/auxvec.h +++ b/include/uapi/linux/auxvec.h @@ -30,6 +30,8 @@ * differ from AT_PLATFORM. */ #define AT_RANDOM 25 /* address of 16 random bytes */ #define AT_HWCAP2 26 /* extension of AT_HWCAP */ +#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */ +#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */ #define AT_EXECFN 31 /* filename of program */ diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index 737605897f36..5f3ad6d5be6f 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -137,6 +137,9 @@ * @MEMBARRIER_CMD_SHARED: * Alias to MEMBARRIER_CMD_GLOBAL. Provided for * header backward compatibility. + * @MEMBARRIER_CMD_GET_REGISTRATIONS: + * Returns a bitmask of previously issued + * registration commands. * * Command to be passed to the membarrier system call. 
The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to @@ -153,6 +156,7 @@ enum membarrier_cmd { MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6), MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7), MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8), + MEMBARRIER_CMD_GET_REGISTRATIONS = (1 << 9), /* Alias for header backward compatibility. */ MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL, diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 77ee207623a9..c233aae5eac9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -130,6 +130,28 @@ struct rseq { * this thread. */ __u32 flags; + + /* + * Restartable sequences node_id field. Updated by the kernel. Read by + * user-space with single-copy atomicity semantics. This field should + * only be read by the thread which registered this data structure. + * Aligned on 32-bit. Contains the current NUMA node ID. + */ + __u32 node_id; + + /* + * Restartable sequences mm_cid field. Updated by the kernel. Read by + * user-space with single-copy atomicity semantics. This field should + * only be read by the thread which registered this data structure. + * Aligned on 32-bit. Contains the current thread's concurrency ID + * (allocated uniquely within a memory map). + */ + __u32 mm_cid; + + /* + * Flexible array member at end of structure, after last feature field. + */ + char end[]; } __attribute__((aligned(4 * sizeof(__u64)))); #endif /* _UAPI_LINUX_RSEQ_H */ |
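To round off the rseq uapi extension above, here is a hedged user-space sketch (not part of this merge) that reads the new node_id and mm_cid fields. It assumes glibc 2.35 or later, which registers rseq at thread start and exports __rseq_offset/__rseq_size via <sys/rseq.h>, plus a compiler providing __builtin_thread_pointer(); the AT_RSEQ_FEATURE_SIZE value and the field offsets are taken from the layout shown in the diff and are defined locally in case the installed headers predate this series.

/*
 * Hedged sketch: read the new rseq node_id/mm_cid fields from user space.
 * Assumes glibc >= 2.35 (rseq registered at thread start, __rseq_offset
 * and __rseq_size exported) and a kernel carrying this series.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>		/* getauxval() */
#include <sys/rseq.h>		/* __rseq_offset, __rseq_size (glibc >= 2.35) */

#ifndef AT_RSEQ_FEATURE_SIZE
#define AT_RSEQ_FEATURE_SIZE	27	/* value from the auxvec diff above */
#endif

/*
 * Field offsets per the struct rseq layout above (cpu_id_start, cpu_id,
 * rseq_cs, flags, node_id, mm_cid); assumed here because older uapi
 * headers do not declare the new members.
 */
#define RSEQ_NODE_ID_OFFSET	20
#define RSEQ_MM_CID_OFFSET	24

int main(void)
{
	unsigned long feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);

	if (!__rseq_size || feature_size < RSEQ_MM_CID_OFFSET + sizeof(uint32_t)) {
		fprintf(stderr, "rseq node_id/mm_cid not available\n");
		return 1;
	}

	/* glibc places the registered rseq area at a fixed offset from the thread pointer. */
	const char *rseq_area = (const char *)__builtin_thread_pointer() + __rseq_offset;
	uint32_t node_id = *(volatile const uint32_t *)(rseq_area + RSEQ_NODE_ID_OFFSET);
	uint32_t mm_cid  = *(volatile const uint32_t *)(rseq_area + RSEQ_MM_CID_OFFSET);

	printf("node_id=%u mm_cid=%u\n", node_id, mm_cid);
	return 0;
}

On kernels without this series, AT_RSEQ_FEATURE_SIZE is absent (getauxval() returns 0) and the sketch bails out rather than reading fields the kernel never updates.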