diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-09 21:21:34 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-09 21:21:34 -0800 |
commit | 86c6a2fddf0b89b494c7616f2c06cf915c4bff01 (patch) | |
tree | 0e6930c93e5d49ead71b17fcadf0cc9ba28c3d2d /include/linux | |
parent | bee2782f30f66898be3f74ad02e4d1f87a969694 (diff) | |
parent | fd7de1e8d5b2b2b35e71332fafb899f584597150 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle are:
- 'Nested Sleep Debugging', activated when CONFIG_DEBUG_ATOMIC_SLEEP=y.
This instruments might_sleep() checks to catch places that nest
blocking primitives - such as mutex usage in a wait loop. Such
bugs can result in hard to debug races/hangs.
Another category of invalid nesting that this facility will detect
is the calling of blocking functions from within schedule() ->
sched_submit_work() -> blk_schedule_flush_plug().
There's some potential for false positives (if secondary blocking
primitives themselves are not ready yet for this facility), but the
kernel will warn once about such bugs per bootup, so the warning
isn't much of a nuisance.
This feature comes with a number of fixes, for problems uncovered
with it, so no messages are expected normally.
- Another round of sched/numa optimizations and refinements, for
CONFIG_NUMA_BALANCING=y.
- Another round of sched/dl fixes and refinements.
Plus various smaller fixes and cleanups"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
sched: Add missing rcu protection to wake_up_all_idle_cpus
sched/deadline: Introduce start_hrtick_dl() for !CONFIG_SCHED_HRTICK
sched/numa: Init numa balancing fields of init_task
sched/deadline: Remove unnecessary definitions in cpudeadline.h
sched/cpupri: Remove unnecessary definitions in cpupri.h
sched/deadline: Fix rq->dl.pushable_tasks bug in push_dl_task()
sched/fair: Fix stale overloaded status in the busiest group finding logic
sched: Move p->nr_cpus_allowed check to select_task_rq()
sched/completion: Document when to use wait_for_completion_io_*()
sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC
sched/fair: Kill task_struct::numa_entry and numa_group::task_list
sched: Refactor task_struct to use numa_faults instead of numa_* pointers
sched/deadline: Don't check CONFIG_SMP in switched_from_dl()
sched/deadline: Reschedule from switched_from_dl() after a successful pull
sched/deadline: Push task away if the deadline is equal to curr during wakeup
sched/deadline: Add deadline rq status print
sched/deadline: Fix artificial overrun introduced by yield_task_dl()
sched/rt: Clean up check_preempt_equal_prio()
sched/core: Use dl_bw_of() under rcu_read_lock_sched()
sched: Check if we got a shallowest_idle_cpu before searching for least_loaded_cpu
...
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/freezer.h | 50 | ||||
-rw-r--r-- | include/linux/init_task.h | 10 | ||||
-rw-r--r-- | include/linux/kernel.h | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 87 | ||||
-rw-r--r-- | include/linux/wait.h | 80 |
5 files changed, 158 insertions, 74 deletions
diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 7fd81b8c4897..6b7fd9cf5ea2 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -246,15 +246,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, * defined in <linux/wait.h> */ -#define wait_event_freezekillable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ #define wait_event_freezekillable_unsafe(wq, condition) \ ({ \ @@ -265,35 +256,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, __retval; \ }) -#define wait_event_freezable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ -({ \ - long __retval = timeout; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_timeout(wq, (condition), \ - __retval); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_exclusive(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_exclusive(wq, condition); \ - freezer_count(); \ - __retval; \ -}) - - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -331,18 +293,6 @@ static inline void set_freezable(void) {} #define freezable_schedule_hrtimeout_range(expires, delta, mode) \ schedule_hrtimeout_range(expires, delta, mode) -#define wait_event_freezable(wq, condition) \ - wait_event_interruptible(wq, condition) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ - wait_event_interruptible_timeout(wq, condition, timeout) - -#define wait_event_freezable_exclusive(wq, condition) \ - wait_event_interruptible_exclusive(wq, condition) - -#define wait_event_freezekillable(wq, condition) \ - wait_event_killable(wq, condition) - #define wait_event_freezekillable_unsafe(wq, condition) \ wait_event_killable(wq, condition) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d996aef8044f..3037fc085e8e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -166,6 +166,15 @@ extern struct task_group root_task_group; # define INIT_RT_MUTEXES(tsk) #endif +#ifdef CONFIG_NUMA_BALANCING +# define INIT_NUMA_BALANCING(tsk) \ + .numa_preferred_nid = -1, \ + .numa_group = NULL, \ + .numa_faults = NULL, +#else +# define INIT_NUMA_BALANCING(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -237,6 +246,7 @@ extern struct task_group root_task_group; INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ + INIT_NUMA_BALANCING(tsk) \ } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b8..446d76a87ba1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -162,6 +162,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + void ___might_sleep(const char *file, int line, int preempt_offset); void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep @@ -175,10 +176,14 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else + static inline void ___might_sleep(const char *file, int line, + int preempt_offset) { } static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define sched_annotate_sleep() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 706a9f744909..55f5ee7cc3d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -243,6 +243,43 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ (task->flags & PF_FROZEN) == 0) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +#define __set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + (tsk)->state = (state_value); \ + } while (0) +#define set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + set_mb((tsk)->state, (state_value)); \ + } while (0) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) +#define set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + set_mb(current->state, (state_value)); \ + } while (0) + +#else + #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -259,11 +296,13 @@ extern char ___assert_task_state[1 - 2*!!( * * If the caller does not need such serialisation then use __set_current_state() */ -#define __set_current_state(state_value) \ +#define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ +#define set_current_state(state_value) \ set_mb(current->state, (state_value)) +#endif + /* Task command name length */ #define TASK_COMM_LEN 16 @@ -1558,28 +1597,23 @@ struct task_struct { struct numa_group *numa_group; /* - * Exponential decaying average of faults on a per-node basis. - * Scheduling placement decisions are made based on the these counts. - * The values remain static for the duration of a PTE scan + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. */ - unsigned long *numa_faults_memory; + unsigned long *numa_faults; unsigned long total_numa_faults; /* - * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in - * numa_faults_memory decay and these values are copied. - */ - unsigned long *numa_faults_buffer_memory; - - /* - * Track the nodes the process was running on when a NUMA hinting - * fault was incurred. - */ - unsigned long *numa_faults_cpu; - unsigned long *numa_faults_buffer_cpu; - - /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local. The task scan period is adapted * based on the locality of the faults with different weights @@ -1661,6 +1695,9 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ @@ -2052,6 +2089,10 @@ static inline void tsk_restore_flags(struct task_struct *task, task->flags |= orig_flags & flags; } +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -2760,7 +2801,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) extern int _cond_resched(void); #define cond_resched() ({ \ - __might_sleep(__FILE__, __LINE__, 0); \ + ___might_sleep(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2773,14 +2814,14 @@ extern int __cond_resched_lock(spinlock_t *lock); #endif #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) diff --git a/include/linux/wait.h b/include/linux/wait.h index e4a8eb9312ea..2232ed16635a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -13,9 +13,12 @@ typedef struct __wait_queue wait_queue_t; typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +/* __wait_queue::flags */ +#define WQ_FLAG_EXCLUSIVE 0x01 +#define WQ_FLAG_WOKEN 0x02 + struct __wait_queue { unsigned int flags; -#define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; @@ -258,11 +261,37 @@ __out: __ret; \ */ #define wait_event(wq, condition) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event(wq, condition); \ } while (0) +#define __wait_event_freezable(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + schedule(); try_to_freeze()) + +/** + * wait_event - sleep (or freeze) until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute + * to system load) until the @condition evaluates to true. The + * @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define wait_event_freezable(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable(wq, condition); \ + __ret; \ +}) + #define __wait_event_timeout(wq, condition, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ @@ -290,11 +319,30 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) +#define __wait_event_freezable_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret); try_to_freeze()) + +/* + * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid + * increasing load and is freezable. + */ +#define wait_event_freezable_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_freezable_timeout(wq, condition, timeout); \ + __ret; \ +}) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) @@ -315,6 +363,7 @@ do { \ */ #define wait_event_cmd(wq, condition, cmd1, cmd2) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event_cmd(wq, condition, cmd1, cmd2); \ @@ -342,6 +391,7 @@ do { \ #define wait_event_interruptible(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq, condition); \ __ret; \ @@ -375,6 +425,7 @@ do { \ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq, \ condition, timeout); \ @@ -425,6 +476,7 @@ do { \ #define wait_event_hrtimeout(wq, condition, timeout) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ @@ -450,6 +502,7 @@ do { \ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ @@ -463,12 +516,27 @@ do { \ #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition);\ __ret; \ }) +#define __wait_event_freezable_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ + schedule(); try_to_freeze()) + +#define wait_event_freezable_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable_exclusive(wq, condition);\ + __ret; \ +}) + + #define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ ({ \ int __ret = 0; \ @@ -637,6 +705,7 @@ do { \ #define wait_event_killable(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq, condition); \ __ret; \ @@ -830,6 +899,8 @@ void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int sta long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); @@ -886,6 +957,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); static inline int wait_on_bit(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -910,6 +982,7 @@ wait_on_bit(void *word, int bit, unsigned mode) static inline int wait_on_bit_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -936,6 +1009,7 @@ wait_on_bit_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); @@ -963,6 +1037,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode static inline int wait_on_bit_lock(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); @@ -986,6 +1061,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); @@ -1011,6 +1087,7 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); @@ -1029,6 +1106,7 @@ wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned static inline int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) { + might_sleep(); if (atomic_read(val) == 0) return 0; return out_of_line_wait_on_atomic_t(val, action, mode); |