diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/capability.c | 15 | ||||
-rw-r--r-- | kernel/kmod.c | 8 | ||||
-rw-r--r-- | kernel/module.c | 5 | ||||
-rw-r--r-- | kernel/slow-work-debugfs.c | 227 | ||||
-rw-r--r-- | kernel/slow-work.c | 512 | ||||
-rw-r--r-- | kernel/slow-work.h | 72 | ||||
-rw-r--r-- | kernel/smp.c | 56 | ||||
-rw-r--r-- | kernel/workqueue.c | 28 |
9 files changed, 846 insertions, 78 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index b8d4cd8ac0b9..d7c13d249b2d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/capability.c b/kernel/capability.c index 4e17041963f5..7f876e60521f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES int file_caps_enabled = 1; static int __init file_caps_disable(char *str) @@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str) return 1; } __setup("no_file_caps", file_caps_disable); -#endif /* * More recent versions of libcap are available from: @@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; + if ((dataptr == NULL) || (ret != 0)) + return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; + unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; @@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, - tocopy * sizeof(struct __user_cap_data_struct))) + copybytes = tocopy * sizeof(struct __user_cap_data_struct); + if (copybytes > sizeof(kdata)) + return -EFAULT; + + if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { diff --git a/kernel/kmod.c b/kernel/kmod.c index 9fcb53a11f87..25b103190364 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; - ret = security_kernel_module_request(); - if (ret) - return ret; - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; + ret = security_kernel_module_request(module_name); + if (ret) + return ret; + /* If modprobe needs a service that is in a module, we get a recursive * loop. Limit the number of running kmod threads to max_threads/2 or * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method diff --git a/kernel/module.c b/kernel/module.c index 8b7d8805819d..5842a71cf052 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1187,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC) + if (sechdrs[i].sh_flags & SHF_ALLOC + && sechdrs[i].sh_size) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1207,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; + if (!sechdrs[i].sh_size) + continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, GFP_KERNEL); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c new file mode 100644 index 000000000000..e45c43645298 --- /dev/null +++ b/kernel/slow-work-debugfs.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/seq_file.h> +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for debugfs + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 0d31135efbf4..00889bd3c590 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,11 +16,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/wait.h> - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ +#include <linux/debugfs.h> +#include "slow-work.h" static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); @@ -46,7 +43,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process #ifdef CONFIG_SYSCTL static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = 255; +static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; static const int slow_work_min_vslow = 1; static const int slow_work_max_vslow = 99; @@ -98,6 +95,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); static struct slow_work slow_work_new_thread; /* new thread starter */ /* + * slow work ID allocation (use slow_work_queue_lock) + */ +static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + +/* + * Unregistration tracking to prevent put_ref() from disappearing during module + * unload + */ +#ifdef CONFIG_MODULES +static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; +static struct module *slow_work_unreg_module; +static struct slow_work *slow_work_unreg_work_item; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); +static DEFINE_MUTEX(slow_work_unreg_sync_lock); + +static void slow_work_set_thread_processing(int id, struct slow_work *work) +{ + if (work) + slow_work_thread_processing[id] = work->owner; +} +static void slow_work_done_thread_processing(int id, struct slow_work *work) +{ + struct module *module = slow_work_thread_processing[id]; + + slow_work_thread_processing[id] = NULL; + smp_mb(); + if (slow_work_unreg_work_item == work || + slow_work_unreg_module == module) + wake_up_all(&slow_work_unreg_wq); +} +static void slow_work_clear_thread_processing(int id) +{ + slow_work_thread_processing[id] = NULL; +} +#else +static void slow_work_set_thread_processing(int id, struct slow_work *work) {} +static void slow_work_done_thread_processing(int id, struct slow_work *work) {} +static void slow_work_clear_thread_processing(int id) {} +#endif + +/* + * Data for tracking currently executing items for indication through /proc + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; +pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; +DEFINE_RWLOCK(slow_work_execs_lock); +#endif + +/* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues * as the number of threads bears no relation to the number of CPUs. @@ -105,9 +152,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */ * There are two queues of work items: one for slow work items, and one for * very slow work items. */ -static LIST_HEAD(slow_work_queue); -static LIST_HEAD(vslow_work_queue); -static DEFINE_SPINLOCK(slow_work_queue_lock); +LIST_HEAD(slow_work_queue); +LIST_HEAD(vslow_work_queue); +DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The following are two wait queues that get pinged when a work item is placed + * on an empty queue. These allow work items that are hogging a thread by + * sleeping in a way that could be deferred to yield their thread and enqueue + * themselves. + */ +static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); +static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); /* * The thread controls. A variable used to signal to the threads that they @@ -126,6 +182,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited); static int slow_work_user_count; static DEFINE_MUTEX(slow_work_user_lock); +static inline int slow_work_get_ref(struct slow_work *work) +{ + if (work->ops->get_ref) + return work->ops->get_ref(work); + + return 0; +} + +static inline void slow_work_put_ref(struct slow_work *work) +{ + if (work->ops->put_ref) + work->ops->put_ref(work); +} + /* * Calculate the maximum number of active threads in the pool that are * permitted to process very slow work items. @@ -149,7 +219,7 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(void) +static noinline bool slow_work_execute(int id) { struct slow_work *work = NULL; unsigned vsmax; @@ -186,6 +256,13 @@ static bool slow_work_execute(void) } else { very_slow = false; /* avoid the compiler warning */ } + + slow_work_set_thread_processing(id, work); + if (work) { + slow_work_mark_time(work); + slow_work_begin_exec(id, work); + } + spin_unlock_irq(&slow_work_queue_lock); if (!work) @@ -194,12 +271,19 @@ static bool slow_work_execute(void) if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) BUG(); - work->ops->execute(work); + /* don't execute if the work is in the process of being cancelled */ + if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) + work->ops->execute(work); if (very_slow) atomic_dec(&vslow_work_executing_count); clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + /* wake up anyone waiting for this work to be complete */ + wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + + slow_work_end_exec(id, work); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it simultaneously @@ -219,7 +303,10 @@ static bool slow_work_execute(void) spin_unlock_irq(&slow_work_queue_lock); } - work->ops->put_ref(work); + /* sort out the race between module unloading and put_ref() */ + slow_work_put_ref(work); + slow_work_done_thread_processing(id, work); + return true; auto_requeue: @@ -227,15 +314,61 @@ auto_requeue: * - we transfer our ref on the item back to the appropriate queue * - don't wake another thread up as we're awake already */ + slow_work_mark_time(work); if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); else list_add_tail(&work->link, &slow_work_queue); spin_unlock_irq(&slow_work_queue_lock); + slow_work_clear_thread_processing(id); return true; } /** + * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work + * work: The work item under execution that wants to sleep + * _timeout: Scheduler sleep timeout + * + * Allow a requeueable work item to sleep on a slow-work processor thread until + * that thread is needed to do some other work or the sleep is interrupted by + * some other event. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * False is returned if there is nothing on the queue; true is returned if the + * work item should be requeued + */ +bool slow_work_sleep_till_thread_needed(struct slow_work *work, + signed long *_timeout) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + + DEFINE_WAIT(wait); + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + if (!list_empty(queue)) + return true; + + add_wait_queue_exclusive(wfo_wq, &wait); + if (list_empty(queue)) + *_timeout = schedule_timeout(*_timeout); + finish_wait(wfo_wq, &wait); + + return !list_empty(queue); +} +EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); + +/** * slow_work_enqueue - Schedule a slow work item for processing * @work: The work item to queue * @@ -260,16 +393,22 @@ auto_requeue: * allowed to pick items to execute. This ensures that very slow items won't * overly block ones that are just ordinarily slow. * - * Returns 0 if successful, -EAGAIN if not. + * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is + * attempted queued) */ int slow_work_enqueue(struct slow_work *work) { + wait_queue_head_t *wfo_wq; + struct list_head *queue; unsigned long flags; + int ret; + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); BUG_ON(!work->ops); - BUG_ON(!work->ops->get_ref); /* when honouring an enqueue request, we only promise that we will run * the work function in the future; we do not promise to run it once @@ -280,8 +419,19 @@ int slow_work_enqueue(struct slow_work *work) * maintaining our promise */ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) + goto cancelled; + /* we promise that we will not attempt to execute the work * function in more than one thread simultaneously * @@ -299,25 +449,221 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (work->ops->get_ref(work) < 0) - goto cant_get_ref; - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); + ret = slow_work_get_ref(work); + if (ret < 0) + goto failed; + slow_work_mark_time(work); + list_add_tail(&work->link, queue); wake_up(&slow_work_thread_wq); + + /* if someone who could be requeued is sleeping on a + * thread, then ask them to yield their thread */ + if (work->link.prev == queue) + wake_up(wfo_wq); } spin_unlock_irqrestore(&slow_work_queue_lock, flags); } return 0; -cant_get_ref: +cancelled: + ret = -ECANCELED; +failed: spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return -EAGAIN; + return ret; } EXPORT_SYMBOL(slow_work_enqueue); +static int slow_work_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * slow_work_cancel - Cancel a slow work item + * @work: The work item to cancel + * + * This function will cancel a previously enqueued work item. If we cannot + * cancel the work item, it is guarenteed to have run when this function + * returns. + */ +void slow_work_cancel(struct slow_work *work) +{ + bool wait = true, put = false; + + set_bit(SLOW_WORK_CANCELLING, &work->flags); + smp_mb(); + + /* if the work item is a delayed work item with an active timer, we + * need to wait for the timer to finish _before_ getting the spinlock, + * lest we deadlock against the timer routine + * + * the timer routine will leave DELAYED set if it notices the + * CANCELLING flag in time + */ + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + del_timer_sync(&dwork->timer); + } + + spin_lock_irq(&slow_work_queue_lock); + + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + /* the timer routine aborted or never happened, so we are left + * holding the timer's reference on the item and should just + * drop the pending flag and wait for any ongoing execution to + * finish */ + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + + BUG_ON(timer_pending(&dwork->timer)); + BUG_ON(!list_empty(&work->link)); + + clear_bit(SLOW_WORK_DELAYED, &work->flags); + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { + /* the link in the pending queue holds a reference on the item + * that we will need to release */ + list_del_init(&work->link); + wait = false; + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { + /* the executor is holding our only reference on the item, so + * we merely need to wait for it to finish executing */ + clear_bit(SLOW_WORK_PENDING, &work->flags); + } + + spin_unlock_irq(&slow_work_queue_lock); + + /* the EXECUTING flag is set by the executor whilst the spinlock is set + * and before the item is dequeued - so assuming the above doesn't + * actually dequeue it, simply waiting for the EXECUTING flag to be + * released here should be sufficient */ + if (wait) + wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, + TASK_UNINTERRUPTIBLE); + + clear_bit(SLOW_WORK_CANCELLING, &work->flags); + if (put) + slow_work_put_ref(work); +} +EXPORT_SYMBOL(slow_work_cancel); + +/* + * Handle expiry of the delay timer, indicating that a delayed slow work item + * should now be queued if not cancelled + */ +static void delayed_slow_work_timer(unsigned long data) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + struct slow_work *work = (struct slow_work *) data; + unsigned long flags; + bool queued = false, put = false, first = false; + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { + clear_bit(SLOW_WORK_DELAYED, &work->flags); + + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + /* we discard the reference the timer was holding in + * favour of the one the executor holds */ + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + put = true; + } else { + slow_work_mark_time(work); + list_add_tail(&work->link, queue); + queued = true; + if (work->link.prev == queue) + first = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + if (put) + slow_work_put_ref(work); + if (first) + wake_up(wfo_wq); + if (queued) + wake_up(&slow_work_thread_wq); +} + +/** + * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing + * @dwork: The delayed work item to queue + * @delay: When to start executing the work, in jiffies from now + * + * This is similar to slow_work_enqueue(), but it adds a delay before the work + * is actually queued for processing. + * + * The item can have delayed processing requested on it whilst it is being + * executed. The delay will begin immediately, and if it expires before the + * item finishes executing, the item will be placed back on the queue when it + * has done executing. + */ +int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay) +{ + struct slow_work *work = &dwork->work; + unsigned long flags; + int ret; + + if (delay == 0) + return slow_work_enqueue(&dwork->work); + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; + + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + goto cancelled; + + /* the timer holds a reference whilst it is pending */ + ret = work->ops->get_ref(work); + if (ret < 0) + goto cant_get_ref; + + if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) + BUG(); + dwork->timer.expires = jiffies + delay; + dwork->timer.data = (unsigned long) work; + dwork->timer.function = delayed_slow_work_timer; + add_timer(&dwork->timer); + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + + return 0; + +cancelled: + ret = -ECANCELED; +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return ret; +} +EXPORT_SYMBOL(delayed_slow_work_enqueue); + /* * Schedule a cull of the thread pool at some time in the near future */ @@ -368,13 +714,23 @@ static inline bool slow_work_available(int vsmax) */ static int slow_work_thread(void *_data) { - int vsmax; + int vsmax, id; DEFINE_WAIT(wait); set_freezable(); set_user_nice(current, -5); + /* allocate ourselves an ID */ + spin_lock_irq(&slow_work_queue_lock); + id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); + __set_bit(id, slow_work_ids); + slow_work_set_thread_pid(id, current->pid); + spin_unlock_irq(&slow_work_queue_lock); + + sprintf(current->comm, "kslowd%03u", id); + for (;;) { vsmax = vslow_work_proportion; vsmax *= atomic_read(&slow_work_thread_count); @@ -395,7 +751,7 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - if (slow_work_available(vsmax) && slow_work_execute()) { + if (slow_work_available(vsmax) && slow_work_execute(id)) { cond_resched(); if (list_empty(&slow_work_queue) && list_empty(&vslow_work_queue) && @@ -412,6 +768,11 @@ static int slow_work_thread(void *_data) break; } + spin_lock_irq(&slow_work_queue_lock); + slow_work_set_thread_pid(id, 0); + __clear_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + if (atomic_dec_and_test(&slow_work_thread_count)) complete_and_exit(&slow_work_last_thread_exited, 0); return 0; @@ -427,21 +788,6 @@ static void slow_work_cull_timeout(unsigned long data) } /* - * Get a reference on slow work thread starter - */ -static int slow_work_new_thread_get_ref(struct slow_work *work) -{ - return 0; -} - -/* - * Drop a reference on slow work thread starter - */ -static void slow_work_new_thread_put_ref(struct slow_work *work) -{ -} - -/* * Start a new slow work thread */ static void slow_work_new_thread_execute(struct slow_work *work) @@ -475,9 +821,11 @@ static void slow_work_new_thread_execute(struct slow_work *work) } static const struct slow_work_ops slow_work_new_thread_ops = { - .get_ref = slow_work_new_thread_get_ref, - .put_ref = slow_work_new_thread_put_ref, + .owner = THIS_MODULE, .execute = slow_work_new_thread_execute, +#ifdef CONFIG_SLOW_WORK_DEBUG + .desc = slow_work_new_thread_desc, +#endif }; /* @@ -546,12 +894,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, /** * slow_work_register_user - Register a user of the facility + * @module: The module about to make use of the facility * * Register a user of the facility, starting up the initial threads if there * aren't any other users at this point. This will return 0 if successful, or * an error if not. */ -int slow_work_register_user(void) +int slow_work_register_user(struct module *module) { struct task_struct *p; int loop; @@ -598,14 +947,81 @@ error: } EXPORT_SYMBOL(slow_work_register_user); +/* + * wait for all outstanding items from the calling module to complete + * - note that more items may be queued whilst we're waiting + */ +static void slow_work_wait_for_items(struct module *module) +{ +#ifdef CONFIG_MODULES + DECLARE_WAITQUEUE(myself, current); + struct slow_work *work; + int loop; + + mutex_lock(&slow_work_unreg_sync_lock); + add_wait_queue(&slow_work_unreg_wq, &myself); + + for (;;) { + spin_lock_irq(&slow_work_queue_lock); + + /* first of all, we wait for the last queued item in each list + * to be processed */ + list_for_each_entry_reverse(work, &vslow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + list_for_each_entry_reverse(work, &slow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + + /* then we wait for the items being processed to finish */ + slow_work_unreg_module = module; + smp_mb(); + for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { + if (slow_work_thread_processing[loop] == module) + goto do_wait; + } + spin_unlock_irq(&slow_work_queue_lock); + break; /* okay, we're done */ + + do_wait: + spin_unlock_irq(&slow_work_queue_lock); + schedule(); + slow_work_unreg_work_item = NULL; + slow_work_unreg_module = NULL; + } + + remove_wait_queue(&slow_work_unreg_wq, &myself); + mutex_unlock(&slow_work_unreg_sync_lock); +#endif /* CONFIG_MODULES */ +} + /** * slow_work_unregister_user - Unregister a user of the facility + * @module: The module whose items should be cleared * * Unregister a user of the facility, killing all the threads if this was the * last one. + * + * This waits for all the work items belonging to the nominated module to go + * away before proceeding. */ -void slow_work_unregister_user(void) +void slow_work_unregister_user(struct module *module) { + /* first of all, wait for all outstanding items from the calling module + * to complete */ + if (module) + slow_work_wait_for_items(module); + + /* then we can actually go about shutting down the facility if need + * be */ mutex_lock(&slow_work_user_lock); BUG_ON(slow_work_user_count <= 0); @@ -639,6 +1055,16 @@ static int __init init_slow_work(void) if (slow_work_max_max_threads < nr_cpus * 2) slow_work_max_max_threads = nr_cpus * 2; #endif +#ifdef CONFIG_SLOW_WORK_DEBUG + { + struct dentry *dbdir; + + dbdir = debugfs_create_dir("slow_work", NULL); + if (dbdir && !IS_ERR(dbdir)) + debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, + NULL, &slow_work_runqueue_fops); + } +#endif return 0; } diff --git a/kernel/slow-work.h b/kernel/slow-work.h new file mode 100644 index 000000000000..321f3c59d732 --- /dev/null +++ b/kernel/slow-work.h @@ -0,0 +1,72 @@ +/* Slow work private definitions + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ + +/* + * slow-work.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern struct slow_work *slow_work_execs[]; +extern pid_t slow_work_pids[]; +extern rwlock_t slow_work_execs_lock; +#endif + +extern struct list_head slow_work_queue; +extern struct list_head vslow_work_queue; +extern spinlock_t slow_work_queue_lock; + +/* + * slow-work-debugfs.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern const struct file_operations slow_work_runqueue_fops; + +extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); +#endif + +/* + * Helper functions + */ +static inline void slow_work_set_thread_pid(int id, pid_t pid) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_pids[id] = pid; +#endif +} + +static inline void slow_work_mark_time(struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + work->mark = CURRENT_TIME; +#endif +} + +static inline void slow_work_begin_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_execs[id] = work; +#endif +} + +static inline void slow_work_end_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + write_lock(&slow_work_execs_lock); + slow_work_execs[id] = NULL; + write_unlock(&slow_work_execs_lock); +#endif +} diff --git a/kernel/smp.c b/kernel/smp.c index c9d1c7835c2f..a8c76069cf50 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +/* + * smp_call_function_any - Run a function on any of the given cpus + * @mask: The mask of cpus it can run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed. + * + * Returns 0 on success, else a negative status code (if no cpus were online). + * Note that @wait will be implicitly turned on in case of allocation failures, + * since we fall back to on-stack allocation. + * + * Selection preference: + * 1) current cpu if in @mask + * 2) any cpu of current node if in @mask + * 3) any other online cpu in @mask + */ +int smp_call_function_any(const struct cpumask *mask, + void (*func)(void *info), void *info, int wait) +{ + unsigned int cpu; + const struct cpumask *nodemask; + int ret; + + /* Try for same CPU (cheapest) */ + cpu = get_cpu(); + if (cpumask_test_cpu(cpu, mask)) + goto call; + + /* Try for same node. */ + nodemask = cpumask_of_node(cpu); + for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; + cpu = cpumask_next_and(cpu, nodemask, mask)) { + if (cpu_online(cpu)) + goto call; + } + + /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ + cpu = cpumask_any_and(mask, cpu_online_mask); +call: + ret = smp_call_function_single(cpu, func, info, wait); + put_cpu(); + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_function_any); + /** * __smp_call_function_single(): Run a function on another CPU * @cpu: The CPU to run on. @@ -355,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @wait: If true, wait (atomically) until function has completed * on other CPUs. * - * If @wait is true, then returns once @func has returned. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption @@ -443,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many); * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. In case of allocation - * failure, @wait will be implicitly turned on. + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 12328147132c..67e526b6ae81 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -692,31 +692,29 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; + get_online_cpus(); + /* - * when running in keventd don't schedule a work item on itself. - * Can just call directly because the work queue is already bound. - * This also is faster. - * Make this a generic parameter for other workqueues? + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster. */ - if (current_is_keventd()) { + if (current_is_keventd()) orig = raw_smp_processor_id(); - INIT_WORK(per_cpu_ptr(works, orig), func); - func(per_cpu_ptr(works, orig)); - } - get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); - if (cpu == orig) - continue; INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - for_each_online_cpu(cpu) { if (cpu != orig) - flush_work(per_cpu_ptr(works, cpu)); + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; |