Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 29
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/cgroup.c | 587
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 55
-rw-r--r--  kernel/cpuset.c | 107
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/events/core.c | 30
-rw-r--r--  kernel/exit.c | 226
-rw-r--r--  kernel/extable.c | 10
-rw-r--r--  kernel/fork.c | 125
-rw-r--r--  kernel/gcov/Kconfig | 3
-rw-r--r--  kernel/hrtimer.c | 162
-rw-r--r--  kernel/irq/generic-chip.c | 18
-rw-r--r--  kernel/irq/handle.c | 6
-rw-r--r--  kernel/irq/irqdesc.c | 14
-rw-r--r--  kernel/irq/manage.c | 27
-rw-r--r--  kernel/irq/proc.c | 55
-rw-r--r--  kernel/irq/spurious.c | 31
-rw-r--r--  kernel/jump_label.c | 30
-rw-r--r--  kernel/kmod.c | 108
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/lockdep.c | 2
-rw-r--r--  kernel/module.c | 4
-rw-r--r--  kernel/mutex.c | 25
-rw-r--r--  kernel/ns_cgroup.c | 118
-rw-r--r--  kernel/nsproxy.c | 46
-rw-r--r--  kernel/pm_qos_params.c | 72
-rw-r--r--  kernel/posix-cpu-timers.c | 4
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/hibernate.c | 220
-rw-r--r--  kernel/power/snapshot.c | 6
-rw-r--r--  kernel/power/user.c | 4
-rw-r--r--  kernel/printk.c | 87
-rw-r--r--  kernel/profile.c | 22
-rw-r--r--  kernel/ptrace.c | 291
-rw-r--r--  kernel/rcutiny.c | 1
-rw-r--r--  kernel/rcutree.c | 535
-rw-r--r--  kernel/rcutree.h | 42
-rw-r--r--  kernel/rcutree_plugin.h | 496
-rw-r--r--  kernel/rcutree_trace.c | 44
-rw-r--r--  kernel/resource.c | 116
-rw-r--r--  kernel/sched.c | 407
-rw-r--r--  kernel/sched_fair.c | 97
-rw-r--r--  kernel/sched_features.h | 2
-rw-r--r--  kernel/sched_rt.c | 16
-rw-r--r--  kernel/sched_stats.h | 4
-rw-r--r--  kernel/signal.c | 938
-rw-r--r--  kernel/smp.c | 5
-rw-r--r--  kernel/softirq.c | 14
-rw-r--r--  kernel/sys_ni.c | 9
-rw-r--r--  kernel/sysctl.c | 26
-rw-r--r--  kernel/taskstats.c | 15
-rw-r--r--  kernel/time/alarmtimer.c | 154
-rw-r--r--  kernel/time/clockevents.c | 7
-rw-r--r--  kernel/time/clocksource.c | 26
-rw-r--r--  kernel/time/tick-broadcast.c | 16
-rw-r--r--  kernel/time/timekeeping.c | 17
-rw-r--r--  kernel/timer.c | 15
-rw-r--r--  kernel/trace/ftrace.c | 40
-rw-r--r--  kernel/trace/ring_buffer.c | 10
-rw-r--r--  kernel/trace/trace.h | 15
-rw-r--r--  kernel/trace/trace_events.c | 7
-rw-r--r--  kernel/trace/trace_kprobe.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 27
-rw-r--r--  kernel/trace/trace_printk.c | 5
-rw-r--r--  kernel/utsname.c | 39
-rw-r--r--  kernel/watchdog.c | 61
-rw-r--r--  kernel/workqueue.c | 4
72 files changed, 3765 insertions(+), 2076 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+ def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index e9cf19155b46..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..52501b5d4902 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing a leak of the internal
+ * secid to userspace. If the secid cannot be converted, audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
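The audit_log_secctx() helper added above is intended to be called between audit_log_start() and audit_log_end(). A minimal caller sketch follows; the function name and the AUDIT_KERNEL record type are illustrative choices, not part of this patch.

/* Illustrative only: attach the object's security context to an audit record. */
static void example_audit_with_secctx(struct audit_context *ctx, u32 secid)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return;		/* auditing disabled or allocation failed */
	audit_log_format(ab, "op=example");
	audit_log_secctx(ab, secid);	/* appends " obj=<secctx>" */
	audit_log_end(ab);
}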
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
- enum audit_state *state)
+ enum audit_state *state,
+ bool task_creation)
{
- const struct cred *cred = get_task_cred(tsk);
+ const struct cred *cred;
int i, j, need_sid = 1;
u32 sid;
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
}
- if (!result) {
- put_cred(cred);
+ if (!result)
return 0;
- }
}
if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
- put_cred(cred);
return 1;
}
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
if (state == AUDIT_RECORD_CONTEXT)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state)) {
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+ audit_filter_rules(tsk, &e->rule, ctx, n,
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
diff --git a/kernel/capability.c b/kernel/capability.c
index 32a80e08ff4b..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
int file_caps_enabled = 1;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a35510af5..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &newcg->tasks);
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1829,7 +1881,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
@@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ bool cancel_failed_ss = false;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct flex_array *group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ cancel_failed_ss = true;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss) {
+ if (cancel_failed_ss && ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ break;
+ }
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ put_task_struct(tsk);
+ }
+out_free_group_list:
+ flex_array_free(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * RCU protects this access, since tsk was found in the
+ * tid map. a race with de_thread may cause group_leader
+ * to stop being the leader, but cgroup_attach_proc will
+ * detect it later.
+ */
+ tsk = tsk->group_leader;
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
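Together with the cgroup.procs file change further down in this file's diff, cgroup_procs_write() makes it possible to migrate an entire thread group at once from userspace. An illustrative userspace sketch, not part of this patch; the cgroup mount location is an assumption:

/* Illustrative userspace sketch: write a TGID to the cgroup.procs file. */
#include <stdio.h>
#include <sys/types.h>

static int move_threadgroup(const char *cgroup_dir, pid_t tgid)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the kernel retries internally if the thread group leader changes */
	if (fprintf(f, "%d\n", (int)tgid) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

For example, move_threadgroup("/sys/fs/cgroup/cpuset/mygroup", tgid) would move every thread of tgid in one step; the path is only an example mount location.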
@@ -3259,9 +3632,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
-
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
-
- mutex_unlock(&cgroup_mutex);
-
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
- }
-
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
- put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
-
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
- }
-
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
-}
-
-/**
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
* @cgrp: the cgroup in question
* @task: the task in question
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
@@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
compat_old_sigset_t __user *oset)
{
@@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
return ret;
}
+#endif
+
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
@@ -890,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
{
compat_sigset_t s32;
sigset_t s;
- int sig;
struct timespec t;
siginfo_t info;
- long ret, timeout = 0;
+ long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@@ -901,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
- sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&s);
if (uts) {
- if (get_compat_timespec (&t, uts))
+ if (get_compat_timespec(&t, uts))
return -EFAULT;
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
- || t.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = timespec_to_jiffies(&t)
- +(t.tv_sec || t.tv_nsec);
- if (timeout) {
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &s);
-
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
- }else {
- ret = timeout?-EINTR:-EAGAIN;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
}
+
return ret;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2bb8c2e98fff..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, tsk);
-
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- static nodemask_t to; /* protected by cgroup_mutex */
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, &to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, &to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, &to, cs);
- }
- rcu_read_unlock();
- }
-
- /* change mm; only needs to be done once even if threadgroup */
- to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, &to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
}
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
}
/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
*
* Currently we refuse to set up the cgroup - thereby
* refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs = task_cs(tsk);
if (cs)
- cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* Like above we can temporary set any mask and rely on
* set_cpus_allowed_ptr() as synchronization point.
*/
- cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ do_set_cpus_allowed(tsk, cpu_possible_mask);
cpu = cpumask_any(cpu_active_mask);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 8093c16b84b1..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -49,10 +49,10 @@ struct cred init_cred = {
.magic = CRED_MAGIC,
#endif
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c09767f7db3e..9efe7108ccaf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
else
perf_event_output(event, nmi, data, regs);
+ if (event->fasync && event->pending_kill) {
+ if (nmi) {
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
+ } else
+ perf_event_wakeup(event);
+ }
+
return ret;
}
@@ -7394,26 +7402,12 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void perf_cgroup_move(struct task_struct *task)
+static void
+perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
{
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task,
- bool threadgroup)
-{
- perf_cgroup_move(task);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- perf_cgroup_move(c);
- }
- rcu_read_unlock();
- }
-}
-
static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *task)
{
@@ -7425,7 +7419,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
if (!(task->flags & PF_EXITING))
return;
- perf_cgroup_move(task);
+ perf_cgroup_attach_task(cgrp, task);
}
struct cgroup_subsys perf_subsys = {
@@ -7434,6 +7428,6 @@ struct cgroup_subsys perf_subsys = {
.create = perf_cgroup_create,
.destroy = perf_cgroup_destroy,
.exit = perf_cgroup_exit,
- .attach = perf_cgroup_attach,
+ .attach_task = perf_cgroup_attach_task,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8dd874181542..73bb192a3d32 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -169,7 +169,6 @@ void release_task(struct task_struct * p)
struct task_struct *leader;
int zap_leader;
repeat:
- tracehook_prepare_release_task(p);
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -179,7 +178,7 @@ repeat:
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- tracehook_finish_release_task(p);
+ ptrace_release_task(p);
__exit_signal(p);
/*
@@ -190,22 +189,12 @@ repeat:
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
- BUG_ON(task_detached(leader));
- do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
- *
- * do_notify_parent() will have marked it self-reaping in
- * that case.
- */
- zap_leader = task_detached(leader);
-
- /*
- * This maintains the invariant that release_task()
- * only runs on a task in EXIT_DEAD, just for sanity.
*/
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
@@ -277,18 +266,16 @@ int is_current_pgrp_orphaned(void)
return retval;
}
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
{
- int retval = 0;
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (!task_is_stopped(p))
- continue;
- retval = 1;
- break;
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return retval;
+
+ return false;
}
/*
@@ -561,29 +548,28 @@ void exit_files(struct task_struct *tsk)
#ifdef CONFIG_MM_OWNER
/*
- * Task p is exiting and it owned mm, lets find a new owner for it
+ * A task is exiting. If it owned this mm, find a new owner for the mm.
*/
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
- /*
- * If there are other users of the mm and the owner (us) is exiting
- * we need to find a new owner to take on the responsibility.
- */
- if (atomic_read(&mm->mm_users) <= 1)
- return 0;
- if (mm->owner != p)
- return 0;
- return 1;
-}
-
void mm_update_next_owner(struct mm_struct *mm)
{
struct task_struct *c, *g, *p = current;
retry:
- if (!mm_need_new_owner(mm, p))
+ /*
+ * If the exiting or execing task is not the owner, it's
+ * someone else's problem.
+ */
+ if (mm->owner != p)
return;
+ /*
+ * The current owner is exiting/execing and there are no other
+ * candidates. Do not leave the mm pointing to a possibly
+ * freed task structure.
+ */
+ if (atomic_read(&mm->mm_users) <= 1) {
+ mm->owner = NULL;
+ return;
+ }
read_lock(&tasklist_lock);
/*
@@ -752,7 +738,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
{
list_move_tail(&p->sibling, &p->real_parent->children);
- if (task_detached(p))
+ if (p->exit_state == EXIT_DEAD)
return;
/*
* If this is a threaded reparent there is no need to
@@ -765,10 +751,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!task_ptrace(p) &&
+ if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
- do_notify_parent(p, p->exit_signal);
- if (task_detached(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
@@ -795,7 +780,7 @@ static void forget_original_parent(struct task_struct *father)
do {
t->real_parent = reaper;
if (t->parent == father) {
- BUG_ON(task_ptrace(t));
+ BUG_ON(t->ptrace);
t->parent = t->real_parent;
}
if (t->pdeath_signal)
@@ -820,8 +805,7 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int signal;
- void *cookie;
+ bool autoreap;
/*
* This does two things:
@@ -852,26 +836,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* we have changed execution domain as these two values started
* the same after a fork.
*/
- if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
- signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal >= 0)
- signal = do_notify_parent(tsk, signal);
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
- tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- tracehook_report_death(tsk, signal, cookie, group_dead);
-
/* If the process is dead, release it - nobody will wait for it */
- if (signal == DEATH_REAP)
+ if (autoreap)
release_task(tsk);
}
@@ -924,7 +915,7 @@ NORET_TYPE void do_exit(long code)
*/
set_fs(USER_DS);
- tracehook_report_exit(&code);
+ ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -1236,9 +1227,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
- * !task_detached() to filter out sub-threads.
+ * thread_group_leader() to filter out sub-threads.
*/
- if (likely(!traced) && likely(!task_detached(p))) {
+ if (likely(!traced) && thread_group_leader(p)) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
@@ -1346,16 +1337,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
/*
- * If this is not a detached task, notify the parent.
- * If it's still not detached after that, don't release
- * it now.
+ * If this is not a sub-thread, notify the parent.
+ * If parent wants a zombie, don't release it now.
*/
- if (!task_detached(p)) {
- do_notify_parent(p, p->exit_signal);
- if (!task_detached(p)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+ if (thread_group_leader(p) &&
+ !do_notify_parent(p, p->exit_signal)) {
+ p->exit_state = EXIT_ZOMBIE;
+ p = NULL;
}
write_unlock_irq(&tasklist_lock);
}
@@ -1368,7 +1356,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p))
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1377,11 +1366,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
return NULL;
}
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
@@ -1397,6 +1398,9 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
@@ -1538,33 +1542,83 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ /* dead body doesn't have much to contribute */
+ if (p->exit_state == EXIT_DEAD)
+ return 0;
+
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the real
+ * parent when the ptracer detaches.
+ */
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /* it will become visible, clear notask_error */
+ wo->notask_error = 0;
+ return 0;
+ }
+
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p))
+ return wait_task_zombie(wo, p);
+
+ /*
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * If @p is ptraced by a task in its real parent's group,
+ * hide group stop/continued state when looking at @p as
+ * the real parent; otherwise, a single stop can be
+ * reported twice as group and ptrace stops.
+ *
+ * If a ptracer wants to distinguish the two events for its
+ * own children, it should create a separate process which
+ * takes the role of real parent.
+ */
+ if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
+ return 0;
+
/*
- * This child is hidden by ptrace.
- * We aren't allowed to see it now, but eventually we will.
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
*/
wo->notask_error = 0;
- return 0;
}
- if (p->exit_state == EXIT_DEAD)
- return 0;
-
/*
- * We don't reap group leaders with subthreads.
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
*/
- if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
/*
- * It's stopped or running now, so it might
- * later continue, exit, or stop again.
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
*/
- wo->notask_error = 0;
-
- if (task_stopped_code(p, ptrace))
- return wait_task_stopped(wo, ptrace, p);
-
return wait_task_continued(wo, p);
}
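The wait_task_stopped() kernel-doc above describes how a %TASK_STOPPED or %TASK_TRACED child is reported to sys_wait4(). From userspace this corresponds to waitpid() with WUNTRACED; a small illustrative sketch, not part of this patch:

/* Illustrative userspace sketch: observe a child entering the stopped state. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>

static void report_stop(pid_t child)
{
	int status;

	if (waitpid(child, &status, WUNTRACED) < 0)
		return;
	if (WIFSTOPPED(status))
		printf("child %d stopped by signal %d\n",
		       (int)child, WSTOPSIG(status));
}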
diff --git a/kernel/extable.c b/kernel/extable.c
index c2d625fcda77..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,16 @@ int core_kernel_text(unsigned long addr)
return 0;
}
+/**
+ * core_kernel_data - tell if addr points to kernel data
+ * @addr: address to test
+ *
+ * Returns true if @addr passed in is from the core kernel data
+ * section.
+ *
+ * Note: On some archs it may return true for core RODATA, and false
+ * for others. But will always be true for core RW data.
+ */
int core_kernel_data(unsigned long addr)
{
if (addr >= (unsigned long)_sdata &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b44d82b8237..4d4117e01504 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
-#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
@@ -59,7 +58,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -383,15 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
- tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -522,11 +519,12 @@ struct mm_struct * mm_alloc(void)
struct mm_struct * mm;
mm = allocate_mm();
- if (mm) {
- memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
- }
- return mm;
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ mm_init_cpumask(mm);
+ return mm_init(mm, current);
}
/*
@@ -573,6 +571,57 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas--;
+ if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+ fput(mm->exe_file);
+ mm->exe_file = NULL;
+ }
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ if (new_exe_file)
+ get_file(new_exe_file);
+ if (mm->exe_file)
+ fput(mm->exe_file);
+ mm->exe_file = new_exe_file;
+ mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ /* We need mmap_sem to protect against races with removal of
+ * VM_EXECUTABLE vmas */
+ down_read(&mm->mmap_sem);
+ exe_file = mm->exe_file;
+ if (exe_file)
+ get_file(exe_file);
+ up_read(&mm->mmap_sem);
+ return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ /* It's safe to write the exe_file pointer without exe_file_lock because
+ * this is called during fork when the task is not yet in /proc */
+ newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
/**
* get_task_mm - acquire a reference to the task's mm
*
@@ -679,6 +728,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
+ mm_init_cpumask(mm);
/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -927,6 +977,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1108,6 +1162,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1193,12 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
- if (current->nsproxy != p->nsproxy) {
- retval = ns_cgroup_clone(p, pid);
- if (retval)
- goto bad_fork_free_pid;
- }
-
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
@@ -1289,7 +1339,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (likely(p->pid)) {
- tracehook_finish_clone(p, clone_flags, trace);
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
if (is_child_reaper(pid))
@@ -1312,6 +1362,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1350,6 +1402,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
@@ -1426,10 +1480,22 @@ long do_fork(unsigned long clone_flags,
}
/*
- * When called from kernel_thread, don't do user tracing stuff.
+ * Determine whether and which event to report to ptracer. When
+ * called from kernel_thread or CLONE_UNTRACED is explicitly
+ * requested, no event is reported; otherwise, report if the event
+ * for the type of forking is enabled.
*/
- if (likely(user_mode(regs)))
- trace = tracehook_prepare_clone(clone_flags);
+ if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (clone_flags & CLONE_VFORK)
+ trace = PTRACE_EVENT_VFORK;
+ else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ trace = PTRACE_EVENT_CLONE;
+ else
+ trace = PTRACE_EVENT_FORK;
+
+ if (likely(!ptrace_event_enabled(current, trace)))
+ trace = 0;
+ }
p = copy_process(clone_flags, stack_start, regs, stack_size,
child_tidptr, NULL, trace);
@@ -1453,26 +1519,26 @@ long do_fork(unsigned long clone_flags,
}
audit_finish_fork(p);
- tracehook_report_clone(regs, clone_flags, nr, p);
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
- * hasn't gotten to tracehook_report_clone() yet. Now we
- * clear it and set the child going.
+ * hasn't finished SIGSTOP raising yet. Now we clear it
+ * and set the child going.
*/
p->flags &= ~PF_STARTING;
wake_up_new_task(p);
- tracehook_report_clone_complete(trace, regs,
- clone_flags, nr, p);
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event(trace, nr);
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
- tracehook_report_vfork_done(p, nr);
+ ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
}
} else {
nr = PTR_ERR(p);
@@ -1507,6 +1573,13 @@ void __init proc_caches_init(void)
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ /*
+ * FIXME! The "sizeof(struct mm_struct)" currently includes the
+ * whole struct cpumask for the OFFSTACK case. We could change
+ * this to *only* allocate as much of it as required by the
+ * maximum number of CPUs we can ever have. The cpumask_allocation
+ * is at the end of the structure, exactly for that reason.
+ */
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
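The reworked do_fork() above picks PTRACE_EVENT_FORK, PTRACE_EVENT_VFORK or PTRACE_EVENT_CLONE and reports it via ptrace_event() only when the tracer has enabled the matching option. For reference, a minimal userspace tracer that consumes such an event through the standard ptrace(2) interface; the program layout is only an illustration, and on older C libraries the PTRACE_O_*/PTRACE_EVENT_* constants may have to come from <linux/ptrace.h> instead.

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		pid_t p;

		/* Tracee: stop so the tracer can set options, then fork once. */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		p = fork();
		if (p == 0)
			_exit(0);
		waitpid(p, NULL, 0);
		_exit(0);
	}

	waitpid(child, &status, 0);		/* initial SIGSTOP */
	ptrace(PTRACE_SETOPTIONS, child, NULL,
	       (void *)(long)(PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
			      PTRACE_O_TRACECLONE));
	ptrace(PTRACE_CONT, child, NULL, NULL);

	waitpid(child, &status, 0);		/* expected: fork event stop */
	if (WIFSTOPPED(status) &&
	    status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8))) {
		unsigned long msg;

		ptrace(PTRACE_GETEVENTMSG, child, NULL, &msg);
		printf("PTRACE_EVENT_FORK: new child %lu\n", msg);
		/* The new task was auto-attached and stopped; release it. */
		waitpid((pid_t)msg, &status, 0);
		ptrace(PTRACE_DETACH, (pid_t)msg, NULL, NULL);
	}
	ptrace(PTRACE_CONT, child, NULL, NULL);
	waitpid(child, NULL, 0);
	return 0;
}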
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index b8cadf70b1fb..5bf924d80b5c 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
- depends on DEBUG_FS && CONSTRUCTORS
+ depends on DEBUG_FS
+ select CONSTRUCTORS
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index dbbbf7d43080..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -64,17 +64,20 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clock_base =
{
{
- .index = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
+ .index = HRTIMER_BASE_MONOTONIC,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
.resolution = KTIME_LOW_RES,
},
{
- .index = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
+ .index = HRTIMER_BASE_REALTIME,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
.resolution = KTIME_LOW_RES,
},
{
- .index = CLOCK_BOOTTIME,
+ .index = HRTIMER_BASE_BOOTTIME,
+ .clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
int cpu = hrtimer_get_target(this_cpu, pinned);
- int basenum = hrtimer_clockid_to_base(base->index);
+ int basenum = base->index;
again:
new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return res;
}
-
-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
- struct hrtimer_cpu_base *base;
- struct timespec realtime_offset, wtm, sleep;
-
- if (!hrtimer_hres_active())
- return;
-
- get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
- &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- base = &__get_cpu_var(hrtimer_bases);
-
- /* Adjust CLOCK_REALTIME offset */
- raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
-}
-
-/*
- * Clock realtime was set
- *
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
- *
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
- */
-void clock_was_set(void)
-{
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
-}
-
-/*
- * During resume we might have to reprogram the high resolution timer
- * interrupt (on the local CPU):
- */
-void hres_timers_resume(void)
-{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hres_timers_resume() called with IRQs enabled!");
-
- retrigger_next_event(NULL);
-}
-
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
}
/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct timespec realtime_offset, xtim, wtm, sleep;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ /* Optimized out for !HIGH_RES */
+ get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
+ set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
+
+ /* Adjust CLOCK_REALTIME offset */
+ raw_spin_lock(&base->lock);
+ base->clock_base[HRTIMER_BASE_REALTIME].offset =
+ timespec_to_ktime(realtime_offset);
+ base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+ timespec_to_ktime(sleep);
+
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+/*
* Switch to high resolution mode
*/
static int hrtimer_switch_to_hres(void)
{
- int cpu = smp_processor_id();
+ int i, cpu = smp_processor_id();
struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)
return 0;
}
base->hres_active = 1;
- base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
- base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
- base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+ timerfd_clock_was_set();
+}
+
+/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hrtimers_resume(void)
+{
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+
+ retrigger_next_event(NULL);
+ timerfd_clock_was_set();
+}
+
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
debug_activate(timer);
timerqueue_add(&base->active, &timer->node);
+ base->cpu_base->active_bases |= 1 << base->index;
/*
* HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
#endif
}
timerqueue_del(&base->active, &timer->node);
+ if (!timerqueue_getnext(&base->active))
+ base->cpu_base->active_bases &= ~(1 << base->index);
out:
timer->state = newstate;
}
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- struct hrtimer_clock_base *base;
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;
@@ -1256,12 +1262,15 @@ retry:
*/
cpu_base->expires_next.tv64 = KTIME_MAX;
- base = cpu_base->clock_base;
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- ktime_t basenow;
+ struct hrtimer_clock_base *base;
struct timerqueue_node *node;
+ ktime_t basenow;
+
+ if (!(cpu_base->active_bases & (1 << i)))
+ continue;
+ base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,7 +1303,6 @@ retry:
__run_hrtimer(timer, &basenow);
}
- base++;
}
/*
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct timespec __user *rmtp;
int ret = 0;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
+ hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
restart = &current_thread_info()->restart_block;
restart->fn = hrtimer_nanosleep_restart;
- restart->nanosleep.index = t.timer.base->index;
+ restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
}
/**
- * irq_gc_ack - Ack pending interrupt
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
* @d: irq_data
*/
-void irq_gc_ack(struct irq_data *d)
+void irq_gc_ack_set_bit(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
u32 mask = 1 << (d->irq - gc->irq_base);
@@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d)
}
/**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = ~(1 << (d->irq - gc->irq_base));
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_gc_unlock(gc);
+}
+
+/**
* irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
* @d: irq_data
*/
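irq_gc_ack_set_bit() is the renamed irq_gc_ack(), and irq_gc_ack_clr_bit() covers controllers whose ack register expects the bit to be written as 0. A sketch of how a driver could wire these into a generic chip; the "foo-intc" controller, its register offsets and IRQ base are made-up placeholders, not part of this patch.

#include <linux/irq.h>

#define FOO_IRQ_BASE	64		/* all of these are placeholders */
#define FOO_MASK	0x08
#define FOO_ACK		0x0c

static void __init foo_intc_init(void __iomem *base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("foo-intc", 1, FOO_IRQ_BASE, base,
				    handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	ct->regs.mask = FOO_MASK;
	ct->regs.ack = FOO_ACK;
	ct->chip.irq_mask = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
	ct->chip.irq_ack = irq_gc_ack_set_bit;	/* hardware acks on writing 1 */

	irq_setup_generic_chip(gc, IRQ_MSK(32), 0, IRQ_NOREQUEST, 0);
}

A controller that acknowledges on a 0 write would simply use irq_gc_ack_clr_bit for the irq_ack callback instead.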
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
switch (res) {
case IRQ_WAKE_THREAD:
/*
- * Set result to handled so the spurious check
- * does not trigger.
- */
- res = IRQ_HANDLED;
-
- /*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 886e80347b32..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
- desc[i].irq_data.irq = i;
- desc[i].irq_data.chip = &no_irq_chip;
desc[i].kstat_irqs = alloc_percpu(unsigned int);
- irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
- alloc_masks(desc + i, GFP_KERNEL, node);
- desc_smp_init(desc + i, node);
+ alloc_masks(&desc[i], GFP_KERNEL, node);
+ raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ desc_set_defaults(i, &desc[i], node);
}
return arch_early_irq_init();
}
@@ -346,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
if (!cnt)
return -EINVAL;
+ if (irq >= 0) {
+ if (from > irq)
+ return -EINVAL;
+ from = irq;
+ }
+
mutex_lock(&sparse_irq_lock);
start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f7ce0021e1c4..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
int ret = 0;
+ if (!desc)
+ return -EINVAL;
+
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
* context. So we need to disable bh here to avoid deadlocks and other
* side effects.
*/
-static void
+static irqreturn_t
irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
+ irqreturn_t ret;
+
local_bh_disable();
- action->thread_fn(action->irq, action->dev_id);
+ ret = action->thread_fn(action->irq, action->dev_id);
irq_finalize_oneshot(desc, action, false);
local_bh_enable();
+ return ret;
}
/*
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
* preemtible - many of them need to sleep and wait for slow busses to
* complete.
*/
-static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+ struct irqaction *action)
{
- action->thread_fn(action->irq, action->dev_id);
+ irqreturn_t ret;
+
+ ret = action->thread_fn(action->irq, action->dev_id);
irq_finalize_oneshot(desc, action, false);
+ return ret;
}
/*
@@ -753,7 +763,8 @@ static int irq_thread(void *data)
};
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
- void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+ irqreturn_t (*handler_fn)(struct irq_desc *desc,
+ struct irqaction *action);
int wake;
if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +794,12 @@ static int irq_thread(void *data)
desc->istate |= IRQS_PENDING;
raw_spin_unlock_irq(&desc->lock);
} else {
+ irqreturn_t action_ret;
+
raw_spin_unlock_irq(&desc->lock);
- handler_fn(desc, action);
+ action_ret = handler_fn(desc, action);
+ if (!noirqdebug)
+ note_interrupt(action->irq, desc, action_ret);
}
wake = atomic_dec_and_test(&desc->threads_active);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500f..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int irq_affinity_proc_show(struct seq_file *m, void *v)
+static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
- seq_cpumask(m, mask);
+ if (type)
+ seq_cpumask_list(m, mask);
+ else
+ seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
#endif
int no_irq_affinity;
-static ssize_t irq_affinity_proc_write(struct file *file,
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(0, m, v);
+}
+
+static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(1, m, v);
+}
+
+
+static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
- err = cpumask_parse_user(buffer, count, new_value);
+ if (type)
+ err = cpumask_parselist_user(buffer, count, new_value);
+ else
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
@@ -100,11 +117,28 @@ free_cpumask:
return err;
}
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(0, file, buffer, count, pos);
+}
+
+static ssize_t irq_affinity_list_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(1, file, buffer, count, pos);
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+}
+
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
.release = single_release,
};
+static const struct file_operations irq_affinity_list_proc_fops = {
+ .open = irq_affinity_list_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_list_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("affinity_hint", 0400, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
+ /* create /proc/irq/<irq>/smp_affinity_list */
+ proc_create_data("smp_affinity_list", 0600, desc->dir,
+ &irq_affinity_list_proc_fops, (void *)(long)irq);
+
proc_create_data("node", 0444, desc->dir,
&irq_node_proc_fops, (void *)(long)irq);
#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
remove_proc_entry("smp_affinity", desc->dir);
remove_proc_entry("affinity_hint", desc->dir);
+ remove_proc_entry("smp_affinity_list", desc->dir);
remove_proc_entry("node", desc->dir);
#endif
remove_proc_entry("spurious", desc->dir);
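The new /proc/irq/<irq>/smp_affinity_list file accepts and reports CPU list syntax (e.g. "0-3,8") instead of the hex mask used by smp_affinity. A small userspace sketch; the IRQ number 30 and the CPU list are arbitrary examples, and the write requires root.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/irq/30/smp_affinity_list";
	const char *cpus = "0-3,8";		/* CPU list, not a hex mask */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, cpus, strlen(cpus)) < 0)
		perror("write");
	close(fd);
	return 0;
}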
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+ if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+ return 0;
+ return 1;
+}
+
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
struct irqaction *action;
unsigned long flags;
- if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+ if (bad_action_ret(action_ret)) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
irq, action_ret);
} else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
- printk(KERN_ERR "[<%p>]", action->handler);
- print_symbol(" (%s)",
- (unsigned long)action->handler);
- printk("\n");
+ printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+ if (action->thread_fn)
+ printk(KERN_CONT " threaded [<%p>] %pf",
+ action->thread_fn, action->thread_fn);
+ printk(KERN_CONT "\n");
action = action->next;
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
if (desc->istate & IRQS_POLL_INPROGRESS)
return;
- if (unlikely(action_ret != IRQ_HANDLED)) {
+ /* we get here again via the threaded handler */
+ if (action_ret == IRQ_WAKE_THREAD)
+ return;
+
+ if (bad_action_ret(action_ret)) {
+ report_bad_irq(irq, desc, action_ret);
+ return;
+ }
+
+ if (unlikely(action_ret == IRQ_NONE)) {
/*
* If we are seeing only the odd spurious IRQ caused by
* bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
else
desc->irqs_unhandled++;
desc->last_unhandled = jiffies;
- if (unlikely(action_ret != IRQ_NONE))
- report_bad_irq(irq, desc, action_ret);
}
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 74d1c099fbd1..a8ce45097f3d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
}
static void __jump_label_update(struct jump_label_key *key,
- struct jump_entry *entry, int enable)
+ struct jump_entry *entry,
+ struct jump_entry *stop, int enable)
{
- for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
+ for (; (entry < stop) &&
+ (entry->key == (jump_label_t)(unsigned long)key);
+ entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
@@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable)
struct jump_label_mod *mod = key->next;
while (mod) {
- __jump_label_update(key, mod->entries, enable);
+ struct module *m = mod->mod;
+
+ __jump_label_update(key, mod->entries,
+ m->jump_entries + m->num_jump_entries,
+ enable);
mod = mod->next;
}
}
@@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod)
key->next = jlm;
if (jump_label_enabled(key))
- __jump_label_update(key, iter, JUMP_LABEL_ENABLE);
+ __jump_label_update(key, iter, iter_stop,
+ JUMP_LABEL_ENABLE);
}
return 0;
@@ -367,15 +375,19 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct jump_label_key *key, int enable)
{
- struct jump_entry *entry = key->entries;
-
- /* if there are no users, entry can be NULL */
- if (entry)
- __jump_label_update(key, entry, enable);
+ struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
#ifdef CONFIG_MODULES
+ struct module *mod = __module_address((jump_label_t)key);
+
__jump_label_mod_update(key, enable);
+
+ if (mod)
+ stop = mod->jump_entries + mod->num_jump_entries;
#endif
+ /* if there are no users, entry can be NULL */
+ if (entry)
+ __jump_label_update(key, entry, stop, enable);
}
#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5ae0ff38425f..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+
#ifdef CONFIG_MODULES
/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ struct cred *new;
int retval;
spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
*/
set_user_nice(current, 0);
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto fail;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
if (sub_info->init) {
- retval = sub_info->init(sub_info);
- if (retval)
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
goto fail;
+ }
}
+ commit_creds(new);
+
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
@@ -366,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
* context in which call_usermodehelper_exec is called.
*/
void call_usermodehelper_setfns(struct subprocess_info *info,
- int (*init)(struct subprocess_info *info),
+ int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
void *data)
{
@@ -420,6 +444,84 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write an array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
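usermodehelper_table exposes the capability bounding and inheritable sets applied to kernel-spawned helpers as two sysctl files, each holding _KERNEL_CAPABILITY_U32S unsigned longs with the least significant 32 capability bits first. A read-only userspace sketch, assuming the table ends up registered under /proc/sys/kernel/usermodehelper/ by the matching kernel/sysctl.c change in this series:

#include <stdio.h>

int main(void)
{
	unsigned long lo = 0, hi = 0;
	FILE *f = fopen("/proc/sys/kernel/usermodehelper/bset", "r");

	if (!f) {
		perror("usermodehelper/bset");
		return 1;
	}
	/* proc_doulongvec_minmax prints the ulongs tab-separated,
	 * least significant 32 capability bits first. */
	if (fscanf(f, "%lu %lu", &lo, &hi) == 2)
		printf("helper bounding set: 0x%08lx%08lx\n", hi, lo);
	fclose(f);
	return 0;
}

Writing a reduced set back (with CAP_SETPCAP and CAP_SYS_MODULE) can only drop bits, since proc_cap_handler intersects the new value with the current one.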
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
return;
}
- p->cpus_allowed = cpumask_of_cpu(cpu);
- p->rt.nr_cpus_allowed = 1;
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 63437d065ac8..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
int ret = 0;
if (unlikely(current->lockdep_recursion))
- return ret;
+ return 1; /* avoid false negative lockdep_assert_held() */
raw_local_irq_save(flags);
check_flags(flags);
diff --git a/kernel/module.c b/kernel/module.c
index 22879725678d..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2812,7 +2812,7 @@ static struct module *load_module(void __user *umod,
}
/* This has to be done once we're sure module name is unique. */
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_setup(info.debug, info.num_debug);
/* Find duplicate symbols */
@@ -2849,7 +2849,7 @@ static struct module *load_module(void __user *umod,
module_bug_cleanup(mod);
ddebug:
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_remove(info.debug);
unlock:
mutex_unlock(&module_mutex);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 2c938e2337cd..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
*/
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
preempt_disable();
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
@@ -269,16 +269,25 @@ void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
@@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, _RET_IP_);
+ subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
#endif
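_mutex_lock_nest_lock() lets a caller that takes many mutexes of the same lock class tell lockdep that they are all serialized by an outer lock, instead of annotating each acquisition with a distinct subclass. A kernel-style sketch of the intended use; the foo/foo_child structures are hypothetical, and the mutex_lock_nest_lock() wrapper is assumed to be the corresponding include/linux/mutex.h addition from the same series.

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct foo_child {
	struct list_head node;
	struct mutex lock;
	u64 generation;
};

struct foo {
	struct mutex lock;
	struct list_head children;
	u64 generation;
};

static void foo_sync_children(struct foo *parent)
{
	struct foo_child *child;

	mutex_lock(&parent->lock);
	list_for_each_entry(child, &parent->children, node) {
		/* All child locks share one class; the annotation tells
		 * lockdep they are serialized by parent->lock. */
		mutex_lock_nest_lock(&child->lock, &parent->lock);
		child->generation = parent->generation;
		mutex_unlock(&child->lock);
	}
	mutex_unlock(&parent->lock);
}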
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * ns_cgroup.c - namespace cgroup subsystem
- *
- * Copyright 2006, 2007 IBM Corp
- */
-
-#include <linux/module.h>
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/nsproxy.h>
-
-struct ns_cgroup {
- struct cgroup_subsys_state css;
-};
-
-struct cgroup_subsys ns_subsys;
-
-static inline struct ns_cgroup *cgroup_to_ns(
- struct cgroup *cgroup)
-{
- return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
- struct ns_cgroup, css);
-}
-
-int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
-{
- char name[PROC_NUMBUF];
-
- snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
- return cgroup_clone(task, &ns_subsys, name);
-}
-
-/*
- * Rules:
- * 1. you can only enter a cgroup which is a descendant of your current
- * cgroup
- * 2. you can only place another process into a cgroup if
- * a. you have CAP_SYS_ADMIN
- * b. your cgroup is an ancestor of task's destination cgroup
- * (hence either you are in the same cgroup as task, or in an
- * ancestor cgroup thereof)
- */
-static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
-{
- if (current != task) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!cgroup_is_descendant(new_cgroup, current))
- return -EPERM;
- }
-
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
-
- return 0;
-}
-
-/*
- * Rules: you can only create a cgroup if
- * 1. you are capable(CAP_SYS_ADMIN)
- * 2. the target cgroup is a descendant of your own cgroup
- */
-static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- if (!capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- if (!cgroup_is_descendant(cgroup, current))
- return ERR_PTR(-EPERM);
- if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
- printk("ns_cgroup can't be created with parent "
- "'clone_children' set.\n");
- return ERR_PTR(-EINVAL);
- }
-
- printk_once("ns_cgroup deprecated: consider using the "
- "'clone_children' flag without the ns_cgroup.\n");
-
- ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
- if (!ns_cgroup)
- return ERR_PTR(-ENOMEM);
- return &ns_cgroup->css;
-}
-
-static void ns_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- ns_cgroup = cgroup_to_ns(cgroup);
- kfree(ns_cgroup);
-}
-
-struct cgroup_subsys ns_subsys = {
- .name = "ns",
- .can_attach = ns_can_attach,
- .create = ns_create,
- .destroy = ns_destroy,
- .subsys_id = ns_subsys_id,
-};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
static struct kmem_cache *nsproxy_cachep;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
goto out;
}
- err = ns_cgroup_clone(current, task_pid(current));
- if (err)
- put_nsproxy(*new_nsp);
-
out:
return err;
}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+ const struct proc_ns_operations *ops;
+ struct task_struct *tsk = current;
+ struct nsproxy *new_nsproxy;
+ struct proc_inode *ei;
+ struct file *file;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ err = -EINVAL;
+ ei = PROC_I(file->f_dentry->d_inode);
+ ops = ei->ns_ops;
+ if (nstype && (ops->type != nstype))
+ goto out;
+
+ new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ if (IS_ERR(new_nsproxy)) {
+ err = PTR_ERR(new_nsproxy);
+ goto out;
+ }
+
+ err = ops->install(new_nsproxy, ei->ns);
+ if (err) {
+ free_nsproxy(new_nsproxy);
+ goto out;
+ }
+ switch_task_namespaces(tsk, new_nsproxy);
+out:
+ fput(file);
+ return err;
+}
+
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
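The new setns() syscall takes a file descriptor for one of the /proc/<pid>/ns/* files (added by the fs/proc side of this series, not shown here) and switches the caller into that namespace. A userspace sketch joining another process's network namespace; it requires CAP_SYS_ADMIN, and __NR_setns is only available from kernel headers that include this patch (glibc grew a setns() wrapper later).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (syscall(__NR_setns, fd, 0) < 0) {	/* nstype 0: accept any type */
		perror("setns");
		return 1;
	}
	close(fd);
	/* Now running in the target's network namespace. */
	execlp("sh", "sh", (char *)NULL);
	perror("execlp");
	return 1;
}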
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0da058bff8eb..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/platform_device.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/uaccess.h>
@@ -53,11 +54,17 @@ enum pm_qos_type {
PM_QOS_MIN /* return the smallest value */
};
+/*
+ * Note: The lockless read path depends on the CPU accessing
+ * target_value atomically. Atomic access is only guaranteed on all CPU
+ * types Linux supports for 32-bit quantities
+ */
struct pm_qos_object {
struct plist_head requests;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
+ s32 target_value; /* Do not change to 64 bit */
s32 default_value;
enum pm_qos_type type;
};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
- .default_value = 2000 * USEC_PER_SEC,
+ .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
};
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
.notifiers = &network_lat_notifier,
.name = "network_latency",
- .default_value = 2000 * USEC_PER_SEC,
+ .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN
};
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
- .default_value = 0,
+ .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
};
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
}
}
+static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+{
+ return o->target_value;
+}
+
+static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+{
+ o->target_value = value;
+}
+
static void update_target(struct pm_qos_object *o, struct plist_node *node,
int del, int value)
{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
plist_add(node, &o->requests);
}
curr_value = pm_qos_get_value(o);
+ pm_qos_set_value(o, curr_value);
spin_unlock_irqrestore(&pm_qos_lock, flags);
if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
* pm_qos_request - returns current system wide qos expectation
* @pm_qos_class: identification of which qos value is requested
*
- * This function returns the current target value in an atomic manner.
+ * This function returns the current target value.
*/
int pm_qos_request(int pm_qos_class)
{
- unsigned long flags;
- int value;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
-
- return value;
+ return pm_qos_read_value(pm_qos_array[pm_qos_class]);
}
EXPORT_SYMBOL_GPL(pm_qos_request);
@@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
s32 value;
unsigned long flags;
struct pm_qos_object *o;
- struct pm_qos_request_list *pm_qos_req = filp->private_data;;
+ struct pm_qos_request_list *pm_qos_req = filp->private_data;
if (!pm_qos_req)
return -EINVAL;
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
s32 value;
- int x;
- char ascii_value[11];
struct pm_qos_request_list *pm_qos_req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else if (count == 11) { /* len('0x12345678/0') */
- if (copy_from_user(ascii_value, buf, 11))
+ } else if (count <= 11) { /* ASCII perhaps? */
+ char ascii_value[11];
+ unsigned long int ulval;
+ int ret;
+
+ if (copy_from_user(ascii_value, buf, count))
return -EFAULT;
- if (strlen(ascii_value) != 10)
- return -EINVAL;
- x = sscanf(ascii_value, "%x", &value);
- if (x != 1)
+
+ if (count > 10) {
+ if (ascii_value[10] == '\n')
+ ascii_value[10] = '\0';
+ else
+ return -EINVAL;
+ } else {
+ ascii_value[count] = '\0';
+ }
+ ret = strict_strtoul(ascii_value, 16, &ulval);
+ if (ret) {
+ pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
return -EINVAL;
- pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
- } else
+ }
+ value = (s32)lower_32_bits(ulval);
+ } else {
return -EINVAL;
+ }
pm_qos_req = filp->private_data;
pm_qos_update_request(pm_qos_req, value);
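With the rewritten write path, the PM QoS device nodes accept either a raw binary s32 or an ASCII value parsed as hex via strict_strtoul(). A userspace sketch requesting a 50 microsecond CPU DMA latency bound through /dev/cpu_dma_latency (the sleep is only a stand-in for whatever latency-sensitive work the caller would do):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "0x32\n";	/* parsed as hex: 50 microseconds */
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0) {
		perror("/dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	sleep(60);	/* the request only lasts while the fd is open */
	close(fd);	/* closing drops the request */
	return 0;
}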
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0791b13df7bf..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->nanosleep.index = which_clock;
+ restart_block->nanosleep.clockid = which_clock;
restart_block->nanosleep.rmtp = rmtp;
restart_block->nanosleep.expires = timespec_to_ns(rqtp);
}
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->nanosleep.index;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec t;
struct itimerspec it;
int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5498d7405c3..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
}
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+ struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
#define IT_ID_SET 1
#define IT_ID_NOT_SET 0
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- kmem_cache_free(posix_timers_cache, tmr);
+ call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
}
static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
- /*
- * Watch out here. We do a irqsave on the idr_lock and pass the
- * flags part over to the timer lock. Must not let interrupts in
- * while we are moving the lock.
- */
- spin_lock_irqsave(&idr_lock, *flags);
+
+ rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
- spin_lock(&timr->it_lock);
+ spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
- spin_unlock(&idr_lock);
+ rcu_read_unlock();
return timr;
}
- spin_unlock(&timr->it_lock);
+ spin_unlock_irqrestore(&timr->it_lock, *flags);
}
- spin_unlock_irqrestore(&idr_lock, *flags);
+ rcu_read_unlock();
return NULL;
}
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
*/
long clock_nanosleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->nanosleep.index;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
struct k_clock *kc = clockid_to_kclock(which_clock);
if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f9bec56d8825..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <scsi/scsi_scan.h>
-#include <asm/suspend.h>
#include "power.h"
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
static const struct platform_hibernation_ops *hibernation_ops;
/**
- * hibernation_set_ops - set the global hibernate operations
- * @ops: the hibernation operations to use in subsequent hibernation transitions
+ * hibernation_set_ops - Set the global hibernate operations.
+ * @ops: Hibernation operations to use in subsequent hibernation transitions.
*/
-
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
#endif /* !CONFIG_PM_DEBUG */
/**
- * platform_begin - tell the platform driver that we're starting
- * hibernation
+ * platform_begin - Call platform to start hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
}
/**
- * platform_end - tell the platform driver that we've entered the
- * working state
+ * platform_end - Call platform to finish transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
}
/**
- * platform_pre_snapshot - prepare the machine for hibernation using the
- * platform driver if so configured and return an error code if it fails
+ * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for creating a hibernate image,
+ * if so configured, and return an error code if that fails.
*/
static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
}
/**
- * platform_leave - prepare the machine for switching to the normal mode
- * of operation using the platform driver (called with interrupts disabled)
+ * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the machine for switching to the
+ * normal mode of operation.
+ *
+ * This routine is called on one CPU with interrupts disabled.
*/
-
static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
}
/**
- * platform_finish - switch the machine to the normal mode of operation
- * using the platform driver (must be called after platform_prepare())
+ * platform_finish - Call platform to switch the system to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the machine to the normal mode of
+ * operation.
+ *
+ * This routine must be called after platform_prepare().
*/
-
static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
}
/**
- * platform_pre_restore - prepare the platform for the restoration from a
- * hibernation image. If the restore fails after this function has been
- * called, platform_restore_cleanup() must be called.
+ * platform_pre_restore - Prepare for hibernate image restoration.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for resume from a hibernation
+ * image.
+ *
+ * If the restore fails after this function has been called,
+ * platform_restore_cleanup() must be called.
*/
-
static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
}
/**
- * platform_restore_cleanup - switch the platform to the normal mode of
- * operation after a failing restore. If platform_pre_restore() has been
- * called before the failing restore, this function must be called too,
- * regardless of the result of platform_pre_restore().
+ * platform_restore_cleanup - Switch to the working state after failing restore.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the system to the normal mode of operation
+ * after a failing restore.
+ *
+ * If platform_pre_restore() has been called before the failing restore, this
+ * function must be called too, regardless of the result of
+ * platform_pre_restore().
*/
-
static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
}
/**
- * platform_recover - recover the platform from a failure to suspend
- * devices.
+ * platform_recover - Recover from a failure to suspend devices.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
}
/**
- * swsusp_show_speed - print the time elapsed between two events.
- * @start: Starting event.
- * @stop: Final event.
- * @nr_pages - number of pages processed between @start and @stop
- * @msg - introductory message to print
+ * swsusp_show_speed - Print time elapsed between two events during hibernation.
+ * @start: Starting event.
+ * @stop: Final event.
+ * @nr_pages: Number of memory pages processed between @start and @stop.
+ * @msg: Additional diagnostic message to print.
*/
-
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
}
/**
- * create_image - freeze devices that need to be frozen with interrupts
- * off, create the hibernation image and thaw those devices. Control
- * reappears in this routine after a restore.
+ * create_image - Create a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
+ * and execute the drivers' .thaw_noirq() callbacks.
+ *
+ * Control reappears in this routine after the subsequent restore.
*/
-
static int create_image(int platform_mode)
{
int error;
- error = arch_prepare_suspend();
- if (error)
- return error;
-
- /* At this point, dpm_suspend_start() has been called, but *not*
- * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
- * Otherwise, drivers for some devices (e.g. interrupt controllers)
- * become desynchronized with the actual state of the hardware
- * at resume time, and evil weirdness ensues.
- */
error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -297,9 +303,6 @@ static int create_image(int platform_mode)
Power_up:
syscore_resume();
- /* NOTE: dpm_resume_noirq() is just a resume() for devices
- * that suspended with irqs off ... no overall powerup.
- */
Enable_irqs:
local_irq_enable();
@@ -317,14 +320,11 @@ static int create_image(int platform_mode)
}
/**
- * hibernation_snapshot - quiesce devices and create the hibernation
- * snapshot image.
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the power transition.
+ * hibernation_snapshot - Quiesce devices and create a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held.
*/
-
int hibernation_snapshot(int platform_mode)
{
pm_message_t msg = PMSG_RECOVER;
@@ -384,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
}
/**
- * resume_target_kernel - prepare devices that need to be suspended with
- * interrupts off, restore the contents of highmem that have not been
- * restored yet from the image and run the low level code that will restore
- * the remaining contents of memory and switch to the just restored target
- * kernel.
+ * resume_target_kernel - Restore system state from a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
+ * highmem that have not been restored yet from the image and run the low-level
+ * code that will restore the remaining contents of memory and switch to the
+ * just restored target kernel.
*/
-
static int resume_target_kernel(bool platform_mode)
{
int error;
@@ -416,24 +417,26 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Enable_irqs;
- /* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
if (!error) {
error = swsusp_arch_resume();
/*
* The code below is only ever reached in case of a failure.
- * Otherwise execution continues at place where
- * swsusp_arch_suspend() was called
+ * Otherwise, execution continues at the place where
+ * swsusp_arch_suspend() was called.
*/
BUG_ON(!error);
- /* This call to restore_highmem() undos the previous one */
+ /*
+ * This call to restore_highmem() reverts the changes made by
+ * the previous one.
+ */
restore_highmem();
}
/*
* The only reason why swsusp_arch_resume() can fail is memory being
* very tight, so we have to free it as soon as we can to avoid
- * subsequent failures
+ * subsequent failures.
*/
swsusp_free();
restore_processor_state();
@@ -456,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
}
/**
- * hibernation_restore - quiesce devices and restore the hibernation
- * snapshot image. If successful, control returns in hibernation_snaphot()
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the transition.
+ * hibernation_restore - Quiesce devices and restore from a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held. If it is successful, control
+ * reappears in the restored target kernel in hibernation_snapshot().
*/
-
int hibernation_restore(int platform_mode)
{
int error;
@@ -483,10 +484,8 @@ int hibernation_restore(int platform_mode)
}
/**
- * hibernation_platform_enter - enter the hibernation state using the
- * platform driver (if available)
+ * hibernation_platform_enter - Power off the system using the platform driver.
*/
-
int hibernation_platform_enter(void)
{
int error;
@@ -557,12 +556,12 @@ int hibernation_platform_enter(void)
}
/**
- * power_down - Shut the machine down for hibernation.
+ * power_down - Shut the machine down for hibernation.
*
- * Use the platform driver, if configured so; otherwise try
- * to power off or reboot.
+ * Use the platform driver, if configured, to put the system into the sleep
+ * state corresponding to hibernation, or try to power it off or reboot,
+ * depending on the value of hibernation_mode.
*/
-
static void power_down(void)
{
switch (hibernation_mode) {
@@ -599,9 +598,8 @@ static int prepare_processes(void)
}
/**
- * hibernate - The granpappy of the built-in hibernation management
+ * hibernate - Carry out system hibernation, including saving the image.
*/
-
int hibernate(void)
{
int error;
@@ -679,17 +677,20 @@ int hibernate(void)
/**
- * software_resume - Resume from a saved image.
+ * software_resume - Resume from a saved hibernation image.
*
- * Called as a late_initcall (so all devices are discovered and
- * initialized), we call swsusp to see if we have a saved image or not.
- * If so, we quiesce devices, the restore the saved image. We will
- * return above (in hibernate() ) if everything goes well.
- * Otherwise, we fail gracefully and return to the normally
- * scheduled program.
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
*
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory are restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
*/
-
static int software_resume(void)
{
int error;
@@ -819,21 +820,17 @@ static const char * const hibernation_modes[] = {
[HIBERNATION_TESTPROC] = "testproc",
};
-/**
- * disk - Control hibernation mode
- *
- * Suspend-to-disk can be handled in several ways. We have a few options
- * for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other hibernation_ops), powering off the system or rebooting the
- * system (for testing) as well as the two test modes.
+/*
+ * /sys/power/disk - Control hibernation mode.
*
- * The system can support 'platform', and that is known a priori (and
- * encoded by the presence of hibernation_ops). However, the user may
- * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
- * test modes, 'test' or 'testproc'.
+ * Hibernation can be handled in several ways. There are a few different ways
+ * to put the system into the sleep state: using the platform driver (e.g. ACPI
+ * or other hibernation_ops), powering it off or rebooting it (for testing
+ * mostly), or using one of the two available test modes.
*
- * show() will display what the mode is currently set to.
- * store() will accept one of
+ * The sysfs file /sys/power/disk provides an interface for selecting the
+ * hibernation mode to use. Reading from this file causes the available modes
+ * to be printed. There are 5 modes that can be supported:
*
* 'platform'
* 'shutdown'
@@ -841,8 +838,14 @@ static const char * const hibernation_modes[] = {
* 'test'
* 'testproc'
*
- * It will only change to 'platform' if the system
- * supports it (as determined by having hibernation_ops).
+ * If a platform hibernation driver is in use, 'platform' will be supported
+ * and will be used by default. Otherwise, 'shutdown' will be used by default.
+ * The selected option (i.e. the one corresponding to the current value of
+ * hibernation_mode) is enclosed in square brackets.
+ *
+ * To select a given hibernation mode it is necessary to write the mode's
+ * string representation (as returned by reading from /sys/power/disk) back
+ * into /sys/power/disk.
*/
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -875,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
return buf-start;
}
-
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
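For completeness, a minimal user-space sketch of the interface described in the comment above: read /sys/power/disk to list the available modes (the selected one is bracketed) and write a mode string back to select it. This is illustrative only, assumes root privileges and a kernel exposing the file, and is not part of the patch.

    #include <stdio.h>

    /* Minimal sketch: list and select a hibernation mode via /sys/power/disk. */
    int main(void)
    {
            char modes[256];
            FILE *f = fopen("/sys/power/disk", "r");

            if (!f || !fgets(modes, sizeof(modes), f)) {
                    perror("/sys/power/disk");
                    return 1;
            }
            fclose(f);
            printf("available modes: %s", modes);  /* selected mode is in [brackets] */

            f = fopen("/sys/power/disk", "w");     /* needs root */
            if (!f) {
                    perror("open for write");
                    return 1;
            }
            if (fputs("shutdown\n", f) == EOF)
                    perror("selecting 'shutdown'");
            fclose(f);
            return 0;
    }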
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ace55889f702..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1211,7 +1211,11 @@ static void free_unnecessary_pages(void)
to_free_highmem = alloc_highmem - save;
} else {
to_free_highmem = 0;
- to_free_normal -= save - alloc_highmem;
+ save -= alloc_highmem;
+ if (to_free_normal > save)
+ to_free_normal -= save;
+ else
+ to_free_normal = 0;
}
memory_bm_position_reset(&copy_bm);
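The hunk above closes an unsigned-underflow hole: when save - alloc_highmem exceeded to_free_normal, the old subtraction wrapped around and far too many pages were freed. A worked example with hypothetical numbers (not taken from the patch):

    /* Hypothetical values, for illustration only:
     *   alloc_highmem = 10, save = 50, to_free_normal = 30
     * Old code:  to_free_normal -= save - alloc_highmem
     *            => 30 - 40 wraps around to a huge unsigned value.
     * New code:  save -= alloc_highmem            => save = 40
     *            to_free_normal (30) <= save (40) => clamp to 0
     * so to_free_normal ends up 0 instead of underflowing. */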
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 7d02d33be699..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
- if (error)
+ if (error) {
+ free_basic_memory_bitmaps();
atomic_inc(&snapshot_device_available);
+ }
data->frozen = 0;
data->ready = 0;
data->platform_support = 0;
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae3..35185392173f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
}
#endif
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
- unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
- if (size > log_buf_len) {
- unsigned start, dest_idx, offset;
- char *new_log_buf;
+ if (size > log_buf_len)
+ new_log_buf_len = size;
- new_log_buf = alloc_bootmem(size);
- if (!new_log_buf) {
- printk(KERN_WARNING "log_buf_len: allocation failed\n");
- goto out;
- }
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
- spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = size;
- log_buf = new_log_buf;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ unsigned start, dest_idx, offset;
+ char *new_log_buf;
+ int free;
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ unsigned long mem;
- printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
+ mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
+ return;
+ new_log_buf = __va(mem);
+ } else {
+ new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
}
-out:
- return 1;
-}
-__setup("log_buf_len=", log_buf_len_setup);
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
+ return;
+ }
+
+ spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_end;
+
+ offset = start = min(con_start, log_start);
+ dest_idx = 0;
+ while (start != log_end) {
+ unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
+
+ log_buf[dest_idx] = __log_buf[log_idx_mask];
+ start++;
+ dest_idx++;
+ }
+ log_start -= offset;
+ con_start -= offset;
+ log_end -= offset;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
#ifdef CONFIG_BOOT_PRINTK_DELAY
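Usage note: log_buf_len_setup() only records the request because it runs too early to allocate; the switch to the larger buffer happens when setup_log_buf() is called once an allocator is available. A hedged sketch of the expected call pattern (the function name example_arch_setup and the exact call site are assumptions, not part of this hunk):

    /* Kernel command line:  log_buf_len=4M   (rounded up to a power of two) */

    void __init example_arch_setup(void)
    {
            /* ... after memblock is initialized ... */
            setup_log_buf(1);       /* early path: allocate via memblock */

            /* A later caller passing 0 would take the bootmem fallback path
             * (alloc_bootmem_nopanic) shown above instead. */
    }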
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
if (prof_buffer)
return 0;
- prof_buffer = vmalloc(buffer_bytes);
- if (prof_buffer) {
- memset(prof_buffer, 0, buffer_bytes);
+ prof_buffer = vzalloc(buffer_bytes);
+ if (prof_buffer)
return 0;
- }
free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
mutex_unlock(&profile_flip_mutex);
}
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu;
struct profile_hit *hits;
- if (prof_on != type || !prof_buffer)
- return;
pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
#define profile_discard_flip_buffers() do { } while (0)
#define profile_cpu_callback NULL
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
-
- if (prof_on != type || !prof_buffer)
- return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ if (prof_on != type || !prof_buffer)
+ return;
+ do_profile_hits(type, __pc, nr_hits);
+}
EXPORT_SYMBOL_GPL(profile_hits);
void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dc7ab65f3b36..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
#include <linux/uaccess.h>
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
+static int ptrace_trapping_sleep_fn(void *flags)
+{
+ schedule();
+ return 0;
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -38,35 +45,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
child->parent = new_parent;
}
-/*
- * Turn a tracing stop into a normal stop now, since with no tracer there
- * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
- * signal sent that would resume the child, but didn't because it was in
- * TASK_TRACED, resume it now.
- * Requires that irqs be disabled.
- */
-static void ptrace_untrace(struct task_struct *child)
-{
- spin_lock(&child->sighand->siglock);
- if (task_is_traced(child)) {
- /*
- * If the group stop is completed or in progress,
- * this thread was already counted as stopped.
- */
- if (child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)
- __set_task_state(child, TASK_STOPPED);
- else
- signal_wake_up(child, 1);
- }
- spin_unlock(&child->sighand->siglock);
-}
-
-/*
- * unptrace a task: move it back to its original parent and
- * remove it from the ptrace list.
+/**
+ * __ptrace_unlink - unlink ptracee and restore its execution state
+ * @child: ptracee to be unlinked
*
- * Must be called with the tasklist lock write-held.
+ * Remove @child from the ptrace list, move it back to the original parent,
+ * and restore the execution state so that it conforms to the group stop
+ * state.
+ *
+ * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
+ * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
+ * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
+ * If the ptracer is exiting, the ptracee can be in any state.
+ *
+ * After detach, the ptracee should be in a state which conforms to the
+ * group stop. If the group is stopped or in the process of stopping, the
+ * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
+ * up from TASK_TRACED.
+ *
+ * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
+ * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
+ * to but in the opposite direction of what happens while attaching to a
+ * stopped task. However, in this direction, the intermediate RUNNING
+ * state is not hidden even from the current ptracer and if it immediately
+ * re-attaches and performs a WNOHANG wait(2), it may fail.
+ *
+ * CONTEXT:
+ * write_lock_irq(tasklist_lock)
*/
void __ptrace_unlink(struct task_struct *child)
{
@@ -76,14 +81,54 @@ void __ptrace_unlink(struct task_struct *child)
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
- if (task_is_traced(child))
- ptrace_untrace(child);
+ spin_lock(&child->sighand->siglock);
+
+ /*
+ * Clear all pending traps and TRAPPING. TRAPPING should be
+ * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
+ */
+ task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+ task_clear_jobctl_trapping(child);
+
+ /*
+ * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
+ * @child isn't dead.
+ */
+ if (!(child->flags & PF_EXITING) &&
+ (child->signal->flags & SIGNAL_STOP_STOPPED ||
+ child->signal->group_stop_count))
+ child->jobctl |= JOBCTL_STOP_PENDING;
+
+ /*
+ * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
+ * @child in the butt. Note that @resume should be used iff @child
+ * is in TASK_TRACED; otherwise, we might unduly disrupt
+ * TASK_KILLABLE sleeps.
+ */
+ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
+ signal_wake_up(child, task_is_traced(child));
+
+ spin_unlock(&child->sighand->siglock);
}
-/*
- * Check that we have indeed attached to the thing..
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations. If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing. If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if %child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, int kill)
+int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
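To make the TRACED -> RUNNING -> STOPPED caveat above concrete, here is a hedged user-space sketch of the situation the comment warns about: a tracer that detaches from a group-stopped task and immediately re-attaches may see waitpid(WNOHANG) return 0 while the tracee is still passing through the transient RUNNING state. Illustrative only; error handling trimmed.

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Sketch only: demonstrates the documented transient-RUNNING window. */
    static void detach_then_reattach(pid_t pid)
    {
            int status;

            ptrace(PTRACE_DETACH, pid, 0, 0);   /* tracee heads back to STOPPED */
            ptrace(PTRACE_ATTACH, pid, 0, 0);   /* immediate re-attach */

            /* May return 0 (nothing to report yet) if the tracee is still in
             * the intermediate RUNNING state described above. */
            waitpid(pid, &status, WNOHANG);
    }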
@@ -96,21 +141,20 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current) {
- ret = 0;
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
spin_lock_irq(&child->sighand->siglock);
- if (task_is_stopped(child))
- child->state = TASK_TRACED;
- else if (!task_is_traced(child) && !kill)
- ret = -ESRCH;
+ WARN_ON_ONCE(task_is_stopped(child));
+ if (ignore_state || (task_is_traced(child) &&
+ !(child->jobctl & JOBCTL_LISTENING)))
+ ret = 0;
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !kill)
+ if (!ret && !ignore_state)
ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
/* All systems go.. */
@@ -167,10 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
-static int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long flags)
{
+ bool seize = (request == PTRACE_SEIZE);
int retval;
+ /*
+ * SEIZE will enable new ptrace behaviors which will be implemented
+ * gradually. SEIZE_DEVEL is used to prevent applications
+ * expecting full SEIZE behaviors trapping on kernel commits which
+ * are still in the process of implementing them.
+ *
+ * Only test programs for new ptrace behaviors being implemented
+ * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
+ *
+ * Once SEIZE behaviors are completely implemented, this flag and
+ * the following test will be removed.
+ */
+ retval = -EIO;
+ if (seize && !(flags & PTRACE_SEIZE_DEVEL))
+ goto out;
+
audit_ptrace(task);
retval = -EPERM;
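A hedged illustration of the SEIZE_DEVEL gate described above: until the new behaviors are finalized, user space must pass PTRACE_SEIZE_DEVEL in the data argument or the request fails with -EIO. The constants require ptrace headers from a tree carrying this series; this is a sketch, not part of the patch.

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <stdio.h>

    /* Sketch: PTRACE_SEIZE / PTRACE_SEIZE_DEVEL come from this patch series. */
    static int seize_task(pid_t pid)
    {
            /* The flag travels in the 'data' argument of the ptrace() call. */
            if (ptrace(PTRACE_SEIZE, pid, 0, (void *)PTRACE_SEIZE_DEVEL) == -1) {
                    perror("PTRACE_SEIZE");     /* EIO without SEIZE_DEVEL */
                    return -1;
            }
            /* Unlike PTRACE_ATTACH, no SIGSTOP is sent and no initial trap
             * is taken on attach. */
            return 0;
    }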
@@ -202,11 +264,41 @@ static int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
+ if (seize)
+ task->ptrace |= PT_SEIZED;
if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
- send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+ spin_lock(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ signal_wake_up(task, 1);
+
+ spin_unlock(&task->sighand->siglock);
retval = 0;
unlock_tasklist:
@@ -214,6 +306,12 @@ unlock_tasklist:
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
+ if (!retval) {
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
+ ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
+ }
+
return retval;
}
@@ -276,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
+ bool dead;
+
__ptrace_unlink(p);
- if (p->exit_state == EXIT_ZOMBIE) {
- if (!task_detached(p) && thread_group_empty(p)) {
- if (!same_thread_group(p->real_parent, tracer))
- do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
- __wake_up_parent(p, tracer);
- p->exit_signal = -1;
- }
- }
- if (task_detached(p)) {
- /* Mark it as in the process of being reaped. */
- p->exit_state = EXIT_DEAD;
- return true;
+ if (p->exit_state != EXIT_ZOMBIE)
+ return false;
+
+ dead = !thread_group_leader(p);
+
+ if (!dead && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, tracer))
+ dead = do_notify_parent(p, p->exit_signal);
+ else if (ignoring_children(tracer->sighand)) {
+ __wake_up_parent(p, tracer);
+ dead = true;
}
}
-
- return false;
+ /* Mark it as in the process of being reaped. */
+ if (dead)
+ p->exit_state = EXIT_DEAD;
+ return dead;
}
static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -316,11 +416,10 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
if (child->ptrace) {
child->exit_code = data;
dead = __ptrace_detach(current, child);
- if (!child->exit_state)
- wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
+ proc_ptrace_connector(child, PTRACE_DETACH);
if (unlikely(dead))
release_task(child);
@@ -518,7 +617,7 @@ static int ptrace_resume(struct task_struct *child, long request,
}
child->exit_code = data;
- wake_up_process(child);
+ wake_up_state(child, __TASK_TRACED);
return 0;
}
@@ -567,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
+ bool seized = child->ptrace & PT_SEIZED;
int ret = -EIO;
- siginfo_t siginfo;
+ siginfo_t siginfo, *si;
void __user *datavp = (void __user *) data;
unsigned long __user *datalp = datavp;
+ unsigned long flags;
switch (request) {
case PTRACE_PEEKTEXT:
@@ -603,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
+ case PTRACE_INTERRUPT:
+ /*
+ * Stop tracee without any side-effect on signal or job
+ * control. At least one trap is guaranteed to happen
+ * after this request. If @child is already trapped, the
+ * current trap is not disturbed and another trap will
+ * happen after the current trap is ended with PTRACE_CONT.
+ *
+ * The actual trap might not be PTRACE_EVENT_STOP trap but
+ * the pending condition is cleared regardless.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ /*
+ * INTERRUPT doesn't disturb existing trap sans one
+ * exception. If ptracer issued LISTEN for the current
+ * STOP, this INTERRUPT should clear LISTEN and re-trap
+ * tracee into STOP.
+ */
+ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+ signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
+ case PTRACE_LISTEN:
+ /*
+ * Listen for events. Tracee must be in STOP. It's not
+ * resumed per-se but is not considered to be in TRACED by
+ * wait(2) or ptrace(2). If an async event (e.g. group
+ * stop state change) happens, tracee will enter STOP trap
+ * again. Alternatively, ptracer can issue INTERRUPT to
+ * finish listening and re-trap tracee into STOP.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ si = child->last_siginfo;
+ if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
+ break;
+
+ child->jobctl |= JOBCTL_LISTENING;
+
+ /*
+ * If NOTIFY is set, it means event happened between start
+ * of this trap and now. Trigger re-trap immediately.
+ */
+ if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+ signal_wake_up(child, true);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
case PTRACE_DETACH: /* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
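Tying the two new requests together, a hedged sketch of how a SEIZE-based tracer might use them, following the semantics documented above: PTRACE_INTERRUPT guarantees a subsequent trap, and PTRACE_LISTEN parks a tracee that is in a PTRACE_EVENT_STOP trap so that async events re-trap it. Illustrative only; assumes the tracee was attached with PTRACE_SEIZE and omits error handling.

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Sketch only: requires ptrace headers carrying this series. */
    static void poke_then_listen(pid_t pid)
    {
            int status;

            /* Guaranteed to produce a trap without disturbing signals or
             * job control. */
            ptrace(PTRACE_INTERRUPT, pid, 0, 0);
            waitpid(pid, &status, 0);

            /* Only valid while the current trap is a PTRACE_EVENT_STOP trap
             * (see the si_code check above).  The tracee stops being reported
             * as TRACED but re-traps on async events or on a later
             * PTRACE_INTERRUPT. */
            ptrace(PTRACE_LISTEN, pid, 0, 0);
    }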
@@ -717,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -728,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (ret < 0)
goto out_put_task_struct;
@@ -859,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -870,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (!ret)
ret = compat_arch_ptrace(child, request, addr, data);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 421abfd3641d..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/time.h>
#include <linux/cpu.h>
+#include <linux/prefetch.h>
/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
static struct task_struct *rcu_kthread_task;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e486f7c3ffb8..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/nmi.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
@@ -49,6 +49,7 @@
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
+#include <linux/prefetch.h>
#include "rcutree.h"
@@ -83,10 +84,35 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned. So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimized synchronize_sched() to a simple barrier(). When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods. This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
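As a conceptual illustration of the comment above (a sketch, not code from this patch): while rcu_scheduler_active is still zero there is only one task, so nothing can be inside a read-side critical section and a grace-period wait can degenerate to a compiler barrier.

    /* Conceptual sketch only -- hypothetical name, not the real implementation. */
    void example_synchronize_sched(void)
    {
            if (!rcu_scheduler_active) {
                    barrier();      /* single task: grace period is trivially over */
                    return;
            }
            /* ... otherwise wait for a real grace period ... */
    }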
/*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks. So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one. We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
+#ifdef CONFIG_RCU_BOOST
+
+/*
* Control variables for per-CPU and per-rcu_node kthreads. These
* handle all flavors of RCU.
*/
@@ -94,12 +120,13 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-static char rcu_kthreads_spawnable;
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
-static void invoke_rcu_cpu_kthread(void);
+static void invoke_rcu_core(void);
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
@@ -162,7 +189,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
- .dynticks = 1,
+ .dynticks = ATOMIC_INIT(1),
};
#endif /* #ifdef CONFIG_NO_HZ */
@@ -321,13 +348,25 @@ void rcu_enter_nohz(void)
unsigned long flags;
struct rcu_dynticks *rdtp;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
+ if (--rdtp->dynticks_nesting) {
+ local_irq_restore(flags);
+ return;
+ }
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
local_irq_restore(flags);
+
+ /* If the interrupt queued a callback, get out of dyntick mode. */
+ if (in_irq() &&
+ (__get_cpu_var(rcu_sched_data).nxtlist ||
+ __get_cpu_var(rcu_bh_data).nxtlist ||
+ rcu_preempt_needs_cpu(smp_processor_id())))
+ set_need_resched();
}
/*
@@ -343,11 +382,16 @@ void rcu_exit_nohz(void)
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+ if (rdtp->dynticks_nesting++) {
+ local_irq_restore(flags);
+ return;
+ }
+ smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
local_irq_restore(flags);
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
/**
@@ -361,11 +405,15 @@ void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 &&
+ (atomic_read(&rdtp->dynticks) & 0x1))
return;
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rdtp->dynticks_nmi_nesting++;
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
/**
@@ -379,11 +427,14 @@ void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 ||
+ --rdtp->dynticks_nmi_nesting != 0)
return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
/**
@@ -394,13 +445,7 @@ void rcu_nmi_exit(void)
*/
void rcu_irq_enter(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (rdtp->dynticks_nesting++)
- return;
- rdtp->dynticks++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rcu_exit_nohz();
}
/**
@@ -412,18 +457,7 @@ void rcu_irq_enter(void)
*/
void rcu_irq_exit(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (--rdtp->dynticks_nesting)
- return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks++;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
-
- /* If the interrupt queued a callback, get out of dyntick mode. */
- if (__this_cpu_read(rcu_sched_data.nxtlist) ||
- __this_cpu_read(rcu_bh_data.nxtlist))
- set_need_resched();
+ rcu_enter_nohz();
}
#ifdef CONFIG_SMP
@@ -435,19 +469,8 @@ void rcu_irq_exit(void)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- int ret;
- int snap;
- int snap_nmi;
-
- snap = rdp->dynticks->dynticks;
- snap_nmi = rdp->dynticks->dynticks_nmi;
- smp_mb(); /* Order sampling of snap with end of grace period. */
- rdp->dynticks_snap = snap;
- rdp->dynticks_nmi_snap = snap_nmi;
- ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
- if (ret)
- rdp->dynticks_fqs++;
- return ret;
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ return 0;
}
/*
@@ -458,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
- long curr;
- long curr_nmi;
- long snap;
- long snap_nmi;
+ unsigned long curr;
+ unsigned long snap;
- curr = rdp->dynticks->dynticks;
- snap = rdp->dynticks_snap;
- curr_nmi = rdp->dynticks->dynticks_nmi;
- snap_nmi = rdp->dynticks_nmi_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+ curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned long)rdp->dynticks_snap;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -477,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr != snap || (curr & 0x1) == 0) &&
- (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+ if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
rdp->dynticks_fqs++;
return 1;
}
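A short worked example of the quiescent-state test above, with hypothetical counter values: the counter is even while the CPU is in dyntick-idle and odd otherwise, so either an even current value or an advance of at least two since the snapshot proves the CPU passed through idle during the grace period.

    /* Hypothetical values, for illustration only:
     *   snap = 5 (odd: the CPU was not idle when sampled)
     *   curr = 5 -> neither test fires: no quiescent state yet.
     *   curr = 6 -> (curr & 0x1) == 0: the CPU is idle right now -> QS.
     *   curr = 7 -> ULONG_CMP_GE(curr, snap + 2): the CPU entered and left
     *               idle since the snapshot -> QS. */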
@@ -907,6 +924,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
unsigned long gp_duration;
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+ /*
+ * Ensure that all grace-period and pre-grace-period activity
+ * is seen before the assignment to rsp->completed.
+ */
+ smp_mb(); /* See above block comment. */
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
@@ -1092,14 +1115,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp;
- struct task_struct *t;
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
+ rcu_stop_cpu_kthread(cpu);
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1235,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Re-raise the RCU softirq if there are callbacks remaining. */
if (cpu_has_callbacks_ready_to_invoke(rdp))
- invoke_rcu_cpu_kthread();
+ invoke_rcu_core();
}
/*
@@ -1281,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
}
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
- invoke_rcu_cpu_kthread();
+ invoke_rcu_core();
}
#ifdef CONFIG_SMP
@@ -1446,33 +1463,20 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* If there are callbacks ready, invoke them. */
- rcu_do_batch(rsp, rdp);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_callbacks(rsp, rdp);
}
/*
* Do softirq processing for the current CPU.
*/
-static void rcu_process_callbacks(void)
+static void rcu_process_callbacks(struct softirq_action *unused)
{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be seen before any RCU
- * grace-period manipulations below.
- */
- smp_mb(); /* See above block comment. */
-
__rcu_process_callbacks(&rcu_sched_state,
&__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_process_callbacks();
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be seen after any RCU
- * grace-period manipulations above.
- */
- smp_mb(); /* See above block comment. */
-
/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
rcu_needs_cpu_flush();
}
@@ -1483,341 +1487,22 @@ static void rcu_process_callbacks(void)
* the current CPU with interrupts disabled, the rcu_cpu_kthread_task
* cannot disappear out from under us.
*/
-static void invoke_rcu_cpu_kthread(void)
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
- local_irq_restore(flags);
+ if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
return;
- }
- wake_up(&__get_cpu_var(rcu_cpu_wq));
- local_irq_restore(flags);
-}
-
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
- struct task_struct *t;
-
- t = rnp->node_kthread_task;
- if (t != NULL)
- wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
- int policy;
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
+ if (likely(!rsp->boost)) {
+ rcu_do_batch(rsp, rdp);
return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
}
- sched_setscheduler_nocheck(t, policy, &sp);
-}
-
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
- unsigned long flags;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
- struct rcu_node *rnp = rdp->mynode;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->wakemask |= rdp->grpmask;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- invoke_rcu_node_kthread(rnp);
+ invoke_rcu_callbacks_kthread();
}
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+static void invoke_rcu_core(void)
{
- struct sched_param sp;
- struct timer_list yield_timer;
-
- setup_timer_on_stack(&yield_timer, f, arg);
- mod_timer(&yield_timer, jiffies + 2);
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
- set_user_nice(current, 19);
- schedule();
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
- del_timer(&yield_timer);
+ raise_softirq(RCU_SOFTIRQ);
}
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
- * earlier RCU softirq.
- */
-static int rcu_cpu_kthread(void *arg)
-{
- int cpu = (int)(long)arg;
- unsigned long flags;
- int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
- char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
-
- for (;;) {
- *statusp = RCU_KTHREAD_WAITING;
- wait_event_interruptible(*wqp,
- *workp != 0 || kthread_should_stop());
- local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
- local_bh_enable();
- break;
- }
- *statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
- local_irq_save(flags);
- work = *workp;
- *workp = 0;
- local_irq_restore(flags);
- if (work)
- rcu_process_callbacks();
- local_bh_enable();
- if (*workp != 0)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- *statusp = RCU_KTHREAD_YIELDING;
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
- spincnt = 0;
- }
- }
- *statusp = RCU_KTHREAD_STOPPED;
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_kthreads_spawnable ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed. We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp = (struct rcu_node *)arg;
- struct sched_param sp;
- struct task_struct *t;
-
- for (;;) {
- rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
- rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = rnp->wakemask;
- rnp->wakemask = 0;
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
- if ((mask & 0x1) == 0)
- continue;
- preempt_disable();
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (!cpu_online(cpu) || t == NULL) {
- preempt_enable();
- continue;
- }
- per_cpu(rcu_cpu_has_work, cpu) = 1;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- preempt_enable();
- }
- }
- /* NOTREACHED */
- rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
- return 0;
-}
-
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
- cpumask_var_t cm;
- int cpu;
- unsigned long mask = rnp->qsmaskinit;
-
- if (rnp->node_kthread_task == NULL)
- return;
- if (!alloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- cpumask_clear(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
- if ((mask & 0x1) && cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0) {
- cpumask_setall(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
- cpumask_clear_cpu(cpu, cm);
- WARN_ON_ONCE(cpumask_weight(cm) == 0);
- }
- set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
- rcu_boost_kthread_setaffinity(rnp, cm);
- free_cpumask_var(cm);
-}
-
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held. So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
-{
- unsigned long flags;
- int rnp_index = rnp - &rsp->node[0];
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_kthreads_spawnable ||
- rnp->qsmaskinit == 0)
- return 0;
- if (rnp->node_kthread_task == NULL) {
- t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->node_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- wake_up_process(t);
- sp.sched_priority = 99;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- }
- return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
-
-/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
- */
-static int __init rcu_spawn_kthreads(void)
-{
- int cpu;
- struct rcu_node *rnp;
-
- rcu_kthreads_spawnable = 1;
- for_each_possible_cpu(cpu) {
- init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
- rnp = rcu_get_root(rcu_state);
- init_waitqueue_head(&rnp->node_wq);
- rcu_init_boost_waitqueue(rnp);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- if (NUM_RCU_NODES > 1)
- rcu_for_each_leaf_node(rcu_state, rnp) {
- init_waitqueue_head(&rnp->node_wq);
- rcu_init_boost_waitqueue(rnp);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
- return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_state *rsp)
@@ -2217,26 +1902,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
-static void __cpuinit rcu_online_cpu(int cpu)
+static void __cpuinit rcu_prepare_cpu(int cpu)
{
rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
rcu_preempt_init_percpu_data(cpu);
}
-static void __cpuinit rcu_online_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_kthreads_spawnable) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
-}
-
/*
* Handle CPU online/offline notification events.
*/
@@ -2250,8 +1922,8 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- rcu_online_cpu(cpu);
- rcu_online_kthreads(cpu);
+ rcu_prepare_cpu(cpu);
+ rcu_prepare_kthreads(cpu);
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
@@ -2401,6 +2073,7 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 257664815d5d..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- int dynticks_nesting; /* Track nesting level, sort of. */
- int dynticks; /* Even value for dynticks-idle, else odd. */
- int dynticks_nmi; /* Even value for either dynticks-idle or */
- /* not in nmi handler, else odd. So this */
- /* remains even for nmi from irq handler. */
+ int dynticks_nesting; /* Track irq/process nesting level. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
};
/* RCU's kthread states for tracing. */
@@ -121,7 +119,9 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
- unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
+ atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
+ /* Since this has meaning only for leaf */
+ /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -159,9 +159,6 @@ struct rcu_node {
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
- wait_queue_head_t boost_wq;
- /* Wait queue on which to park the boost */
- /* kthread. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
unsigned long n_tasks_boosted;
@@ -188,9 +185,6 @@ struct rcu_node {
/* kthread that takes care of this rcu_node */
/* structure, for example, awakening the */
/* per-CPU kthreads as needed. */
- wait_queue_head_t node_wq;
- /* Wait queue on which to park the per-node */
- /* kthread. */
unsigned int node_kthread_status;
/* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
@@ -284,7 +278,6 @@ struct rcu_data {
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
#endif /* #ifdef CONFIG_NO_HZ */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -337,6 +330,16 @@ struct rcu_data {
/* scheduling clock irq */
/* before ratting on them. */
+#define rcu_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -366,6 +369,7 @@ struct rcu_state {
/* period because */
/* force_quiescent_state() */
/* was running. */
+ u8 boost; /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
@@ -423,6 +427,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
+static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
@@ -446,13 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
+static void invoke_rcu_callbacks_kthread(void);
+#ifdef CONFIG_RCU_BOOST
+static void rcu_preempt_do_callbacks(void);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
cpumask_var_t cm);
-static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp,
int rnp_index);
+static void invoke_rcu_node_kthread(struct rcu_node *rnp);
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
+static void __cpuinit rcu_prepare_kthreads(int cpu);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3f6559a5f5cd..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
+static void rcu_read_unlock_special(struct task_struct *t);
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
/*
@@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu)
struct rcu_data *rdp;
struct rcu_node *rnp;
- if (t->rcu_read_lock_nesting &&
+ if (t->rcu_read_lock_nesting > 0 &&
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
@@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu)
rnp->gp_tasks = &t->rcu_node_entry;
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else if (t->rcu_read_lock_nesting < 0 &&
+ t->rcu_read_unlock_special) {
+
+ /*
+ * Complete exit from RCU read-side critical section on
+ * behalf of preempted instance of __rcu_read_unlock().
+ */
+ rcu_read_unlock_special(t);
}
/*
@@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
-static void rcu_read_unlock_special(struct task_struct *t)
+static noinline void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
@@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block. */
- if (in_irq()) {
+ if (in_irq() || in_serving_softirq()) {
local_irq_restore(flags);
return;
}
@@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
+ /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+ if (t->rcu_boosted) {
+ special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 0;
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
t->rcu_blocked_node = NULL;
@@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
if (special & RCU_READ_UNLOCK_BOOSTED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
rt_mutex_unlock(t->rcu_boost_mutex);
t->rcu_boost_mutex = NULL;
}
@@ -387,13 +400,22 @@ void __rcu_read_unlock(void)
struct task_struct *t = current;
barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
- --t->rcu_read_lock_nesting;
- barrier(); /* decrement before load of ->rcu_read_unlock_special */
- if (t->rcu_read_lock_nesting == 0 &&
- unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
+ if (t->rcu_read_lock_nesting != 1)
+ --t->rcu_read_lock_nesting;
+ else {
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu)
rcu_preempt_qs(cpu);
return;
}
- if (per_cpu(rcu_preempt_data, cpu).qs_pending)
+ if (t->rcu_read_lock_nesting > 0 &&
+ per_cpu(rcu_preempt_data, cpu).qs_pending)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
@@ -602,6 +625,15 @@ static void rcu_preempt_process_callbacks(void)
&__get_cpu_var(rcu_preempt_data));
}
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_preempt_do_callbacks(void)
+{
+ rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
/*
* Queue a preemptible-RCU callback for invocation after a grace period.
*/
@@ -686,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock_irqsave(&rnp->lock, flags);
for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp))
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
break;
+ }
if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
wake_up(&sync_rcu_preempt_exp_wq);
break;
}
@@ -698,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock(&rnp->lock); /* irqs already disabled */
rnp->expmask &= ~mask;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -1165,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp)
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 1;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -1196,8 +1230,7 @@ static int rcu_boost_kthread(void *arg)
for (;;) {
rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
- wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
- rnp->exp_tasks);
+ rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
more2boost = rcu_boost(rnp);
if (more2boost)
@@ -1250,6 +1283,23 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
/*
+ * Wake up the per-CPU kthread to invoke RCU callbacks.
+ */
+static void invoke_rcu_callbacks_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
+ local_irq_restore(flags);
+ return;
+ }
+ wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ local_irq_restore(flags);
+}
+
+/*
* Set the affinity of the boost kthread. The CPU-hotplug locks are
* held, so no one should be messing with the existence of the boost
* kthread.
@@ -1275,14 +1325,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
}
/*
- * Initialize the RCU-boost waitqueue.
- */
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
-{
- init_waitqueue_head(&rnp->boost_wq);
-}
-
-/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
@@ -1297,6 +1339,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
if (&rcu_preempt_state != rsp)
return 0;
+ rsp->boost = 1;
if (rnp->boost_kthread_task != NULL)
return 0;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
@@ -1306,12 +1349,376 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
raw_spin_lock_irqsave(&rnp->lock, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- wake_up_process(t);
sp.sched_priority = RCU_KTHREAD_PRIO;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
}
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Stop RCU's per-CPU kthread when its CPU goes offline.
+ */
+static void rcu_stop_cpu_kthread(int cpu)
+{
+ struct task_struct *t;
+
+ /* Stop the CPU's kthread. */
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t != NULL) {
+ per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
+ kthread_stop(t);
+ }
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_kthread_do_work(void)
+{
+ rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
+ rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+ rcu_preempt_do_callbacks();
+}
+
+/*
+ * Wake up the specified per-rcu_node-structure kthread.
+ * Because the per-rcu_node kthreads are immortal, we don't need
+ * to do anything to keep them alive.
+ */
+static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+
+ t = rnp->node_kthread_task;
+ if (t != NULL)
+ wake_up_process(t);
+}
+
+/*
+ * Set the specified CPU's kthread to run RT or not, as specified by
+ * the to_rt argument. The CPU-hotplug locks are held, so the task
+ * is not going away.
+ */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+{
+ int policy;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t == NULL)
+ return;
+ if (to_rt) {
+ policy = SCHED_FIFO;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ } else {
+ policy = SCHED_NORMAL;
+ sp.sched_priority = 0;
+ }
+ sched_setscheduler_nocheck(t, policy, &sp);
+}
+
+/*
+ * Timer handler to initiate the waking up of per-CPU kthreads that
+ * have yielded the CPU due to excess numbers of RCU callbacks.
+ * We wake up the per-rcu_node kthread, which in turn will wake up
+ * the booster kthread.
+ */
+static void rcu_cpu_kthread_timer(unsigned long arg)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
+ struct rcu_node *rnp = rdp->mynode;
+
+ atomic_or(rdp->grpmask, &rnp->wakemask);
+ invoke_rcu_node_kthread(rnp);
+}
+
+/*
+ * Drop to non-real-time priority and yield, but only after posting a
+ * timer that will cause us to regain our real-time priority if we
+ * remain preempted. Either way, we restore our real-time priority
+ * before returning.
+ */
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+{
+ struct sched_param sp;
+ struct timer_list yield_timer;
+
+ setup_timer_on_stack(&yield_timer, f, arg);
+ mod_timer(&yield_timer, jiffies + 2);
+ sp.sched_priority = 0;
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
+ set_user_nice(current, 19);
+ schedule();
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ del_timer(&yield_timer);
+}
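rcu_yield() above combines an on-stack timer with a temporary drop to SCHED_NORMAL: the timer is a safety net that restores forward progress if the yielding kthread stays preempted too long. A minimal sketch of that on-stack-timer pattern follows; the function names and the RT priority are invented for illustration and are not part of this patch.

#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

/* Illustrative only: callback name and priority are made up. */
static void example_regain_rt(unsigned long arg)
{
        /* Kick whatever mechanism re-wakes the yielding kthread. */
}

static void example_yield_briefly(void)
{
        struct timer_list t;
        struct sched_param sp;

        setup_timer_on_stack(&t, example_regain_rt, 0);
        mod_timer(&t, jiffies + 2);             /* safety net: fire in ~2 ticks */

        sp.sched_priority = 0;
        sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
        set_user_nice(current, 19);
        schedule();                             /* let lower-priority work run */

        sp.sched_priority = 1;                  /* assumed RT priority */
        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);

        del_timer(&t);
        destroy_timer_on_stack(&t);             /* required for on-stack timers */
}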
+
+/*
+ * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
+ * This can happen while the corresponding CPU is either coming online
+ * or going offline. We cannot wait until the CPU is fully online
+ * before starting the kthread, because the various notifier functions
+ * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
+ * the corresponding CPU is online.
+ *
+ * Return 1 if the kthread needs to stop, 0 otherwise.
+ *
+ * Caller must disable bh. This function can momentarily enable it.
+ */
+static int rcu_cpu_kthread_should_stop(int cpu)
+{
+ while (cpu_is_offline(cpu) ||
+ !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+ smp_processor_id() != cpu) {
+ if (kthread_should_stop())
+ return 1;
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
+ local_bh_enable();
+ schedule_timeout_uninterruptible(1);
+ if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ local_bh_disable();
+ }
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ return 0;
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
+ * earlier RCU softirq.
+ */
+static int rcu_cpu_kthread(void *arg)
+{
+ int cpu = (int)(long)arg;
+ unsigned long flags;
+ int spincnt = 0;
+ unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+ char work;
+ char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+
+ for (;;) {
+ *statusp = RCU_KTHREAD_WAITING;
+ rcu_wait(*workp != 0 || kthread_should_stop());
+ local_bh_disable();
+ if (rcu_cpu_kthread_should_stop(cpu)) {
+ local_bh_enable();
+ break;
+ }
+ *statusp = RCU_KTHREAD_RUNNING;
+ per_cpu(rcu_cpu_kthread_loops, cpu)++;
+ local_irq_save(flags);
+ work = *workp;
+ *workp = 0;
+ local_irq_restore(flags);
+ if (work)
+ rcu_kthread_do_work();
+ local_bh_enable();
+ if (*workp != 0)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ *statusp = RCU_KTHREAD_YIELDING;
+ rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ spincnt = 0;
+ }
+ }
+ *statusp = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Spawn a per-CPU kthread, setting up affinity and priority.
+ * Because the CPU hotplug lock is held, no other CPU will be attempting
+ * to manipulate rcu_cpu_kthread_task. There might be another CPU
+ * attempting to access it during boot, but the locking in kthread_bind()
+ * will enforce sufficient ordering.
+ *
+ * Please note that we cannot simply refuse to wake up the per-CPU
+ * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
+ * which can result in softlockup complaints if the task ends up being
+ * idle for more than a couple of minutes.
+ *
+ * However, please note also that we cannot bind the per-CPU kthread to its
+ * CPU until that CPU is fully online. We also cannot wait until the
+ * CPU is fully online before we create its per-CPU kthread, as this would
+ * deadlock the system when CPU notifiers tried waiting for grace
+ * periods. So we bind the per-CPU kthread to its CPU only if the CPU
+ * is online. If its CPU is not yet fully online, then the code in
+ * rcu_cpu_kthread() will wait until it is fully online, and then do
+ * the binding.
+ */
+static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
+{
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_scheduler_fully_active ||
+ per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
+ return 0;
+ t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ if (cpu_online(cpu))
+ kthread_bind(t, cpu);
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ per_cpu(rcu_cpu_kthread_task, cpu) = t;
+ wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
+ return 0;
+}
+
+/*
+ * Per-rcu_node kthread, which is in charge of waking up the per-CPU
+ * kthreads when needed. We ignore requests to wake up kthreads
+ * for offline CPUs, which is OK because force_quiescent_state()
+ * takes care of this case.
+ */
+static int rcu_node_kthread(void *arg)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ for (;;) {
+ rnp->node_kthread_status = RCU_KTHREAD_WAITING;
+ rcu_wait(atomic_read(&rnp->wakemask) != 0);
+ rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ mask = atomic_xchg(&rnp->wakemask, 0);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
+ if ((mask & 0x1) == 0)
+ continue;
+ preempt_disable();
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (!cpu_online(cpu) || t == NULL) {
+ preempt_enable();
+ continue;
+ }
+ per_cpu(rcu_cpu_has_work, cpu) = 1;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ preempt_enable();
+ }
+ }
+ /* NOTREACHED */
+ rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set; use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+ cpumask_var_t cm;
+ int cpu;
+ unsigned long mask = rnp->qsmaskinit;
+
+ if (rnp->node_kthread_task == NULL)
+ return;
+ if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+ cpumask_clear(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+ if ((mask & 0x1) && cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ if (cpumask_weight(cm) == 0) {
+ cpumask_setall(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
+ cpumask_clear_cpu(cpu, cm);
+ WARN_ON_ONCE(cpumask_weight(cm) == 0);
+ }
+ set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
+ rcu_boost_kthread_setaffinity(rnp, cm);
+ free_cpumask_var(cm);
+}
+
+/*
+ * Spawn a per-rcu_node kthread, setting priority and affinity.
+ * Called during boot before online/offline can happen, or, if
+ * during runtime, with the main CPU-hotplug locks held. So only
+ * one of these can be executing at a time.
+ */
+static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ unsigned long flags;
+ int rnp_index = rnp - &rsp->node[0];
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_scheduler_fully_active ||
+ rnp->qsmaskinit == 0)
+ return 0;
+ if (rnp->node_kthread_task == NULL) {
+ t = kthread_create(rcu_node_kthread, (void *)rnp,
+ "rcun%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rnp->node_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ sp.sched_priority = 99;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+ }
+ return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
+}
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+ int cpu;
+ struct rcu_node *rnp;
+
+ rcu_scheduler_fully_active = 1;
+ for_each_possible_cpu(cpu) {
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ if (cpu_online(cpu))
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ }
+ rnp = rcu_get_root(rcu_state);
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ if (NUM_RCU_NODES > 1) {
+ rcu_for_each_leaf_node(rcu_state, rnp)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+static void __cpuinit rcu_prepare_kthreads(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+ if (rcu_scheduler_fully_active) {
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ if (rnp->node_kthread_task == NULL)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+}
+
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1319,25 +1726,41 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm)
+static void invoke_rcu_callbacks_kthread(void)
{
+ WARN_ON_ONCE(1);
}
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void rcu_stop_cpu_kthread(int cpu)
{
}
-static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index)
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+}
+
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
{
+}
+
+static int __init rcu_scheduler_really_started(void)
+{
+ rcu_scheduler_fully_active = 1;
return 0;
}
+early_initcall(rcu_scheduler_really_started);
+
+static void __cpuinit rcu_prepare_kthreads(int cpu)
+{
+}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
@@ -1513,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
*
* Because it is not legal to invoke rcu_process_callbacks() with irqs
* disabled, we do one pass of force_quiescent_state(), then do an
- * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
+ * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
* later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
*/
int rcu_needs_cpu(int cpu)
{
int c = 0;
int snap;
- int snap_nmi;
int thatcpu;
/* Check for being in the holdoff period. */
@@ -1531,10 +1953,10 @@ int rcu_needs_cpu(int cpu)
for_each_online_cpu(thatcpu) {
if (thatcpu == cpu)
continue;
- snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
- snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+ snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+ thatcpu).dynticks);
smp_mb(); /* Order sampling of snap with end of grace period. */
- if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+ if ((snap & 0x1) != 0) {
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
return rcu_needs_cpu_quick_check(cpu);
@@ -1565,7 +1987,7 @@ int rcu_needs_cpu(int cpu)
/* If RCU callbacks are still pending, RCU still needs this CPU. */
if (c)
- invoke_rcu_cpu_kthread();
+ invoke_rcu_core();
return c;
}
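rcu_needs_cpu() above samples another CPU's consolidated dynticks counter with atomic_add_return(0, ...) and tests bit 0: with this series the counter is bumped on every idle entry and exit, so an odd snapshot means that CPU was not in dyntick-idle. A user-space toy of the even/odd convention; the initial value and field name are assumptions, not the kernel's exact definitions.

#include <stdio.h>

/* Toy model: the counter is incremented on each idle entry and exit. */
struct toy_dynticks {
        long dynticks;          /* odd: CPU active, even: dyntick-idle */
};

static void toy_enter_idle(struct toy_dynticks *d) { d->dynticks++; } /* becomes even */
static void toy_exit_idle(struct toy_dynticks *d)  { d->dynticks++; } /* becomes odd */

static int toy_cpu_active(long snap)
{
        return (snap & 0x1) != 0;       /* mirrors the (snap & 0x1) test above */
}

int main(void)
{
        struct toy_dynticks d = { .dynticks = 1 };      /* assume the CPU starts active */
        long snap = d.dynticks;

        toy_enter_idle(&d);     /* the CPU goes idle after the snapshot */
        printf("active at snapshot=%d active now=%d changed=%d\n",
               toy_cpu_active(snap), toy_cpu_active(d.dynticks), d.dynticks != snap);
        return 0;
}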
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index aa0fd72b4bc7..4e144876dc68 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,8 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
+#ifdef CONFIG_RCU_BOOST
+
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
@@ -58,6 +60,8 @@ static char convert_kthread_status(unsigned int kthread_status)
return "SRWOY"[kthread_status];
}
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
@@ -69,14 +73,14 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->passed_quiesc, rdp->passed_quiesc_completed,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
- seq_printf(m, " dt=%d/%d dn=%d df=%lu",
- rdp->dynticks->dynticks,
+ seq_printf(m, " dt=%d/%d/%d df=%lu",
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
+ seq_printf(m, " ql=%ld qs=%c%c%c%c",
rdp->qlen,
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
rdp->nxttail[RCU_NEXT_TAIL]],
@@ -84,13 +88,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_NEXT_READY_TAIL]],
".W"[rdp->nxttail[RCU_DONE_TAIL] !=
rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " kt=%d/%c/%d ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
- per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
- rdp->blimit);
+ per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ seq_printf(m, " b=%ld", rdp->blimit);
seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -141,24 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
- rdp->dynticks->dynticks,
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
+ seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
rdp->nxttail[RCU_NEXT_TAIL]],
".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
rdp->nxttail[RCU_NEXT_READY_TAIL]],
".W"[rdp->nxttail[RCU_DONE_TAIL] !=
rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, ",%d,\"%c\"",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)),
- rdp->blimit);
+ rdp->cpu)));
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ seq_printf(m, ",%ld", rdp->blimit);
seq_printf(m, ",%lu,%lu,%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -167,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
{
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
#ifdef CONFIG_NO_HZ
- seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
- seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
+ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
+#ifdef CONFIG_RCU_BOOST
+ seq_puts(m, "\"kt\",\"ktl\"");
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
#ifdef CONFIG_TREE_PREEMPT_RCU
seq_puts(m, "\"rcu_preempt:\"\n");
PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
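The qs=%c%c%c%c flags in print_one_rcu_data() rely on the ".N"[cond] idiom: indexing a two-character string literal with a 0-or-1 condition yields '.' when the condition is false and the flag letter when it is true. A stand-alone illustration, with flag meanings invented for the example:

#include <stdio.h>

int main(void)
{
        /* Conditions must be exactly 0 or 1, as the != comparisons above are. */
        int has_next = 1, has_wait = 0;

        printf("qs=%c%c\n", ".N"[has_next], ".W"[has_wait]);   /* prints "qs=N." */
        return 0;
}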
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3ff40178dce7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
+/* constraints to be met while allocating resources */
+struct resource_constraint {
+ resource_size_t min, max, align;
+ resource_size_t (*alignf)(void *, const struct resource *,
+ resource_size_t, resource_size_t);
+ void *alignf_data;
+};
+
static DEFINE_RWLOCK(resource_lock);
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
}
/*
- * Find empty slot in the resource tree given range and alignment.
+ * Find empty slot in the resource tree with the given range and
+ * alignment constraints
*/
-static int find_resource(struct resource *root, struct resource *new,
- resource_size_t size, resource_size_t min,
- resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
- void *alignf_data)
+static int __find_resource(struct resource *root, struct resource *old,
+ struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
{
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
* Skip past an allocated resource that starts at 0, since the assignment
* of this->start - 1 to tmp->end below would cause an underflow.
*/
- if (this && this->start == 0) {
- tmp.start = this->end + 1;
+ if (this && this->start == root->start) {
+ tmp.start = (this == old) ? old->start : this->end + 1;
this = this->sibling;
}
for(;;) {
if (this)
- tmp.end = this->start - 1;
+ tmp.end = (this == old) ? this->end : this->start - 1;
else
tmp.end = root->end;
- resource_clip(&tmp, min, max);
+ resource_clip(&tmp, constraint->min, constraint->max);
arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
avail = *new;
- avail.start = ALIGN(tmp.start, align);
+ avail.start = ALIGN(tmp.start, constraint->align);
avail.end = tmp.end;
if (avail.start >= tmp.start) {
- alloc.start = alignf(alignf_data, &avail, size, align);
+ alloc.start = constraint->alignf(constraint->alignf_data, &avail,
+ size, constraint->align);
alloc.end = alloc.start + size - 1;
if (resource_contains(&avail, &alloc)) {
new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
}
if (!this)
break;
- tmp.start = this->end + 1;
+ if (this != old)
+ tmp.start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
}
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
+{
+ return __find_resource(root, NULL, new, size, constraint);
+}
+
/**
- * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * reallocate_resource - allocate a slot in the resource tree given range & alignment.
+ * The resource will be relocated if it cannot be resized in its current location.
+ *
+ * @root: root resource descriptor
+ * @old: resource descriptor desired by caller
+ * @newsize: new size of the resource descriptor
+ * @constraint: the size and alignment constraints to be met.
+ */
+int reallocate_resource(struct resource *root, struct resource *old,
+ resource_size_t newsize,
+ struct resource_constraint *constraint)
+{
+ int err = 0;
+ struct resource new = *old;
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+
+ if ((err = __find_resource(root, old, &new, newsize, constraint)))
+ goto out;
+
+ if (resource_contains(&new, old)) {
+ old->start = new.start;
+ old->end = new.end;
+ goto out;
+ }
+
+ if (old->child) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (resource_contains(old, &new)) {
+ old->start = new.start;
+ old->end = new.end;
+ } else {
+ __release_resource(old);
+ *old = new;
+ conflict = __request_resource(root, old);
+ BUG_ON(conflict);
+ }
+out:
+ write_unlock(&resource_lock);
+ return err;
+}
+
+
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment.
+ * The resource will be reallocated with a new size if it was already allocated.
* @root: root resource descriptor
* @new: resource descriptor desired by caller
* @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
void *alignf_data)
{
int err;
+ struct resource_constraint constraint;
if (!alignf)
alignf = simple_align_resource;
+ constraint.min = min;
+ constraint.max = max;
+ constraint.align = align;
+ constraint.alignf = alignf;
+ constraint.alignf_data = alignf_data;
+
+ if ( new->parent ) {
+ /* resource is already allocated, try reallocating with
+ the new constraints */
+ return reallocate_resource(root, new, size, &constraint);
+ }
+
write_lock(&resource_lock);
- err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ err = find_resource(root, new, size, &constraint);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index c62acf45d3b9..fde6ff903525 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,7 +292,7 @@ static DEFINE_SPINLOCK(task_group_lock);
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES 2
+#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
@@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p)
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock));
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
@@ -1330,13 +1331,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
- tmp = (u64)delta_exec * weight;
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
if (!lw->inv_weight) {
- if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
else
- lw->inv_weight = WMULT_CONST / lw->weight;
+ lw->inv_weight = WMULT_CONST / w;
}
/*
@@ -1778,17 +1791,20 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (p->policy == SCHED_IDLE) {
- p->se.load.weight = WEIGHT_IDLEPRIO;
- p->se.load.inv_weight = WMULT_IDLEPRIO;
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
return;
}
- p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2185,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see set_task_rq().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
@@ -2432,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
rcu_read_unlock();
}
+
+ if (wake_flags & WF_MIGRATED)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
#endif /* CONFIG_SMP */
schedstat_inc(rq, ttwu_count);
@@ -2440,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (cpu != task_cpu(p))
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
#endif /* CONFIG_SCHEDSTATS */
}
@@ -2517,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
{
struct rq *rq = this_rq();
- struct task_struct *list = xchg(&rq->wake_list, NULL);
-
- if (!list)
- return;
raw_spin_lock(&rq->lock);
@@ -2536,9 +2559,45 @@ static void sched_ttwu_pending(void)
raw_spin_unlock(&rq->lock);
}
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
void scheduler_ipi(void)
{
- sched_ttwu_pending();
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_do_pending(list);
+ irq_exit();
}
static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -2558,14 +2617,34 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
if (!next)
smp_send_reschedule(cpu);
}
-#endif
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
-#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
}
@@ -2616,17 +2695,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
/*
- * If called from interrupt context we could have landed in the
- * middle of schedule(), in this case we should take care not
- * to spin on ->on_cpu if p is current, since that would
- * deadlock.
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
*/
- if (p == current) {
- ttwu_queue(p, cpu);
+ if (ttwu_activate_remote(p, wake_flags))
goto stat;
- }
-#endif
+#else
cpu_relax();
+#endif
}
/*
* Pairs with the smp_wmb() in finish_lock_switch().
@@ -2640,8 +2719,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu)
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
+ }
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
@@ -5826,7 +5907,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5914,6 +5995,16 @@ static inline void sched_init_granularity(void)
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
@@ -5959,12 +6050,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
@@ -6503,7 +6589,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->cpu_power) {
+ if (!group->sgp->power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -6527,9 +6613,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_LOAD_SCALE) {
+ if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
- group->cpu_power);
+ group->sgp->power);
}
group = group->next;
@@ -6720,11 +6806,39 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ kfree(sg->sgp);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
static void free_sched_domain(struct rcu_head *rcu)
{
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- if (atomic_dec_and_test(&sd->groups->ref))
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgp);
kfree(sd->groups);
+ }
kfree(sd);
}
@@ -6891,6 +7005,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
};
struct s_data {
@@ -6910,15 +7025,73 @@ struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+#define SDTL_OVERLAP 0x01
+
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int flags;
struct sd_data data;
};
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *child;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+
+ child = *per_cpu_ptr(sdd->sd, i);
+ if (child->child) {
+ child = child->child;
+ cpumask_copy(sg_span, sched_domain_span(child));
+ } else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ atomic_inc(&sg->sgp->ref);
+
+ if (cpumask_test_cpu(cpu, sg_span))
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6927,24 +7100,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg)
+ if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ }
return cpu;
}
/*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
*/
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
@@ -6952,6 +7125,12 @@ build_sched_groups(struct sched_domain *sd)
struct cpumask *covered;
int i;
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(sched_domain_span(sd)))
+ return 0;
+
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
@@ -6966,7 +7145,7 @@ build_sched_groups(struct sched_domain *sd)
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
+ sg->sgp->power = 0;
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6983,6 +7162,8 @@ build_sched_groups(struct sched_domain *sd)
last = sg;
}
last->next = first;
+
+ return 0;
}
/*
@@ -6997,12 +7178,17 @@ build_sched_groups(struct sched_domain *sd)
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- WARN_ON(!sd || !sd->groups);
+ struct sched_group *sg = sd->groups;
- if (cpu != group_first_cpu(sd->groups))
- return;
+ WARN_ON(!sd || !sg);
- sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_first_cpu(sg))
+ return;
update_group_power(sd, cpu);
}
@@ -7123,15 +7309,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
static void claim_allocations(int cpu, struct sched_domain *sd)
{
struct sd_data *sdd = sd->private;
- struct sched_group *sg = sd->groups;
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (cpu == cpumask_first(sched_group_cpus(sg))) {
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- }
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ *per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
@@ -7156,7 +7342,7 @@ static struct sched_domain_topology_level default_topology[] = {
#endif
{ sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
+ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
{ NULL, },
@@ -7180,9 +7366,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
+ sdd->sgp = alloc_percpu(struct sched_group_power *);
+ if (!sdd->sgp)
+ return -ENOMEM;
+
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
+ struct sched_group_power *sgp;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -7197,6 +7388,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
return -ENOMEM;
*per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgp = kzalloc_node(sizeof(struct sched_group_power),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgp)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgp, j) = sgp;
}
}
@@ -7212,11 +7410,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- kfree(*per_cpu_ptr(sdd->sd, j));
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sg, j));
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
free_percpu(sdd->sg);
+ free_percpu(sdd->sgp);
}
}
@@ -7262,8 +7464,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++)
+ for (tl = sched_domain_topology; tl->init; tl++) {
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
while (sd->child)
sd = sd->child;
@@ -7275,13 +7482,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- get_group(i, sd->private, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (i != cpumask_first(sched_domain_span(sd)))
- continue;
-
- build_sched_groups(sd);
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
}
}
@@ -7703,6 +7910,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7902,7 +8112,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_LOAD_SCALE;
+ rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -8396,10 +8606,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (!tg->se[0])
return -EINVAL;
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
mutex_lock(&shares_mutex);
if (tg->shares == shares)
@@ -8749,42 +8956,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
static void
@@ -8806,14 +8981,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
}
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
struct task_group *tg = cgroup_tg(cgrp);
- return (u64) tg->shares;
+ return (u64) scale_load_down(tg->shares);
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8872,8 +9047,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
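The cpu.shares handlers above now convert between the user-visible weight and the kernel's higher-resolution internal weight with scale_load()/scale_load_down(), and sched_group_set_shares() clamps in the scaled domain. A toy model of that conversion; the 10-bit resolution matches what 64-bit kernels use after this series, while 32-bit kernels keep a resolution of 0, so treat the constant as an assumption.

#include <stdio.h>

#define TOY_LOAD_RESOLUTION     10      /* assumed; 0 on 32-bit kernels */
#define toy_scale_load(w)       ((unsigned long)(w) << TOY_LOAD_RESOLUTION)
#define toy_scale_load_down(w)  ((unsigned long)(w) >> TOY_LOAD_RESOLUTION)

int main(void)
{
        unsigned long shares = 1024;            /* value written to cpu.shares */
        unsigned long internal = toy_scale_load(shares);

        /* MIN_SHARES/MAX_SHARES are clamped in the scaled (internal) domain. */
        printf("user 1024 -> internal %lu -> user %lu\n",
               internal, toy_scale_load_down(internal));
        return 0;
}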
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37f22626225e..c768588e180b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+
+ update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
}
/*
@@ -1584,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
if (local_group) {
this_load = avg_load;
@@ -1722,7 +1723,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
nr_running += cpu_rq(i)->cfs.nr_running;
}
- capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
@@ -2570,7 +2571,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
- return SCHED_LOAD_SCALE;
+ return SCHED_POWER_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2607,10 +2608,10 @@ unsigned long scale_rt_power(int cpu)
available = total - rq->rt_avg;
}
- if (unlikely((s64)total < SCHED_LOAD_SCALE))
- total = SCHED_LOAD_SCALE;
+ if (unlikely((s64)total < SCHED_POWER_SCALE))
+ total = SCHED_POWER_SCALE;
- total >>= SCHED_LOAD_SHIFT;
+ total >>= SCHED_POWER_SHIFT;
return div_u64(available, total);
}
@@ -2618,7 +2619,7 @@ unsigned long scale_rt_power(int cpu)
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_LOAD_SCALE;
+ unsigned long power = SCHED_POWER_SCALE;
struct sched_group *sdg = sd->groups;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2627,26 +2628,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
else
power *= default_scale_smt_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
}
- sdg->cpu_power_orig = power;
+ sdg->sgp->power_orig = power;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
else
power *= default_scale_freq_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
power *= scale_rt_power(cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
if (!power)
power = 1;
cpu_rq(cpu)->cpu_power = power;
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2664,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power += group->cpu_power;
+ power += group->sgp->power;
group = group->next;
} while (group != child->groups);
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
/*
@@ -2682,7 +2683,7 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ * Only siblings can have significantly less than SCHED_POWER_SCALE
*/
if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0;
@@ -2690,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/*
* If ~90% of the cpu_power is still there, we're good.
*/
- if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ if (group->sgp->power * 32 > group->sgp->power_orig * 29)
return 1;
return 0;
@@ -2770,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
/*
* Consider the group unbalanced when the imbalance is larger
@@ -2787,7 +2788,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
+ SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
@@ -2875,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += sg->cpu_power;
+ sds->total_pwr += sg->sgp->power;
/*
* In case the child domain prefers tasks go to siblings
@@ -2960,8 +2962,8 @@ static int check_asym_packing(struct sched_domain *sd,
if (this_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
- SCHED_LOAD_SCALE);
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
+ SCHED_POWER_SCALE);
return 1;
}
@@ -2990,8 +2992,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
cpu_avg_load_per_task(this_cpu);
scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_LOAD_SCALE;
- scaled_busy_load_per_task /= sds->busiest->cpu_power;
+ * SCHED_POWER_SCALE;
+ scaled_busy_load_per_task /= sds->busiest->sgp->power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
@@ -3005,30 +3007,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them.
*/
- pwr_now += sds->busiest->cpu_power *
+ pwr_now += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->cpu_power *
+ pwr_now += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_LOAD_SCALE;
+ pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
- sds->busiest->cpu_power;
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+ sds->busiest->sgp->power;
if (sds->max_load > tmp)
- pwr_move += sds->busiest->cpu_power *
+ pwr_move += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->cpu_power <
- sds->busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = (sds->max_load * sds->busiest->cpu_power) /
- sds->this->cpu_power;
+ if (sds->max_load * sds->busiest->sgp->power <
+ sds->busiest_load_per_task * SCHED_POWER_SCALE)
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
else
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
- sds->this->cpu_power;
- pwr_move += sds->this->cpu_power *
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+ sds->this->sgp->power;
+ pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
@@ -3070,9 +3072,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= sds->busiest->cpu_power;
+ load_above_capacity /= sds->busiest->sgp->power;
}
/*
@@ -3088,9 +3090,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->cpu_power,
- (sds->avg_load - sds->this_load) * sds->this->cpu_power)
- / SCHED_LOAD_SCALE;
+ *imbalance = min(max_pull * sds->busiest->sgp->power,
+ (sds->avg_load - sds->this_load) * sds->this->sgp->power)
+ / SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -3159,7 +3161,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
/*
* If the busiest group is imbalanced the below checks don't
@@ -3238,7 +3240,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power,
+ SCHED_POWER_SCALE);
unsigned long wl;
if (!capacity)
@@ -3263,7 +3266,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
*/
- wl = (wl * SCHED_LOAD_SCALE) / power;
+ wl = (wl * SCHED_POWER_SCALE) / power;
if (wl > max_load) {
max_load = wl;
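The SCHED_LOAD_SCALE to SCHED_POWER_SCALE rename separates the load fixed-point from the cpu-power fixed-point; the power scale remains a 1024-based unit here. A small illustration of the two uses visible above, group capacity and power-adjusted load, with invented numbers; the rounding helper matches the kernel macro for unsigned operands.

#include <stdio.h>

#define TOY_POWER_SCALE         1024UL          /* assumed SCHED_POWER_SCALE */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long group_power = 1178;       /* e.g. two SMT siblings */
        unsigned long group_load  = 2048;

        /* How many tasks the group can run at full speed. */
        unsigned long capacity = DIV_ROUND_CLOSEST(group_power, TOY_POWER_SCALE);

        /* Load normalized by the group's relative CPU power. */
        unsigned long avg_load = group_load * TOY_POWER_SCALE / group_power;

        printf("capacity=%lu avg_load=%lu\n", capacity, avg_load);
        return 0;
}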
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..1e7066d76c26 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
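Adding SCHED_FEAT(FORCE_SD_OVERLAP, 0) here is enough for sched_feat(FORCE_SD_OVERLAP) to work because sched.c includes this file twice with different SCHED_FEAT() definitions, roughly as in the self-contained sketch below; all toy_/TOY_ names are illustrative, not the kernel's.

#include <stdio.h>

/* Stands in for including sched_features.h, whose lines are SCHED_FEAT(name, enabled). */
#define TOY_FEATURES \
        TOY_FEAT(TTWU_QUEUE, 1) \
        TOY_FEAT(FORCE_SD_OVERLAP, 0)

/* First expansion: one enum constant per feature. */
#define TOY_FEAT(name, enabled) __TOY_FEAT_##name,
enum { TOY_FEATURES __TOY_FEAT_NR };
#undef TOY_FEAT

/* Second expansion: default bitmask built from the "enabled" column. */
#define TOY_FEAT(name, enabled) ((1UL << __TOY_FEAT_##name) * (enabled)) |
static const unsigned long toy_features = TOY_FEATURES 0;
#undef TOY_FEAT

#define toy_feat(x)     (toy_features & (1UL << __TOY_FEAT_##x))

int main(void)
{
        printf("TTWU_QUEUE=%d FORCE_SD_OVERLAP=%d\n",
               !!toy_feat(TTWU_QUEUE), !!toy_feat(FORCE_SD_OVERLAP));
        return 0;
}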
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d0..10d018212bab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1096,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !need_resched())
+ if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
#endif
}
@@ -1239,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return -1;
+
if (task->rt.nr_cpus_allowed == 1)
return -1; /* No other targets possible */
@@ -1263,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
int best_cpu;
@@ -1272,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task)
* remote processor.
*/
if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
return this_cpu;
+ }
best_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids)
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
return best_cpu;
+ }
}
}
+ rcu_read_unlock();
/*
* And finally, if there were no matches within the domains
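find_lowest_rq() above prefers this_cpu when it is both acceptable and inside the domain span, and otherwise falls back to cpumask_first_and(); the new rcu_read_unlock() calls make sure every early return drops the read lock taken before the domain loop. A kernel-style sketch of just the cpumask selection, with invented names:

#include <linux/cpumask.h>

/* Illustrative only: pick a CPU in both masks, preferring @this_cpu. */
static int example_pick_cpu(int this_cpu, const struct cpumask *candidates,
                            const struct cpumask *domain_span)
{
        int cpu;

        if (this_cpu != -1 && cpumask_test_cpu(this_cpu, candidates) &&
            cpumask_test_cpu(this_cpu, domain_span))
                return this_cpu;                /* cheapest: stay local */

        cpu = cpumask_first_and(candidates, domain_span);
        if (cpu < nr_cpu_ids)
                return cpu;                     /* first acceptable remote CPU */

        return -1;                              /* no overlap */
}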
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
#ifdef CONFIG_SMP
/* domain-specific stats */
- preempt_disable();
+ rcu_read_lock();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
- preempt_enable();
+ rcu_read_unlock();
#endif
}
kfree(mask_str);
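Here, as in sched_rt.c above, the domain walk is now protected by rcu_read_lock() rather than preempt_disable(): sched domains are freed through an RCU callback (see free_sched_domain() in the sched.c hunks), so an RCU read-side critical section is what actually keeps the structures alive while they are traversed. A generic sketch of that reader-side contract, with illustrative structure and variable names:

#include <linux/rcupdate.h>

/* Illustrative only: an RCU-protected object replaced by updaters. */
struct example_cfg {
        int value;
};

static struct example_cfg __rcu *example_cfg_ptr;

static int example_read_value(void)
{
        struct example_cfg *cfg;
        int val = 0;

        rcu_read_lock();                        /* blocks reclamation, not updates */
        cfg = rcu_dereference(example_cfg_ptr);
        if (cfg)
                val = cfg->value;
        rcu_read_unlock();                      /* after this, cfg may be freed */

        return val;
}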
diff --git a/kernel/signal.c b/kernel/signal.c
index 7165af5f1b11..d7f70aed1cc0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
/*
* Tracers may want to know about even ignored signals.
*/
- return !tracehook_consider_ignored_signal(t, sig);
+ return !t->ptrace;
}
/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
static int recalc_sigpending_tsk(struct task_struct *t)
{
- if (t->signal->group_stop_count > 0 ||
+ if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (unlikely(tracehook_force_sigpending()))
- set_thread_flag(TIF_SIGPENDING);
- else if (!recalc_sigpending_tsk(current) && !freezing(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -223,6 +221,129 @@ static inline void print_dropped_signal(int sig)
current->comm, current->pid, sig);
}
+/**
+ * task_set_jobctl_pending - set jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to set
+ *
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
+ * cleared. If @task is already being killed or exiting, this function
+ * becomes a no-op.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if it became a no-op because @task was dying.
+ */
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+ BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+ JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+ BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+ if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ return false;
+
+ if (mask & JOBCTL_STOP_SIGMASK)
+ task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+ task->jobctl |= mask;
+ return true;
+}
+
+/**
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
+ * @task: target task
+ *
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer. Note that we don't need any further
+ * locking. @task->siglock guarantees that @task->parent points to the
+ * ptracer.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_jobctl_trapping(struct task_struct *task)
+{
+ if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_TRAPPING;
+ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
+ }
+}
+
+/**
+ * task_clear_jobctl_pending - clear jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to clear
+ *
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
+ *
+ * If clearing of @mask leaves no stop or trap pending, this function calls
+ * task_clear_jobctl_trapping().
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+ BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+ if (mask & JOBCTL_STOP_PENDING)
+ mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+ task->jobctl &= ~mask;
+
+ if (!(task->jobctl & JOBCTL_PENDING_MASK))
+ task_clear_jobctl_trapping(task);
+}
+
+/**
+ * task_participate_group_stop - participate in a group stop
+ * @task: task participating in a group stop
+ *
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
+ * Group stop states are cleared and the group stop count is consumed if
+ * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
+ * stop, the appropriate %SIGNAL_* flags are set.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if group stop completion should be notified to the parent, %false
+ * otherwise.
+ */
+static bool task_participate_group_stop(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+ bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
+
+ WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
+
+ task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
+
+ if (!consume)
+ return false;
+
+ if (!WARN_ON_ONCE(sig->group_stop_count == 0))
+ sig->group_stop_count--;
+
+ /*
+ * Tell the caller to notify completion iff we are entering into a
+ * fresh group stop. Read comment in do_signal_stop() for details.
+ */
+ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
+ sig->flags = SIGNAL_STOP_STOPPED;
+ return true;
+ }
+ return false;
+}
+
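+
The jobctl helpers above are only meaningful with the target's siglock held. A hedged sketch of a caller (the helper below is hypothetical; do_signal_stop() later in this diff is the real user):

/* Illustration only: @t->sighand->siglock must already be held. */
static void queue_group_stop_locked(struct task_struct *t, int signr)
{
	const unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;

	/* task_set_jobctl_pending() returns false and does nothing if @t is dying */
	if (task_set_jobctl_pending(t, signr | gstop))
		signal_wake_up(t, 0);	/* let @t notice the pending stop */
}
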
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
@@ -372,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
- return !tracehook_consider_fatal_signal(tsk, sig);
+ /* if ptraced, let the tracer determine */
+ return !tsk->ptrace;
}
/*
@@ -527,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -592,7 +714,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
if (sigisemptyset(&m))
return 0;
- signandsets(&s->signal, &s->signal, mask);
+ sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (sigismember(mask, q->info.si_signo)) {
list_del_init(&q->list);
@@ -696,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
return security_task_kill(t, info, sig, 0);
}
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules a sticky ptrace trap which is cleared on the
+ * next TRAP_STOP to notify the ptracer of an event. @t must have been
+ * seized by a ptracer.
+ *
+ * If @t is running, a STOP trap will be taken. If @t is trapped for STOP
+ * and the ptracer is listening for events, the tracee is woken up so that
+ * it can re-trap for the new event. If trapped otherwise, the STOP trap
+ * will eventually be taken without returning to userland after the
+ * existing traps are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @t->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+ WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+ assert_spin_locked(&t->sighand->siglock);
+
+ task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+ signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+}
+
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
@@ -727,34 +875,17 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
} else if (sig == SIGCONT) {
unsigned int why;
/*
- * Remove all stop signals from all queues,
- * and wake all threads.
+ * Remove all stop signals from all queues, wake all threads.
*/
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
- unsigned int state;
+ task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
- /*
- * If there is a handler for SIGCONT, we must make
- * sure that no thread returns to user mode before
- * we post the signal, in case it was the only
- * thread eligible to run the signal handler--then
- * it must not do anything between resuming and
- * running the handler. With the TIF_SIGPENDING
- * flag set, the thread will pause and acquire the
- * siglock that we hold now and until we've queued
- * the pending signal.
- *
- * Wake up the stopped thread _after_ setting
- * TIF_SIGPENDING
- */
- state = __TASK_STOPPED;
- if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
- set_tsk_thread_flag(t, TIF_SIGPENDING);
- state |= TASK_INTERRUPTIBLE;
- }
- wake_up_state(t, state);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ wake_up_state(t, __TASK_STOPPED);
+ else
+ ptrace_trap_notify(t);
} while_each_thread(p, t);
/*
@@ -780,13 +911,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
signal->flags = why | SIGNAL_STOP_CONTINUED;
signal->group_stop_count = 0;
signal->group_exit_code = 0;
- } else {
- /*
- * We are not stopped, but there could be a stop
- * signal in the middle of being processed after
- * being removed from the queue. Clear that too.
- */
- signal->flags &= ~SIGNAL_STOP_DEQUEUED;
}
}
@@ -858,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL ||
- !tracehook_consider_fatal_signal(t, sig))) {
+ (sig == SIGKILL || !t->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -875,6 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
signal->group_stop_count = 0;
t = p;
do {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
@@ -1109,6 +1233,7 @@ int zap_other_threads(struct task_struct *p)
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
count++;
/* Don't bother with already dead threads */
@@ -1126,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
{
struct sighand_struct *sighand;
- rcu_read_lock();
for (;;) {
+ local_irq_save(*flags);
+ rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
- if (unlikely(sighand == NULL))
+ if (unlikely(sighand == NULL)) {
+ rcu_read_unlock();
+ local_irq_restore(*flags);
break;
+ }
- spin_lock_irqsave(&sighand->siglock, *flags);
- if (likely(sighand == tsk->sighand))
+ spin_lock(&sighand->siglock);
+ if (likely(sighand == tsk->sighand)) {
+ rcu_read_unlock();
break;
- spin_unlock_irqrestore(&sighand->siglock, *flags);
+ }
+ spin_unlock(&sighand->siglock);
+ rcu_read_unlock();
+ local_irq_restore(*flags);
}
- rcu_read_unlock();
return sighand;
}
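
The __lock_task_sighand() rework above keeps the RCU read section and interrupt disabling paired on every retry. Callers normally go through the lock_task_sighand()/unlock_task_sighand() wrappers; a hedged sketch (the helper below is made up for illustration, not part of the patch):

static int read_group_exit_code(struct task_struct *p)
{
	unsigned long flags;
	int code = 0;

	if (lock_task_sighand(p, &flags)) {	/* fails if @p has no sighand left */
		code = p->signal->group_exit_code;
		unlock_task_sighand(p, &flags);
	}
	return code;
}
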
@@ -1452,22 +1584,22 @@ ret:
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
*/
-int do_notify_parent(struct task_struct *tsk, int sig)
+bool do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
- int ret = sig;
+ bool autoreap = false;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
- BUG_ON(!task_ptrace(tsk) &&
+ BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
@@ -1506,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
- if (!task_ptrace(tsk) && sig == SIGCHLD &&
+ if (!tsk->ptrace && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
@@ -1524,28 +1656,42 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
- ret = tsk->exit_signal = -1;
+ autoreap = true;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
- sig = -1;
+ sig = 0;
}
- if (valid_signal(sig) && sig > 0)
+ if (valid_signal(sig) && sig)
__group_send_sig_info(sig, &info, tsk->parent);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
- return ret;
+ return autoreap;
}
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
+/**
+ * do_notify_parent_cldstop - notify parent of stopped/continued state change
+ * @tsk: task reporting the state change
+ * @for_ptracer: the notification is for ptracer
+ * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
+ *
+ * Notify @tsk's parent that the stopped/continued state has changed. If
+ * @for_ptracer is %false, @tsk's group leader notifies its real parent.
+ * If %true, @tsk reports to @tsk->parent which should be the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with tasklist_lock at least read locked.
+ */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+ bool for_ptracer, int why)
{
struct siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- if (task_ptrace(tsk))
+ if (for_ptracer) {
parent = tsk->parent;
- else {
+ } else {
tsk = tsk->group_leader;
parent = tsk->real_parent;
}
@@ -1592,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
static inline int may_ptrace_stop(void)
{
- if (!likely(task_ptrace(current)))
+ if (!likely(current->ptrace))
return 0;
/*
* Are we in the middle of do_coredump?
@@ -1631,10 +1777,12 @@ static int sigkill_pending(struct task_struct *tsk)
* If we actually decide not to stop at all because the tracer
* is gone, we keep current->exit_code unless clear_code.
*/
-static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
+ bool gstop_done = false;
+
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
@@ -1655,21 +1803,52 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
}
/*
- * If there is a group stop in progress,
- * we must participate in the bookkeeping.
+ * We're committing to trapping. TRACED should be visible before
+ * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
+ * Also, the transition to TRACED and updates to ->jobctl should be
+ * atomic with respect to siglock and should be done after the arch
+ * hook, as siglock is released and regrabbed across it.
*/
- if (current->signal->group_stop_count > 0)
- --current->signal->group_stop_count;
+ set_current_state(TASK_TRACED);
current->last_siginfo = info;
current->exit_code = exit_code;
- /* Let the debugger run. */
- __set_current_state(TASK_TRACED);
+ /*
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delivered
+ * across siglock relocks since INTERRUPT was scheduled, PENDING
+ * could be clear now. We act as if SIGCONT is received after
+ * TASK_TRACED is entered - ignore it.
+ */
+ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
+
+ /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
+
+ /* entering a trap, clear TRAPPING */
+ task_clear_jobctl_trapping(current);
+
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
- do_notify_parent_cldstop(current, CLD_TRAPPED);
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're going to be duplicates.
+ */
+ do_notify_parent_cldstop(current, true, why);
+ if (gstop_done && ptrace_reparented(current))
+ do_notify_parent_cldstop(current, false, why);
+
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
@@ -1684,7 +1863,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
/*
* By the time we got the lock, our tracer went away.
* Don't drop the lock yet, another tracer may come.
+ *
+ * If @gstop_done, the ptracer went away between group stop
+ * completion and here. During detach, it would have set
+ * JOBCTL_STOP_PENDING on us and we'll re-enter
+ * TASK_STOPPED in do_signal_stop() on return, so notifying
+ * the real parent of the group stop completion is enough.
*/
+ if (gstop_done)
+ do_notify_parent_cldstop(current, false, why);
+
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -1706,6 +1894,9 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
spin_lock_irq(&current->sighand->siglock);
current->last_siginfo = NULL;
+ /* LISTENING can be set only during STOP traps, clear it */
+ current->jobctl &= ~JOBCTL_LISTENING;
+
/*
* Queued signals ignored us while we were stopped for tracing.
* So check for any that we should take before resuming user mode.
@@ -1714,107 +1905,204 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
recalc_sigpending_tsk(current);
}
-void ptrace_notify(int exit_code)
+static void ptrace_do_notify(int signr, int exit_code, int why)
{
siginfo_t info;
- BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-
memset(&info, 0, sizeof info);
- info.si_signo = SIGTRAP;
+ info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
info.si_uid = current_uid();
/* Let the debugger run. */
+ ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+ BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, 1, &info);
+ ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
spin_unlock_irq(&current->sighand->siglock);
}
-/*
- * This performs the stopping for SIGSTOP and other stop signals.
- * We have to stop all threads in the thread group.
- * Returns non-zero if we've actually stopped and released the siglock.
- * Returns zero if we didn't stop and still hold the siglock.
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it. If already set, participate in the existing
+ * group stop. If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself. Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched. The caller must ensure that INTERRUPT trap handling takes
+ * place afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
*/
-static int do_signal_stop(int signr)
+static bool do_signal_stop(int signr)
+ __releases(&current->sighand->siglock)
{
struct signal_struct *sig = current->signal;
- int notify;
- if (!sig->group_stop_count) {
+ if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+ unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
- if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+ /* signr will be recorded in task->jobctl for retries */
+ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
+
+ if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
- return 0;
+ return false;
/*
- * There is no group stop already in progress.
- * We must initiate one now.
+ * There is no group stop already in progress. We must
+ * initiate one now.
+ *
+ * While ptraced, a task may be resumed while group stop is
+ * still in effect and then receive a stop signal and
+ * initiate another group stop. This deviates from the
+ * usual behavior as two consecutive stop signals can't
+ * cause two group stops when !ptraced. That is why we
+ * also check !task_is_stopped(t) below.
+ *
+ * The condition can be distinguished by testing whether
+ * SIGNAL_STOP_STOPPED is already set. Don't generate
+ * group_exit_code in that case.
+ *
+ * This is not necessary for SIGNAL_STOP_CONTINUED because
+ * an intervening stop signal is required to cause two
+ * continued events regardless of ptrace.
*/
- sig->group_exit_code = signr;
+ if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ sig->group_exit_code = signr;
+ else
+ WARN_ON_ONCE(!current->ptrace);
- sig->group_stop_count = 1;
- for (t = next_thread(current); t != current; t = next_thread(t))
+ sig->group_stop_count = 0;
+
+ if (task_set_jobctl_pending(current, signr | gstop))
+ sig->group_stop_count++;
+
+ for (t = next_thread(current); t != current;
+ t = next_thread(t)) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!(t->flags & PF_EXITING) &&
- !task_is_stopped_or_traced(t)) {
+ if (!task_is_stopped(t) &&
+ task_set_jobctl_pending(t, signr | gstop)) {
sig->group_stop_count++;
- signal_wake_up(t, 0);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ signal_wake_up(t, 0);
+ else
+ ptrace_trap_notify(t);
}
+ }
}
- /*
- * If there are no other threads in the group, or if there is
- * a group stop in progress and we are the last to stop, report
- * to the parent. When ptraced, every thread reports itself.
- */
- notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
- notify = tracehook_notify_jctl(notify, CLD_STOPPED);
- /*
- * tracehook_notify_jctl() can drop and reacquire siglock, so
- * we keep ->group_stop_count != 0 before the call. If SIGCONT
- * or SIGKILL comes in between ->group_stop_count == 0.
- */
- if (sig->group_stop_count) {
- if (!--sig->group_stop_count)
- sig->flags = SIGNAL_STOP_STOPPED;
- current->exit_code = sig->group_exit_code;
+
+ if (likely(!current->ptrace)) {
+ int notify = 0;
+
+ /*
+ * If there are no other threads in the group, or if there
+ * is a group stop in progress and we are the last to stop,
+ * report to the parent.
+ */
+ if (task_participate_group_stop(current))
+ notify = CLD_STOPPED;
+
__set_current_state(TASK_STOPPED);
- }
- spin_unlock_irq(&current->sighand->siglock);
+ spin_unlock_irq(&current->sighand->siglock);
- if (notify) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current, notify);
- read_unlock(&tasklist_lock);
- }
+ /*
+ * Notify the parent of the group stop completion. Because
+ * we're not holding either the siglock or tasklist_lock
+ * here, a ptracer may attach in between; however, this is for
+ * group stop and should always be delivered to the real
+ * parent of the group leader. The new ptracer will get
+ * its notification when this task transitions into
+ * TASK_TRACED.
+ */
+ if (notify) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, notify);
+ read_unlock(&tasklist_lock);
+ }
- /* Now we don't run again until woken by SIGCONT or SIGKILL */
- do {
+ /* Now we don't run again until woken by SIGCONT or SIGKILL */
schedule();
- } while (try_to_freeze());
-
- tracehook_finish_jctl();
- current->exit_code = 0;
+ return true;
+ } else {
+ /*
+ * While ptraced, group stop is handled by STOP trap.
+ * Schedule it and let the caller deal with it.
+ */
+ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ return false;
+ }
+}
- return 1;
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
+ * accompanying siginfo. If stopped, lower eight bits of exit_code contain
+ * the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+ struct signal_struct *signal = current->signal;
+ int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
+
+ if (current->ptrace & PT_SEIZED) {
+ if (!signal->group_stop_count &&
+ !(signal->flags & SIGNAL_STOP_STOPPED))
+ signr = SIGTRAP;
+ WARN_ON_ONCE(!signr);
+ ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+ CLD_STOPPED);
+ } else {
+ WARN_ON_ONCE(!signr);
+ ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
+ }
}
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
- if (!task_ptrace(current))
- return signr;
-
ptrace_signal_deliver(regs, cookie);
-
- /* Let the debugger run. */
- ptrace_stop(signr, 0, info);
+ /*
+ * We do not check sig_kernel_stop(signr) but set this marker
+ * unconditionally because we do not know whether debugger will
+ * change signr. This flag has no meaning unless we are going
+ * to stop after return from ptrace_stop(). In this case it will
+ * be checked in do_signal_stop(), we should only stop if it was
+ * not cleared by SIGCONT while we were sleeping. See also the
+ * comment in dequeue_signal().
+ */
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
+ ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
@@ -1869,54 +2157,63 @@ relock:
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
- int why = (signal->flags & SIGNAL_STOP_CONTINUED)
- ? CLD_CONTINUED : CLD_STOPPED;
+ int why;
+
+ if (signal->flags & SIGNAL_CLD_CONTINUED)
+ why = CLD_CONTINUED;
+ else
+ why = CLD_STOPPED;
+
signal->flags &= ~SIGNAL_CLD_MASK;
- why = tracehook_notify_jctl(why, CLD_CONTINUED);
spin_unlock_irq(&sighand->siglock);
- if (why) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current->group_leader, why);
- read_unlock(&tasklist_lock);
- }
+ /*
+ * Notify the parent that we're continuing. This event is
+ * always per-process and doesn't make a whole lot of sense
+ * for ptracers, who shouldn't consume the state via
+ * wait(2) either, but, for backward compatibility, notify
+ * the ptracer of the group leader too unless it's going to be
+ * a duplicate.
+ */
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, why);
+
+ if (ptrace_reparented(current->group_leader))
+ do_notify_parent_cldstop(current->group_leader,
+ true, why);
+ read_unlock(&tasklist_lock);
+
goto relock;
}
for (;;) {
struct k_sigaction *ka;
- /*
- * Tracing can induce an artificial signal and choose sigaction.
- * The return value in @signr determines the default action,
- * but @info->si_signo is the signal number we will report.
- */
- signr = tracehook_get_signal(current, regs, info, return_ka);
- if (unlikely(signr < 0))
+
+ if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+ do_signal_stop(0))
goto relock;
- if (unlikely(signr != 0))
- ka = return_ka;
- else {
- if (unlikely(signal->group_stop_count > 0) &&
- do_signal_stop(0))
- goto relock;
- signr = dequeue_signal(current, &current->blocked,
- info);
+ if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ goto relock;
+ }
- if (!signr)
- break; /* will return 0 */
+ signr = dequeue_signal(current, &current->blocked, info);
- if (signr != SIGKILL) {
- signr = ptrace_signal(signr, info,
- regs, cookie);
- if (!signr)
- continue;
- }
+ if (!signr)
+ break; /* will return 0 */
- ka = &sighand->action[signr-1];
+ if (unlikely(current->ptrace) && signr != SIGKILL) {
+ signr = ptrace_signal(signr, info,
+ regs, cookie);
+ if (!signr)
+ continue;
}
+ ka = &sighand->action[signr-1];
+
/* Trace actually delivered signals. */
trace_signal_deliver(signr, info, ka);
@@ -2017,10 +2314,42 @@ relock:
return signr;
}
+/*
+ * It could be that complete_signal() picked us to notify about the
+ * group-wide signal. Other threads should be notified now to take
+ * the shared signals in @which since we will not.
+ */
+static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
+{
+ sigset_t retarget;
+ struct task_struct *t;
+
+ sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
+ if (sigisemptyset(&retarget))
+ return;
+
+ t = tsk;
+ while_each_thread(tsk, t) {
+ if (t->flags & PF_EXITING)
+ continue;
+
+ if (!has_pending_signals(&retarget, &t->blocked))
+ continue;
+ /* Remove the signals this thread can handle. */
+ sigandsets(&retarget, &retarget, &t->blocked);
+
+ if (!signal_pending(t))
+ signal_wake_up(t, 0);
+
+ if (sigisemptyset(&retarget))
+ break;
+ }
+}
+
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
- struct task_struct *t;
+ sigset_t unblocked;
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
@@ -2036,26 +2365,23 @@ void exit_signals(struct task_struct *tsk)
if (!signal_pending(tsk))
goto out;
- /*
- * It could be that __group_complete_signal() choose us to
- * notify about group-wide signal. Another thread should be
- * woken now to take the signal since we will not.
- */
- for (t = tsk; (t = next_thread(t)) != tsk; )
- if (!signal_pending(t) && !(t->flags & PF_EXITING))
- recalc_sigpending_and_wake(t);
+ unblocked = tsk->blocked;
+ signotset(&unblocked);
+ retarget_shared_pending(tsk, &unblocked);
- if (unlikely(tsk->signal->group_stop_count) &&
- !--tsk->signal->group_stop_count) {
- tsk->signal->flags = SIGNAL_STOP_STOPPED;
- group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
- }
+ if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
+ task_participate_group_stop(tsk))
+ group_stop = CLD_STOPPED;
out:
spin_unlock_irq(&tsk->sighand->siglock);
+ /*
+ * If group stop has completed, deliver the notification. This
+ * should always go to the real parent of the group leader.
+ */
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
- do_notify_parent_cldstop(tsk, group_stop);
+ do_notify_parent_cldstop(tsk, false, group_stop);
read_unlock(&tasklist_lock);
}
}
@@ -2089,11 +2415,33 @@ long do_no_restart_syscall(struct restart_block *param)
return -EINTR;
}
-/*
- * We don't need to get the kernel lock - this is all local to this
- * particular thread.. (and that's good, because this is _heavily_
- * used by various programs)
+static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
+{
+ if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ sigset_t newblocked;
+ /* A set of now blocked but previously unblocked signals. */
+ sigandnsets(&newblocked, newset, &current->blocked);
+ retarget_shared_pending(tsk, &newblocked);
+ }
+ tsk->blocked = *newset;
+ recalc_sigpending();
+}
+
+/**
+ * set_current_blocked - change current->blocked mask
+ * @newset: new mask
+ *
+ * It is wrong to change ->blocked directly; this helper should be used
+ * to ensure the process can't miss a shared signal we are going to block.
*/
+void set_current_blocked(const sigset_t *newset)
+{
+ struct task_struct *tsk = current;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, newset);
+ spin_unlock_irq(&tsk->sighand->siglock);
+}
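
A hedged example of the intended usage of the new helper, replacing open-coded siglock/->blocked manipulation (the function below is made up for illustration):

/* Illustration only: block SIGUSR1 for the current task. */
static void block_sigusr1(void)
{
	sigset_t newset;

	newset = current->blocked;
	sigaddset(&newset, SIGUSR1);
	set_current_blocked(&newset);	/* takes siglock, retargets shared signals */
}
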
/*
* This is also useful for kernel threads that want to temporarily
@@ -2105,73 +2453,66 @@ long do_no_restart_syscall(struct restart_block *param)
*/
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
- int error;
+ struct task_struct *tsk = current;
+ sigset_t newset;
- spin_lock_irq(&current->sighand->siglock);
+ /* Lockless, only current can change ->blocked, never from irq */
if (oldset)
- *oldset = current->blocked;
+ *oldset = tsk->blocked;
- error = 0;
switch (how) {
case SIG_BLOCK:
- sigorsets(&current->blocked, &current->blocked, set);
+ sigorsets(&newset, &tsk->blocked, set);
break;
case SIG_UNBLOCK:
- signandsets(&current->blocked, &current->blocked, set);
+ sigandnsets(&newset, &tsk->blocked, set);
break;
case SIG_SETMASK:
- current->blocked = *set;
+ newset = *set;
break;
default:
- error = -EINVAL;
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return error;
+ set_current_blocked(&newset);
+ return 0;
}
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
- * @set: stores pending signals
+ * @nset: stores pending signals
* @oset: previous value of signal mask if non-null
* @sigsetsize: size of sigset_t type
*/
-SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
sigset_t __user *, oset, size_t, sigsetsize)
{
- int error = -EINVAL;
sigset_t old_set, new_set;
+ int error;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked;
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
+ return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
- error = sigprocmask(how, &new_set, &old_set);
+ error = sigprocmask(how, &new_set, NULL);
if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked;
- spin_unlock_irq(&current->sighand->siglock);
+ return error;
+ }
- set_old:
- error = -EFAULT;
- if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ if (oset) {
+ if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2284,6 +2625,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
#endif
/**
+ * do_sigtimedwait - wait for queued signals specified in @which
+ * @which: queued signals to wait for
+ * @info: if non-null, the signal's siginfo is returned here
+ * @ts: upper bound on process time suspension
+ */
+int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+ const struct timespec *ts)
+{
+ struct task_struct *tsk = current;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ sigset_t mask = *which;
+ int sig;
+
+ if (ts) {
+ if (!timespec_valid(ts))
+ return -EINVAL;
+ timeout = timespec_to_jiffies(ts);
+ /*
+ * We can be close to the next tick, add another one
+ * to ensure we will wait at least the time asked for.
+ */
+ if (ts->tv_sec || ts->tv_nsec)
+ timeout++;
+ }
+
+ /*
+ * Invert the set of allowed signals to get those we want to block.
+ */
+ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ signotset(&mask);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sig = dequeue_signal(tsk, &mask, info);
+ if (!sig && timeout) {
+ /*
+ * None ready; temporarily unblock those we're interested
+ * in while we are sleeping so that we'll be awakened when
+ * they arrive. Unblocking is always fine; we can avoid
+ * set_current_blocked().
+ */
+ tsk->real_blocked = tsk->blocked;
+ sigandsets(&tsk->blocked, &tsk->blocked, &mask);
+ recalc_sigpending();
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, &tsk->real_blocked);
+ siginitset(&tsk->real_blocked, 0);
+ sig = dequeue_signal(tsk, &mask, info);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (sig)
+ return sig;
+ return timeout ? -EINTR : -EAGAIN;
+}
+
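+
A hedged sketch of how a caller uses the new do_sigtimedwait() helper; sys_rt_sigtimedwait() below reduces to essentially this, and a compat path could share it the same way (the wrapper name is hypothetical):

/* Illustration: wait up to five seconds for SIGINT or SIGTERM. */
static int wait_for_shutdown_signal(siginfo_t *info)
{
	sigset_t these;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };

	siginitset(&these, sigmask(SIGINT) | sigmask(SIGTERM));

	/* returns the signal number, -EAGAIN on timeout or -EINTR otherwise */
	return do_sigtimedwait(&these, info, &ts);
}
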
+/**
* sys_rt_sigtimedwait - synchronously wait for queued signals specified
* in @uthese
* @uthese: queued signals to wait for
@@ -2295,11 +2696,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo, const struct timespec __user *, uts,
size_t, sigsetsize)
{
- int ret, sig;
sigset_t these;
struct timespec ts;
siginfo_t info;
- long timeout = 0;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
@@ -2308,61 +2708,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
- /*
- * Invert the set of allowed signals to get those we
- * want to block.
- */
- sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&these);
-
if (uts) {
if (copy_from_user(&ts, uts, sizeof(ts)))
return -EFAULT;
- if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
- || ts.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = (timespec_to_jiffies(&ts)
- + (ts.tv_sec || ts.tv_nsec));
-
- if (timeout) {
- /*
- * None ready -- temporarily unblock those we're
- * interested while we are sleeping in so that we'll
- * be awakened when they arrive.
- */
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &these);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user(uinfo, &info))
- ret = -EFAULT;
- }
- } else {
- ret = -EAGAIN;
- if (timeout)
- ret = -EINTR;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
}
return ret;
@@ -2650,60 +3005,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
/**
* sys_sigprocmask - examine and change blocked signals
* @how: whether to add, remove, or set signals
- * @set: signals to add or remove (if non-null)
+ * @nset: signals to add or remove (if non-null)
* @oset: previous value of signal mask if non-null
*
* Some platforms have their own version with special arguments;
* others support only sys_rt_sigprocmask.
*/
-SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
old_sigset_t __user *, oset)
{
- int error;
old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(*nset)))
+ return -EFAULT;
new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked.sig[0];
+ new_blocked = current->blocked;
- error = 0;
switch (how) {
- default:
- error = -EINVAL;
- break;
case SIG_BLOCK:
- sigaddsetmask(&current->blocked, new_set);
+ sigaddsetmask(&new_blocked, new_set);
break;
case SIG_UNBLOCK:
- sigdelsetmask(&current->blocked, new_set);
+ sigdelsetmask(&new_blocked, new_set);
break;
case SIG_SETMASK:
- current->blocked.sig[0] = new_set;
+ new_blocked.sig[0] = new_set;
break;
+ default:
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- old_set = current->blocked.sig[0];
- set_old:
- error = -EFAULT;
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
@@ -2793,8 +3139,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ while (!signal_pending(current)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
return -ERESTARTNOHAND;
}
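
The sys_pause() change above wraps the sleep in a loop so that a wakeup without a pending signal simply goes back to sleep. For reference, a hedged sketch of the canonical shape of such a wait loop (generic pattern, not taken from this patch):

static void wait_until_signal_pending(void)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (signal_pending(current))
			break;
		schedule();	/* signal_wake_up() will make us runnable again */
	}
	__set_current_state(TASK_RUNNING);
}
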
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
.notifier_call = hotplug_cfd,
};
-static int __cpuinit init_call_single_data(void)
+void __init call_function_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
register_cpu_notifier(&hotplug_cfd_notifier);
-
- return 0;
}
-early_initcall(init_call_single_data);
/*
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 13960170cad4..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
- "TASKLET", "SCHED", "HRTIMER"
+ "TASKLET", "SCHED", "HRTIMER", "RCU"
};
/*
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
{
if (!force_irqthreads)
__do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#else
static inline void invoke_softirq(void)
{
if (!force_irqthreads)
do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41cd8f33..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
cond_syscall(compat_sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
+cond_syscall(sys_sendmmsg);
cond_syscall(compat_sys_sendmsg);
+cond_syscall(compat_sys_sendmmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_recvmmsg);
cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
+cond_syscall(compat_sys_semtimedop);
cond_syscall(sys_semctl);
+cond_syscall(compat_sys_semctl);
cond_syscall(sys_msgget);
cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd);
cond_syscall(sys_msgrcv);
+cond_syscall(compat_sys_msgrcv);
cond_syscall(sys_msgctl);
+cond_syscall(compat_sys_msgctl);
cond_syscall(sys_shmget);
cond_syscall(sys_shmat);
+cond_syscall(compat_sys_shmat);
cond_syscall(sys_shmdt);
cond_syscall(sys_shmctl);
+cond_syscall(compat_sys_shmctl);
cond_syscall(sys_mq_open);
cond_syscall(sys_mq_unlink);
cond_syscall(sys_mq_timedsend);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb32414b17..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
#include <linux/kprobes.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
.child = random_table,
},
{
+ .procname = "usermodehelper",
+ .mode = 0555,
+ .child = usermodehelper_table,
+ },
+ {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
@@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
{
.procname = "watchdog_thresh",
- .data = &softlockup_thresh,
+ .data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_thresh,
+ .proc_handler = proc_dowatchdog,
.extra1 = &neg_one,
.extra2 = &sixty,
},
@@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = {
},
#endif
#ifdef CONFIG_PERF_EVENTS
+ /*
+ * User-space scripts rely on the existence of this file
+ * as a feature check for perf_events being enabled.
+ *
+ * So it's an ABI, do not remove!
+ */
{
.procname = "perf_event_paranoid",
.data = &sysctl_perf_event_paranoid,
@@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = {
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390)
+ defined(CONFIG_S390) || defined(CONFIG_TILE)
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -285,16 +285,18 @@ ret:
static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
struct listener_list *listeners;
- struct listener *s, *tmp;
+ struct listener *s, *tmp, *s2;
unsigned int cpu;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
+ s = NULL;
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
- s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
- cpu_to_node(cpu));
+ if (!s)
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
if (!s)
goto cleanup;
s->pid = pid;
@@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
listeners = &per_cpu(listener_array, cpu);
down_write(&listeners->sem);
+ list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
+ if (s2->pid == pid)
+ goto next_cpu;
+ }
list_add(&s->list, &listeners->list);
+ s = NULL;
+next_cpu:
up_write(&listeners->sem);
}
+ kfree(s);
return 0;
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9265014cb4db..59f369f98a04 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -42,15 +42,75 @@ static struct alarm_base {
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
+/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+static ktime_t freezer_delta;
+static DEFINE_SPINLOCK(freezer_delta_lock);
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
-#endif
+static DEFINE_SPINLOCK(rtcdev_lock);
-/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
-static ktime_t freezer_delta;
-static DEFINE_SPINLOCK(freezer_delta_lock);
+/**
+ * has_wakealarm - check rtc device has wakealarm ability
+ * @dev: current device
+ * @name_ptr: name to be returned
+ *
+ * This helper function checks to see if the rtc device can wake
+ * from suspend.
+ */
+static int has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(const char **)name_ptr = dev_name(dev);
+ return 1;
+}
+
+/**
+ * alarmtimer_get_rtcdev - Return selected rtcdevice
+ *
+ * This function returns the rtc device to use for wakealarms.
+ * If one has not already been chosen, it checks to see if a
+ * functional rtc device is available.
+ */
+static struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ struct device *dev;
+ char *str;
+ unsigned long flags;
+ struct rtc_device *ret;
+
+ spin_lock_irqsave(&rtcdev_lock, flags);
+ if (!rtcdev) {
+ /* Find an rtc device and init the rtc_timer */
+ dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
+ /* If we have a device then str is valid. See has_wakealarm() */
+ if (dev) {
+ rtcdev = rtc_class_open(str);
+ /*
+ * Drop the reference we got in class_find_device,
+ * rtc_open takes its own.
+ */
+ put_device(dev);
+ rtc_timer_init(&rtctimer, NULL, NULL);
+ }
+ }
+ ret = rtcdev;
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ return ret;
+}
+#else
+#define alarmtimer_get_rtcdev() (0)
+#define rtcdev (0)
+#endif
/**
@@ -166,6 +226,7 @@ static int alarmtimer_suspend(struct device *dev)
struct rtc_time tm;
ktime_t min, now;
unsigned long flags;
+ struct rtc_device *rtc;
int i;
spin_lock_irqsave(&freezer_delta_lock, flags);
@@ -173,8 +234,9 @@ static int alarmtimer_suspend(struct device *dev)
freezer_delta = ktime_set(0, 0);
spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ rtc = rtcdev;
/* If we have no rtcdev, just return */
- if (!rtcdev)
+ if (!rtc)
return 0;
/* Find the soonest timer to expire*/
@@ -199,12 +261,12 @@ static int alarmtimer_suspend(struct device *dev)
WARN_ON(min.tv64 < NSEC_PER_SEC);
/* Setup an rtc timer to fire that far in the future */
- rtc_timer_cancel(rtcdev, &rtctimer);
- rtc_read_time(rtcdev, &tm);
+ rtc_timer_cancel(rtc, &rtctimer);
+ rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0));
+ rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
return 0;
}
@@ -322,6 +384,9 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
return hrtimer_get_res(baseid, tp);
}
@@ -336,6 +401,9 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
*tp = ktime_to_timespec(base->gettime());
return 0;
}
@@ -351,6 +419,9 @@ static int alarm_timer_create(struct k_itimer *new_timer)
enum alarmtimer_type type;
struct alarm_base *base;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -385,6 +456,9 @@ static void alarm_timer_get(struct k_itimer *timr,
*/
static int alarm_timer_del(struct k_itimer *timr)
{
+ if (!rtcdev)
+ return -ENOTSUPP;
+
alarm_cancel(&timr->it.alarmtimer);
return 0;
}
@@ -402,6 +476,9 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ if (!rtcdev)
+ return -ENOTSUPP;
+
/* Save old values */
old_setting->it_interval =
ktime_to_timespec(timr->it.alarmtimer.period);
@@ -494,7 +571,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
*/
static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
{
- enum alarmtimer_type type = restart->nanosleep.index;
+ enum alarmtimer_type type = restart->nanosleep.clockid;
ktime_t exp;
struct timespec __user *rmtp;
struct alarm alarm;
@@ -541,6 +618,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
int ret = 0;
struct restart_block *restart;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -573,7 +653,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
restart = &current_thread_info()->restart_block;
restart->fn = alarm_timer_nsleep_restart;
- restart->nanosleep.index = type;
+ restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
restart->nanosleep.rmtp = rmtp;
ret = -ERESTART_RESTARTBLOCK;
@@ -638,57 +718,3 @@ static int __init alarmtimer_init(void)
}
device_initcall(alarmtimer_init);
-#ifdef CONFIG_RTC_CLASS
-/**
- * has_wakealarm - check rtc device has wakealarm ability
- * @dev: current device
- * @name_ptr: name to be returned
- *
- * This helper function checks to see if the rtc device can wake
- * from suspend.
- */
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
-{
- struct rtc_device *candidate = to_rtc_device(dev);
-
- if (!candidate->ops->set_alarm)
- return 0;
- if (!device_may_wakeup(candidate->dev.parent))
- return 0;
-
- *(const char **)name_ptr = dev_name(dev);
- return 1;
-}
-
-/**
- * alarmtimer_init_late - Late initializing of alarmtimer code
- *
- * This function locates a rtc device to use for wakealarms.
- * Run as late_initcall to make sure rtc devices have been
- * registered.
- */
-static int __init alarmtimer_init_late(void)
-{
- char *str;
-
- /* Find an rtc device and init the rtc_timer */
- class_find_device(rtc_class, NULL, &str, has_wakealarm);
- if (str)
- rtcdev = rtc_class_open(str);
- if (!rtcdev) {
- printk(KERN_WARNING "No RTC device found, ALARM timers will"
- " not wake from suspend");
- }
- rtc_timer_init(&rtctimer, NULL, NULL);
-
- return 0;
-}
-#else
-static int __init alarmtimer_init_late(void)
-{
- printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
- " will not wake from suspend");
- return 0;
-}
-#endif
-late_initcall(alarmtimer_init_late);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 22a9da9a9c96..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
unsigned long flags;
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- BUG_ON(!dev->cpumask);
+ if (!dev->cpumask) {
+ WARN_ON(num_possible_cpus() > 1);
+ dev->cpumask = cpumask_of(smp_processor_id());
+ }
raw_spin_lock_irqsave(&clockevents_lock, flags);
@@ -197,7 +200,7 @@ EXPORT_SYMBOL_GPL(clockevents_register_device);
static void clockevents_config(struct clock_event_device *dev,
u32 freq)
{
- unsigned long sec;
+ u64 sec;
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9d5f8c885f6..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
-static cycle_t watchdog_last;
static int watchdog_running;
static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
if (!watchdog_running)
goto out;
- wdnow = watchdog->read(watchdog);
- wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
- watchdog->mult, watchdog->shift);
- watchdog_last = wdnow;
-
list_for_each_entry(cs, &watchdog_list, wd_list) {
/* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ local_irq_disable();
csnow = cs->read(cs);
+ wdnow = watchdog->read(watchdog);
+ local_irq_enable();
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = csnow;
+ cs->wd_last = wdnow;
+ cs->cs_last = csnow;
continue;
}
- /* Check the deviation from the watchdog clocksource. */
- cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+ wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
+ watchdog->mult, watchdog->shift);
+
+ cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
cs->mask, cs->mult, cs->shift);
- cs->wd_last = csnow;
+ cs->cs_last = csnow;
+ cs->wd_last = wdnow;
+
+ /* Check the deviation from the watchdog clocksource. */
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
clocksource_unstable(cs, cs_nsec - wd_nsec);
continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
return;
init_timer(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
- watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
@@ -639,7 +641,7 @@ static void clocksource_enqueue(struct clocksource *cs)
*/
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
- unsigned long sec;
+ u64 sec;
/*
* Calc the maximum number of seconds which we can run before
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 723c7637e55a..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
unsigned long flags;
int cpu;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
/*
* Periodic mode does not care about the enter/exit of power
* states
*/
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- goto out;
+ return;
- bc = tick_broadcast_device.evtdev;
+ /*
+ * We are called with preemption disabled from the depth of the
+ * idle code, so we can't be moved away.
+ */
cpu = smp_processor_id();
td = &per_cpu(tick_cpu_device, cpu);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
+
+ bc = tick_broadcast_device.evtdev;
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
tick_program_event(dev->next_event, 1);
}
}
-
-out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8e6a05a5915a..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -680,7 +680,7 @@ static void timekeeping_resume(void)
clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
/* Resume hrtimers */
- hres_timers_resume();
+ hrtimers_resume();
}
static int timekeeping_suspend(void)
@@ -1099,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
}
/**
+ * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
+ */
+ktime_t ktime_get_monotonic_offset(void)
+{
+ unsigned long seq;
+ struct timespec wtom;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ wtom = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+ return timespec_to_ktime(wtom);
+}
+
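+
The new helper is just a seqlock-protected snapshot of wall_to_monotonic. A hedged usage sketch (hypothetical helper; since monotonic time = wall time + wall_to_monotonic, the conversion back is a subtraction):

/* Illustration: convert a CLOCK_MONOTONIC timestamp to wall-clock time. */
static ktime_t monotonic_to_real(ktime_t mono)
{
	return ktime_sub(mono, ktime_get_monotonic_offset());
}
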
+/**
* xtime_update() - advances the timekeeping infrastructure
* @ticks: number of ticks, that have elapsed since the last call.
*
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
unsigned long expires_limit, mask;
int bit;
- expires_limit = expires;
-
if (timer->slack >= 0) {
expires_limit = expires + timer->slack;
} else {
- unsigned long now = jiffies;
+ long delta = expires - jiffies;
+
+ if (delta < 256)
+ return expires;
- /* No slack, if already expired else auto slack 0.4% */
- if (time_after(expires, now))
- expires_limit = expires + (expires - now)/256;
+ expires_limit = expires + delta / 256;
}
mask = expires ^ expires_limit;
if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
+ expires = apply_slack(timer, expires);
+
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
if (timer_pending(timer) && timer->expires == expires)
return 1;
- expires = apply_slack(timer, expires);
-
return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
}
EXPORT_SYMBOL(mod_timer);
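
The timer.c hunks above apply the slack before the "still pending with the same expiry" early return, so that comparison is now made on already-slacked values. For illustration, a hedged, simplified sketch of what the auto-slack path computes (mirrors apply_slack() without the explicit timer->slack >= 0 case):

static unsigned long auto_slack(unsigned long expires)
{
	long delta = expires - jiffies;
	unsigned long limit, mask;
	int bit;

	if (delta < 256)
		return expires;			/* too close: no slack */

	limit = expires + delta / 256;		/* allow roughly 0.4% lateness */
	mask = expires ^ limit;
	if (!mask)
		return expires;

	bit = find_last_bit(&mask, BITS_PER_LONG);
	mask = (1UL << bit) - 1;
	return limit & ~mask;			/* round so nearby timers batch up */
}
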
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d017c2c82c44..908038f57440 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
static void ftrace_global_list_func(unsigned long ip,
unsigned long parent_ip)
{
- struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
+ struct ftrace_ops *op;
+
+ if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ return;
+ trace_recursion_set(TRACE_GLOBAL_BIT);
+ op = rcu_dereference_raw(ftrace_global_list); /*see above*/
while (op != &ftrace_list_end) {
op->func(ip, parent_ip);
op = rcu_dereference_raw(op->next); /*see above*/
};
+ trace_recursion_clear(TRACE_GLOBAL_BIT);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -1638,12 +1644,12 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
-static void ftrace_startup(struct ftrace_ops *ops, int command)
+static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
ftrace_start_up++;
command |= FTRACE_ENABLE_CALLS;
@@ -1662,6 +1668,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
+
+ return 0;
}
static void ftrace_shutdown(struct ftrace_ops *ops, int command)
@@ -2501,7 +2509,7 @@ static void __enable_ftrace_function_probe(void)
ret = __register_ftrace_function(&trace_probe_ops);
if (!ret)
- ftrace_startup(&trace_probe_ops, 0);
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
@@ -2732,7 +2740,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
{
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret;
+ int ret = -EINVAL;
func = strsep(&next, ":");
@@ -3322,6 +3330,7 @@ static int ftrace_process_locs(struct module *mod,
{
unsigned long *p;
unsigned long addr;
+ unsigned long flags;
mutex_lock(&ftrace_lock);
p = start;
@@ -3338,7 +3347,13 @@ static int ftrace_process_locs(struct module *mod,
ftrace_record_ip(addr);
}
+ /*
+ * Disable interrupts so that no interrupt handler can execute
+ * code that is being modified.
+ */
+ local_irq_save(flags);
ftrace_update_code(mod);
+ local_irq_restore(flags);
mutex_unlock(&ftrace_lock);
return 0;
@@ -3466,7 +3481,11 @@ device_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) do { } while (0)
+# define ftrace_startup(ops, command) \
+ ({ \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ 0; \
+ })
# define ftrace_shutdown(ops, command) do { } while (0)
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
@@ -3484,6 +3503,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
{
struct ftrace_ops *op;
+ if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ return;
+
+ trace_recursion_set(TRACE_INTERNAL_BIT);
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
@@ -3496,6 +3519,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
op = rcu_dereference_raw(op->next);
};
preempt_enable_notrace();
+ trace_recursion_clear(TRACE_INTERNAL_BIT);
}
static void clear_ftrace_swapper(void)
@@ -3799,7 +3823,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
ret = __register_ftrace_function(ops);
if (!ret)
- ftrace_startup(ops, 0);
+ ret = ftrace_startup(ops, 0);
out_unlock:
@@ -4045,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_return = retfunc;
ftrace_graph_entry = entryfunc;
- ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
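
Since ftrace_startup() can now fail (returning -ENODEV once ftrace_disabled is set) and the error is propagated through register_ftrace_function() and register_ftrace_graph(), callers should check the result. A hedged sketch of a module-style caller using the (ip, parent_ip) callback signature seen in the hunks above; my_trace_func and my_ops are illustrative names:

#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/module.h>

/* Illustrative callback: runs for every traced function, so keep it minimal.
 * The recursion guards added above protect the list walk that invokes it. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	int ret = register_ftrace_function(&my_ops);

	/* Registration can now fail, e.g. with -ENODEV when ftrace is disabled. */
	if (ret)
		pr_warn("my_tracer: register_ftrace_function failed: %d\n", ret);
	return ret;
}
module_init(my_tracer_init);
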
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
"HC[%lu]:SC[%lu]:NMI[%lu]\n",
- current->trace_recursion,
+ trace_recursion_buffer(),
hardirq_count() >> HARDIRQ_SHIFT,
softirq_count() >> SOFTIRQ_SHIFT,
in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
static inline int trace_recursive_lock(void)
{
- current->trace_recursion++;
+ trace_recursion_inc();
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
return 0;
trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
static inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!current->trace_recursion);
+ WARN_ON_ONCE(!trace_recursion_buffer());
- current->trace_recursion--;
+ trace_recursion_dec();
}
#else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b69c4bd306f..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
+/* Only current can touch trace_recursion */
+#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
+#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
+
+/* The ring buffer uses the 10 least significant bits as a recursion counter */
+#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+
+/* for function tracing recursion */
+#define TRACE_INTERNAL_BIT (1<<11)
+#define TRACE_GLOBAL_BIT (1<<12)
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+
#endif /* _LINUX_KERNEL_TRACE_H */
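
The per-task trace_recursion word is partitioned here: the ten least significant bits hold the ring-buffer recursion depth, while TRACE_INTERNAL_BIT and TRACE_GLOBAL_BIT act as simple re-entry guards for the two ftrace list walkers. A hedged sketch of the guard pattern, using a hypothetical bit of its own rather than the two real ones:

/* MY_GUARD_BIT is illustrative; the real users are TRACE_INTERNAL_BIT and
 * TRACE_GLOBAL_BIT in ftrace_ops_list_func() and ftrace_global_list_func(). */
#define MY_GUARD_BIT	(1 << 13)

static void my_guarded_callback(unsigned long ip, unsigned long parent_ip)
{
	if (trace_recursion_test(MY_GUARD_BIT))	/* already inside: bail out */
		return;
	trace_recursion_set(MY_GUARD_BIT);

	/* ... work that might indirectly re-enter the tracer ... */

	trace_recursion_clear(MY_GUARD_BIT);
}
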
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
- register_ftrace_function(&trace_ops);
+ int ret;
+ ret = register_ftrace_function(&trace_ops);
+ if (WARN_ON(ret < 0)) {
+ pr_info("Failed to enable function tracer for event tests\n");
+ return;
+ }
pr_info("Running tests again, along with the function tracer\n");
event_trace_self_tests();
unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f925c45f0afa..27d13b36b8be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1870,8 +1870,12 @@ fs_initcall(init_kprobe_trace);
#ifdef CONFIG_FTRACE_STARTUP_TEST
-static int kprobe_trace_selftest_target(int a1, int a2, int a3,
- int a4, int a5, int a6)
+/*
+ * The "__used" annotation keeps gcc from removing the function, so its
+ * symbol stays visible in the kallsyms table.
+ */
+static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
+ int a4, int a5, int a6)
{
return a1 + a2 + a3 + a4 + a5 + a6;
}
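
The __used annotation (the kernel's wrapper around gcc's attribute((used))) forces the compiler to emit an otherwise uncalled static function, so the self-test can still find it through kallsyms. A minimal, stand-alone illustration of the attribute:

/* Without the attribute, gcc may discard this static function at -O2 because
 * nothing in the translation unit calls it; with it, the symbol is emitted. */
static __attribute__((used)) int keep_me(int a, int b)
{
	return a + b;
}
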
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cf535ccedc86..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
}
EXPORT_SYMBOL(ftrace_print_symbols_seq);
+#if BITS_PER_LONG == 32
+const char *
+ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+ const struct trace_print_flags_u64 *symbol_array)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (!p->len)
+ trace_seq_printf(p, "0x%llx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+#endif
+
const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
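
On 32-bit builds an unsigned long cannot carry a 64-bit value, so ftrace_print_symbols_seq_u64() takes an unsigned long long and a trace_print_flags_u64 table (the struct comes from a companion header change not shown in this file; the function only relies on its .mask and .name fields). A hedged sketch of the kind of table and call site it expects, with made-up names and values:

/* Illustrative symbol table for a hypothetical 64-bit state value. */
static const struct trace_print_flags_u64 my_state_symbols[] = {
	{ 0x1ULL, "INIT"   },
	{ 0x2ULL, "ACTIVE" },
	{ 0x4ULL, "DONE"   },
	{ -1ULL,  NULL     },	/* .name == NULL terminates the scan */
};

/* In a trace event's output path one would then call, roughly:
 *
 *	ftrace_print_symbols_seq_u64(p, state, my_state_symbols);
 *
 * which prints the matching name, or "0x%llx" when nothing matches.
 */
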
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index dff763b7baf1..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -240,13 +240,10 @@ static const char **find_next(void *v, loff_t *pos)
const char **fmt = v;
int start_index;
- if (!fmt)
- fmt = __start___trace_bprintk_fmt + *pos;
-
start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
if (*pos < start_index)
- return fmt;
+ return __start___trace_bprintk_fmt + *pos;
return find_next_mod_format(start_index, v, fmt, pos);
}
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
put_user_ns(ns->user_ns);
kfree(ns);
}
+
+static void *utsns_get(struct task_struct *task)
+{
+ struct uts_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy) {
+ ns = nsproxy->uts_ns;
+ get_uts_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void utsns_put(void *ns)
+{
+ put_uts_ns(ns);
+}
+
+static int utsns_install(struct nsproxy *nsproxy, void *ns)
+{
+ get_uts_ns(ns);
+ put_uts_ns(nsproxy->uts_ns);
+ nsproxy->uts_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+ .name = "uts",
+ .type = CLONE_NEWUTS,
+ .get = utsns_get,
+ .put = utsns_put,
+ .install = utsns_install,
+};
+
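
These proc_ns_operations back a /proc/<pid>/ns/uts entry, and the .install hook is what a setns() call with CLONE_NEWUTS ultimately reaches. A hedged userspace sketch of joining another task's UTS namespace, assuming the matching proc file and setns() plumbing from the same series are available:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Illustrative helper: attach the calling process to <pid>'s UTS namespace. */
int join_uts_ns(pid_t pid)
{
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/ns/uts", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	ret = setns(fd, CLONE_NEWUTS);	/* kernel side ends up in utsns_install() */
	close(fd);
	return ret;
}
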
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4d156b..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
#include <linux/perf_event.h>
int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
__setup("nosoftlockup", nosoftlockup_setup);
/* */
+/*
+ * Hard-lockup warnings should fire after just a few seconds, while soft
+ * lockups can produce false positives under extreme conditions, so the
+ * soft-lockup threshold should be higher than the hard-lockup one. The two
+ * are therefore coupled by a fixed factor: the soft threshold is twice the
+ * hard threshold.
+ */
+static int get_softlockup_thresh(void)
+{
+ return watchdog_thresh * 2;
+}
/*
* Returns seconds, approximately. We don't need nanosecond
@@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
static unsigned long get_sample_period(void)
{
/*
- * convert softlockup_thresh from seconds to ns
+ * convert watchdog_thresh from seconds to ns
* the divide by 5 is to give hrtimer 5 chances to
* increment before the hardlockup detector generates
* a warning
*/
- return softlockup_thresh / 5 * NSEC_PER_SEC;
+ return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
}
/* Commands for resetting the watchdog */
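
With the default watchdog_thresh of 10 seconds, the numbers above work out as follows (a rough sanity check, not additional kernel code):

	soft-lockup threshold = watchdog_thresh * 2       = 20 s
	hrtimer sample period = 20 * (NSEC_PER_SEC / 5)   =  4 s worth of ns

so the per-CPU hrtimer gets five chances to update the touch timestamp before is_softlockup() can report a 20-second stall.
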
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
unsigned long now = get_timestamp(smp_processor_id());
/* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + softlockup_thresh))
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
return now - touch_ts;
return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period();
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
@@ -404,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
{
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
WARN_ON(per_cpu(softlockup_watchdog, cpu));
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- return 0;
}
static int watchdog_enable(int cpu)
@@ -501,28 +510,25 @@ static void watchdog_disable_all_cpus(void)
/* sysctl functions */
#ifdef CONFIG_SYSCTL
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog and watchdog_thresh
*/
-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int ret;
- if (write) {
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
- }
- return 0;
-}
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ goto out;
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (watchdog_enabled && watchdog_thresh)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
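
The merged proc_dowatchdog() follows the common sysctl-handler shape: parse with proc_dointvec_minmax(), then act only on a successful write. A hedged sketch of the same pattern for a hypothetical knob (my_knob and the my_feature_* helpers are illustrative, not kernel symbols):

#include <linux/sysctl.h>

static int my_knob;				/* illustrative sysctl-backed value */

static void my_feature_enable(void)  { /* ... */ }
static void my_feature_disable(void) { /* ... */ }

static int proc_do_my_knob(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;	/* a read, or a failed write: nothing to apply */

	if (my_knob)
		my_feature_enable();
	else
		my_feature_disable();

	return ret;
}
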
@@ -534,17 +540,16 @@ static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
- int err = 0;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- err = watchdog_prepare_cpu(hotcpu);
+ watchdog_prepare_cpu(hotcpu);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
if (watchdog_enabled)
- err = watchdog_enable(hotcpu);
+ watchdog_enable(hotcpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25c8afeaeae8..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2915,9 +2915,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
}
}
- /* just in case, make sure it's actually aligned
- * - this is affected by PERCPU() alignment in vmlinux.lds.S
- */
+ /* just in case, make sure it's actually aligned */
BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
return wq->cpu_wq.v ? 0 : -ENOMEM;
}