summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/auditsc.c8
-rw-r--r--kernel/capability.c21
-rw-r--r--kernel/cgroup.c570
-rw-r--r--kernel/cgroup_freezer.c11
-rw-r--r--kernel/compat.c73
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpuset.c31
-rw-r--r--kernel/cred.c44
-rw-r--r--kernel/events/Makefile3
-rw-r--r--kernel/events/core.c11
-rw-r--r--kernel/events/uprobes.c1667
-rw-r--r--kernel/exit.c6
-rw-r--r--kernel/extable.c8
-rw-r--r--kernel/fork.c86
-rw-r--r--kernel/groups.c50
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/irq/manage.c52
-rw-r--r--kernel/irq/pm.c7
-rw-r--r--kernel/irq/resend.c7
-rw-r--r--kernel/kfifo.c1
-rw-r--r--kernel/module.c5
-rw-r--r--kernel/params.c62
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/power/Kconfig27
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/autosleep.c127
-rw-r--r--kernel/power/hibernate.c13
-rw-r--r--kernel/power/main.c160
-rw-r--r--kernel/power/power.h27
-rw-r--r--kernel/power/swap.c62
-rw-r--r--kernel/power/wakelock.c259
-rw-r--r--kernel/printk.c1390
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcupdate.c28
-rw-r--r--kernel/rcutiny_plugin.h16
-rw-r--r--kernel/rcutorture.c257
-rw-r--r--kernel/rcutree.c332
-rw-r--r--kernel/rcutree.h23
-rw-r--r--kernel/rcutree_plugin.h154
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/res_counter.c71
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/core.c448
-rw-r--r--kernel/sched/debug.c12
-rw-r--r--kernel/sched/fair.c462
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c56
-rw-r--r--kernel/sched/sched.h8
-rw-r--r--kernel/seccomp.c458
-rw-r--r--kernel/semaphore.c2
-rw-r--r--kernel/signal.c89
-rw-r--r--kernel/smp.c27
-rw-r--r--kernel/smpboot.c62
-rw-r--r--kernel/smpboot.h18
-rw-r--r--kernel/srcu.c548
-rw-r--r--kernel/sys.c278
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/timer.c20
-rw-r--r--kernel/trace/Kconfig23
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/ftrace.c242
-rw-r--r--kernel/trace/ring_buffer.c585
-rw-r--r--kernel/trace/trace.c503
-rw-r--r--kernel/trace/trace.h9
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_export.c1
-rw-r--r--kernel/trace/trace_kprobe.c899
-rw-r--r--kernel/trace/trace_printk.c4
-rw-r--r--kernel/trace/trace_probe.c839
-rw-r--r--kernel/trace/trace_probe.h161
-rw-r--r--kernel/trace/trace_uprobe.c788
-rw-r--r--kernel/trace/trace_workqueue.c300
-rw-r--r--kernel/uid16.c48
-rw-r--r--kernel/user.c51
-rw-r--r--kernel/user_namespace.c595
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/workqueue.c21
80 files changed, 9590 insertions, 3702 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..6c07f30fa9b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
#include "audit.h"
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", SIGKILL);
+ audit_log_abend(ab, "seccomp", signr);
audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c6470..493d97259484 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap)
{
return ns_capable(current_user_ns(), cap);
}
+
+/**
+ * inode_capable - Check superior capability over inode
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at it's own user namespace and that the given inode is owned
+ * by the current user namespace or a child namespace.
+ *
+ * Currently we check to see if an inode is owned by the current
+ * user namespace by seeing if the inode's owner maps into the
+ * current user namespace.
+ *
+ */
+bool inode_capable(const struct inode *inode, int cap)
+{
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..a0c6af34d500 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
#include <linux/atomic.h>
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS INT_MIN
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
/* A list running through the active hierarchies */
struct list_head root_list;
+ /* All cgroups on this root, cgroup_mutex protected */
+ struct list_head allcg_list;
+
/* Hierarchy-specific flags */
unsigned long flags;
@@ -145,6 +152,15 @@ struct cgroupfs_root {
static struct cgroupfs_root rootnode;
/*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+ struct list_head node;
+ struct dentry *dentry;
+ struct cftype *type;
+};
+
+/*
* CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
* cgroup_subsys->use_id != 0.
*/
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+ int v = atomic_read(&css->refcnt);
+
+ return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+static inline struct cfent *__d_cfe(struct dentry *dentry)
+{
+ return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+ return __d_cfe(dentry)->type;
+}
+
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
struct cgroup_subsys *ss;
int ret = 0;
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy) {
- ret = ss->pre_destroy(cgrp);
- if (ret)
- break;
+ for_each_subsys(cgrp->root, ss) {
+ if (!ss->pre_destroy)
+ continue;
+
+ ret = ss->pre_destroy(cgrp);
+ if (ret) {
+ /* ->pre_destroy() failure is being deprecated */
+ WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+ break;
}
+ }
return ret;
}
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
BUG_ON(!list_empty(&cgrp->pidlists));
kfree_rcu(cgrp, rcu_head);
+ } else {
+ struct cfent *cfe = __d_cfe(dentry);
+ struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+ WARN_ONCE(!list_empty(&cfe->node) &&
+ cgrp != &cgrp->root->top_cgroup,
+ "cfe still linked for %s\n", cfe->type->name);
+ kfree(cfe);
}
iput(inode);
}
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d)
dput(parent);
}
-static void cgroup_clear_directory(struct dentry *dentry)
-{
- struct list_head *node;
-
- BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dentry->d_lock);
- node = dentry->d_subdirs.next;
- while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
-
- spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(node);
- if (d->d_inode) {
- /* This should never be called on a cgroup
- * directory with child cgroups */
- BUG_ON(d->d_inode->i_mode & S_IFDIR);
- dget_dlock(d);
- spin_unlock(&d->d_lock);
- spin_unlock(&dentry->d_lock);
- d_delete(d);
- simple_unlink(dentry->d_inode, d);
- dput(d);
- spin_lock(&dentry->d_lock);
- } else
- spin_unlock(&d->d_lock);
- node = dentry->d_subdirs.next;
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ struct cfent *cfe;
+
+ lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ list_for_each_entry(cfe, &cgrp->files, node) {
+ struct dentry *d = cfe->dentry;
+
+ if (cft && cfe->type != cft)
+ continue;
+
+ dget(d);
+ d_delete(d);
+ simple_unlink(d->d_inode, d);
+ list_del_init(&cfe->node);
+ dput(d);
+
+ return 0;
}
- spin_unlock(&dentry->d_lock);
+ return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+ struct cgroup *cgrp = __d_cgrp(dir);
+
+ while (!list_empty(&cgrp->files))
+ cgroup_rm_file(cgrp, NULL);
}
/*
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
+ /* See feature-removal-schedule.txt */
+ if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+ pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
/* Don't allow flags or name to change at remount */
if (opts.flags != root->flags ||
(opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
}
- /* (re)populate subsystem files */
+ /* clear out any existing files and repopulate subsystem files */
+ cgroup_clear_directory(cgrp->dentry);
cgroup_populate_dir(cgrp);
if (opts.release_agent)
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
+ INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
static void init_cgroup_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
+
INIT_LIST_HEAD(&root->subsys_list);
INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
cgrp->top_cgroup = cgrp;
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
init_cgroup_housekeeping(cgrp);
}
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = {
static struct kobject *cgroup_kobj;
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
/**
* cgroup_path - generate the path of a cgroup
* @cgrp: the cgroup in question
@@ -2160,9 +2214,9 @@ retry_find_task:
* only need to check permissions on one of them.
*/
tcred = __task_cred(tsk);
- if (cred->euid &&
- cred->euid != tcred->uid &&
- cred->euid != tcred->suid) {
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
rcu_read_unlock();
ret = -EACCES;
goto out_unlock_cgroup;
@@ -2172,6 +2226,18 @@ retry_find_task:
if (threadgroup)
tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_THREAD_BOUND and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock_cgroup;
+ }
+
get_task_struct(tsk);
rcu_read_unlock();
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
}
-int cgroup_add_file(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ const struct cftype *cft)
{
struct dentry *dir = cgrp->dentry;
+ struct cgroup *parent = __d_cgrp(dir);
struct dentry *dentry;
+ struct cfent *cfe;
int error;
umode_t mode;
-
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+ /* does @cft->flags tell us to skip creation on @cgrp? */
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+ return 0;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+ return 0;
+
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
strcpy(name, subsys->name);
strcat(name, ".");
}
strcat(name, cft->name);
+
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+ cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+ if (!cfe)
+ return -ENOMEM;
+
dentry = lookup_one_len(name, dir, strlen(name));
- if (!IS_ERR(dentry)) {
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG,
- cgrp->root->sb);
- if (!error)
- dentry->d_fsdata = (void *)cft;
- dput(dentry);
- } else
+ if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
+ goto out;
+ }
+
+ mode = cgroup_file_mode(cft);
+ error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+ if (!error) {
+ cfe->type = (void *)cft;
+ cfe->dentry = dentry;
+ dentry->d_fsdata = cfe;
+ list_add_tail(&cfe->node, &parent->files);
+ cfe = NULL;
+ }
+ dput(dentry);
+out:
+ kfree(cfe);
return error;
}
-EXPORT_SYMBOL_GPL(cgroup_add_file);
-int cgroup_add_files(struct cgroup *cgrp,
- struct cgroup_subsys *subsys,
- const struct cftype cft[],
- int count)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ const struct cftype cfts[], bool is_add)
{
- int i, err;
- for (i = 0; i < count; i++) {
- err = cgroup_add_file(cgrp, subsys, &cft[i]);
- if (err)
- return err;
+ const struct cftype *cft;
+ int err, ret = 0;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ if (is_add)
+ err = cgroup_add_file(cgrp, subsys, cft);
+ else
+ err = cgroup_rm_file(cgrp, cft);
+ if (err) {
+ pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+ is_add ? "add" : "remove", cft->name, err);
+ ret = err;
+ }
+ }
+ return ret;
+}
+
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+ __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+ /*
+ * Thanks to the entanglement with vfs inode locking, we can't walk
+ * the existing cgroups under cgroup_mutex and create files.
+ * Instead, we increment reference on all cgroups and build list of
+ * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
+ * exclusive access to the field.
+ */
+ mutex_lock(&cgroup_cft_mutex);
+ mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+ const struct cftype *cfts, bool is_add)
+ __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+ LIST_HEAD(pending);
+ struct cgroup *cgrp, *n;
+
+ /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+ if (cfts && ss->root != &rootnode) {
+ list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+ dget(cgrp->dentry);
+ list_add_tail(&cgrp->cft_q_node, &pending);
+ }
+ }
+
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
+ * files for all cgroups which were created before.
+ */
+ list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+ struct inode *inode = cgrp->dentry->d_inode;
+
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&cgroup_mutex);
+ if (!cgroup_is_removed(cgrp))
+ cgroup_addrm_files(cgrp, ss, cfts, is_add);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
+
+ list_del_init(&cgrp->cft_q_node);
+ dput(cgrp->dentry);
}
+
+ mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+ struct cftype_set *set;
+
+ set = kzalloc(sizeof(*set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+
+ cgroup_cfts_prepare();
+ set->cfts = cfts;
+ list_add_tail(&set->node, &ss->cftsets);
+ cgroup_cfts_commit(ss, cfts, true);
+
return 0;
}
-EXPORT_SYMBOL_GPL(cgroup_add_files);
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss. Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either. This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+ struct cftype_set *set;
+
+ cgroup_cfts_prepare();
+
+ list_for_each_entry(set, &ss->cftsets, node) {
+ if (set->cfts == cfts) {
+ list_del_init(&set->node);
+ cgroup_cfts_commit(ss, cfts, false);
+ return 0;
+ }
+ }
+
+ cgroup_cfts_commit(ss, NULL, false);
+ return -ENOENT;
+}
/**
* cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3832,14 @@ static struct cftype files[] = {
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
-};
-
-static struct cftype cft_release_agent = {
- .name = "release_agent",
- .read_seq_string = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_seq_string = cgroup_release_agent_show,
+ .write_string = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX,
+ },
+ { } /* terminate */
};
static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
int err;
struct cgroup_subsys *ss;
- /* First clear out any existing files */
- cgroup_clear_directory(cgrp->dentry);
-
- err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+ err = cgroup_addrm_files(cgrp, NULL, files, true);
if (err < 0)
return err;
- if (cgrp == cgrp->top_cgroup) {
- if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
- return err;
- }
-
+ /* process cftsets of each subsystem */
for_each_subsys(cgrp->root, ss) {
- if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
- return err;
+ struct cftype_set *set;
+
+ list_for_each_entry(set, &ss->cftsets, node)
+ cgroup_addrm_files(cgrp, ss, set->cfts, true);
}
+
/* This cgroup is ready now */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
return 0;
}
+static void css_dput_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, dput_work);
+
+ dput(css->cgroup->dentry);
+}
+
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
+
+ /*
+ * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+ * which is put on the last css_put(). dput() requires process
+ * context, which css_put() may be called without. @css->dput_work
+ * will be used to invoke dput() asynchronously from css_put().
+ */
+ INIT_WORK(&css->dput_work, css_dput_fn);
+ if (ss->__DEPRECATED_clear_css_refs)
+ set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
}
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (err < 0)
goto err_remove;
+ /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+ for_each_subsys(root, ss)
+ if (!ss->__DEPRECATED_clear_css_refs)
+ dget(dentry);
+
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
err = cgroup_populate_dir(cgrp);
/* If err < 0, we have a half-filled directory - oh well ;) */
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
int i;
+
/*
* We won't need to lock the subsys array, because the subsystems
* we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
+
/* Skip subsystems not present or not in this hierarchy */
if (ss == NULL || ss->root != cgrp->root)
continue;
+
css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
+ /*
+ * When called from check_for_release() it's possible
* that by this point the cgroup has been removed
* and the css deleted. But a false-positive doesn't
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
+ * release agent to be called anyway.
+ */
+ if (css && css_refcnt(css) > 1)
return 1;
}
return 0;
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* Atomically mark all (or else none) of the cgroup's CSS objects as
* CSS_REMOVED. Return true on success, or false if the cgroup has
* busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed. This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation. This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
*/
-
static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
unsigned long flags;
bool failed = false;
+
local_irq_save(flags);
+
+ /*
+ * Block new css_tryget() by deactivating refcnt. If all refcnts
+ * for subsystems w/ clear_css_refs set were 1 at the moment of
+ * deactivation, we succeeded.
+ */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
+
+ WARN_ON(atomic_read(&css->refcnt) < 0);
+ atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+
+ if (ss->__DEPRECATED_clear_css_refs)
+ failed |= css_refcnt(css) != 1;
}
- done:
+
+ /*
+ * If succeeded, set REMOVED and put all the base refs; otherwise,
+ * restore refcnts to positive values. Either way, all in-progress
+ * css_tryget() will be released.
+ */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
+
+ if (!failed) {
set_bit(CSS_REMOVED, &css->flags);
+ css_put(css);
+ } else {
+ atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
}
}
+
local_irq_restore(flags);
return !failed;
}
@@ -3995,6 +4241,8 @@ again:
list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
+ list_del_init(&cgrp->allcg_node);
+
d = dget(cgrp->dentry);
cgroup_d_remove_dir(d);
@@ -4021,12 +4269,29 @@ again:
return 0;
}
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+ INIT_LIST_HEAD(&ss->cftsets);
+
+ /*
+ * base_cftset is embedded in subsys itself, no need to worry about
+ * deregistration.
+ */
+ if (ss->base_cftypes) {
+ ss->base_cftset.cfts = ss->base_cftypes;
+ list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+ }
+}
+
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ /* init base cftset */
+ cgroup_init_cftsets(ss);
+
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
return 0;
}
+ /* init base cftset */
+ cgroup_init_cftsets(ss);
+
/*
* need to register a subsys id before anything else - for example,
* init_cgroup_css needs it.
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp)
}
/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+ do {
+ int v = css_refcnt(css);
+
+ if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ return true;
+ cpu_relax();
+ } while (!test_bit(CSS_REMOVED, &css->flags));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
- int val;
+
rcu_read_lock();
- val = atomic_sub_return(count, &css->refcnt);
- if (val == 1) {
+ atomic_dec(&css->refcnt);
+ switch (css_refcnt(css)) {
+ case 1:
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
cgroup_wakeup_rmdir_waiter(cgrp);
+ break;
+ case 0:
+ if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+ schedule_work(&css->dput_work);
+ break;
}
rcu_read_unlock();
- WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
* on this or this is under rcu_read_lock(). Once css->id is allocated,
* it's unchanged until freed.
*/
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, css_refcnt(css));
if (cssid)
return cssid->id;
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
{
struct css_id *cssid;
- cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+ cssid = rcu_dereference_check(css->id, css_refcnt(css));
if (cssid)
return cssid->depth;
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = {
.name = "releasable",
.read_u64 = releasable_read,
},
-};
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, debug_files,
- ARRAY_SIZE(debug_files));
-}
+ { } /* terminate */
+};
struct cgroup_subsys debug_subsys = {
.name = "debug",
.create = debug_create,
.destroy = debug_destroy,
- .populate = debug_populate,
.subsys_id = debug_subsys_id,
+ .base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
static struct cftype files[] = {
{
.name = "state",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = freezer_read,
.write_string = freezer_write,
},
+ { } /* terminate */
};
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
- if (!cgroup->parent)
- return 0;
- return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
struct cgroup_subsys freezer_subsys = {
.name = "freezer",
.create = freezer_create,
.destroy = freezer_destroy,
- .populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
.fork = freezer_fork,
+ .base_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+ compat_old_sigset_t __user *nset,
+ compat_old_sigset_t __user *oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
#endif
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&newset, &newset32);
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- current->saved_sigmask = current->blocked;
- set_current_blocked(&newset);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
+ return sigsuspend(&newset);
}
#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..0e6353cf147a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include "smpboot.h"
+
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct task_struct *idle;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
cpu_hotplug_begin();
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
}
/* Arch-specific enabling code. */
- ret = __cpu_up(cpu);
+ ret = __cpu_up(cpu, idle);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
cpu_hotplug_done();
return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba2..8c8bd652dd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
- .name = "memory_pressure_enabled",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
-};
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- int err;
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
- err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
- if (err)
- return err;
- /* memory_pressure_enabled is in root cpuset only */
- if (!cont->parent)
- err = cgroup_add_file(cont, ss,
- &cft_memory_pressure_enabled);
- return err;
-}
+ { } /* terminate */
+};
/*
* post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
.attach = cpuset_attach,
- .populate = cpuset_populate,
.post_clone = cpuset_post_clone,
.subsys_id = cpuset_subsys_id,
+ .base_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/cred.c b/kernel/cred.c
index e70683d9ec32..430557ea488f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,6 +49,14 @@ struct cred init_cred = {
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -303,6 +312,7 @@ struct cred *prepare_creds(void)
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
+ get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
key_get(new->thread_keyring);
@@ -414,11 +424,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
goto error_put;
}
- /* cache user_ns in cred. Doesn't need a refcount because it will
- * stay pinned by cred->user
- */
- new->user_ns = new->user->user_ns;
-
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
@@ -493,10 +498,10 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (old->euid != new->euid ||
- old->egid != new->egid ||
- old->fsuid != new->fsuid ||
- old->fsgid != new->fsgid ||
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
@@ -505,9 +510,9 @@ int commit_creds(struct cred *new)
}
/* alter the thread keyring */
- if (new->fsuid != old->fsuid)
+ if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(task);
- if (new->fsgid != old->fsgid)
+ if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(task);
/* do it
@@ -524,16 +529,16 @@ int commit_creds(struct cred *new)
alter_cred_subscribers(old, -2);
/* send notifications */
- if (new->uid != old->uid ||
- new->euid != old->euid ||
- new->suid != old->suid ||
- new->fsuid != old->fsuid)
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
- if (new->gid != old->gid ||
- new->egid != old->egid ||
- new->sgid != old->sgid ||
- new->fsgid != old->fsgid)
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
@@ -678,6 +683,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
+ get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf4..103f5d147b2f 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
endif
obj-y := core.o ring_buffer.o callchain.o
+
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd126f82b57c..5b06cbbf6931 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
if (rctx < 0)
return;
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
.data = record,
};
- perf_sample_data_init(&data, addr);
+ perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
- perf_sample_data_init(&sample, bp->attr.bp_addr);
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
perf_swevent_event(bp, 1, &sample, regs);
@@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event->pmu->read(event);
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs = get_irq_regs();
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (perf_event_overflow(event, &data, regs))
+ if (__perf_event_overflow(event, 1, &data, regs))
ret = HRTIMER_NORESTART;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 000000000000..985be4d80fe8
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1667 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h> /* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rmap.h> /* anon_vma_prepare */
+#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/ptrace.h> /* user_enable_single_step */
+#include <linux/kdebug.h> /* notifier mechanism */
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
+
+static struct srcu_struct uprobes_srcu;
+static struct rb_root uprobes_tree = RB_ROOT;
+
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+
+#define UPROBES_HASH_SZ 13
+
+/* serialize (un)register */
+static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
+
+#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+/*
+ * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
+ * events active at this time. Probably a fine grained per inode count is
+ * better?
+ */
+static atomic_t uprobe_events = ATOMIC_INIT(0);
+
+/*
+ * Maintain a temporary per vma info that can be used to search if a vma
+ * has already been handled. This structure is introduced since extending
+ * vm_area_struct wasnt recommended.
+ */
+struct vma_info {
+ struct list_head probe_list;
+ struct mm_struct *mm;
+ loff_t vaddr;
+};
+
+struct uprobe {
+ struct rb_node rb_node; /* node in the rb tree */
+ atomic_t ref;
+ struct rw_semaphore consumer_rwsem;
+ struct list_head pending_list;
+ struct uprobe_consumer *consumers;
+ struct inode *inode; /* Also hold a ref to inode */
+ loff_t offset;
+ int flags;
+ struct arch_uprobe arch;
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ * - is_register: indicates if we are in register context.
+ * - Return 1 if the specified virtual address is in an
+ * executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+ if (!vma->vm_file)
+ return false;
+
+ if (!is_register)
+ return true;
+
+ if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
+ return true;
+
+ return false;
+}
+
+static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
+{
+ loff_t vaddr;
+
+ vaddr = vma->vm_start + offset;
+ vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+
+ return vaddr;
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma: vma that holds the pte pointing to page
+ * @page: the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int err = -EFAULT;
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ goto out;
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!ptep)
+ goto out;
+
+ get_page(kpage);
+ page_add_new_anon_rmap(kpage, vma, addr);
+
+ if (!PageAnon(page)) {
+ dec_mm_counter(mm, MM_FILEPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ put_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+ return *insn == UPROBE_SWBP_INSN;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify read_opcode /
+ * write_opcode accordingly. This would never be a problem for archs that
+ * have fixed length instructions.
+ */
+
+/*
+ * write_opcode - write the opcode at a given virtual address.
+ * @auprobe: arch breakpointing information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm).
+ *
+ * For mm @mm, write the opcode at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr, uprobe_opcode_t opcode)
+{
+ struct page *old_page, *new_page;
+ struct address_space *mapping;
+ void *vaddr_old, *vaddr_new;
+ struct vm_area_struct *vma;
+ struct uprobe *uprobe;
+ loff_t addr;
+ int ret;
+
+ /* Read the page with vaddr into memory */
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+ if (ret <= 0)
+ return ret;
+
+ ret = -EINVAL;
+
+ /*
+ * We are interested in text pages only. Our pages of interest
+ * should be mapped for read and execute only. We desist from
+ * adding probes in write mapped pages since the breakpoints
+ * might end up in the file copy.
+ */
+ if (!valid_vma(vma, is_swbp_insn(&opcode)))
+ goto put_out;
+
+ uprobe = container_of(auprobe, struct uprobe, arch);
+ mapping = uprobe->inode->i_mapping;
+ if (mapping != vma->vm_file->f_mapping)
+ goto put_out;
+
+ addr = vma_address(vma, uprobe->offset);
+ if (vaddr != (unsigned long)addr)
+ goto put_out;
+
+ ret = -ENOMEM;
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (!new_page)
+ goto put_out;
+
+ __SetPageUptodate(new_page);
+
+ /*
+ * lock page will serialize against do_wp_page()'s
+ * PageAnon() handling
+ */
+ lock_page(old_page);
+ /* copy the page now that we've got it stable */
+ vaddr_old = kmap_atomic(old_page);
+ vaddr_new = kmap_atomic(new_page);
+
+ memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+
+ /* poke the new insn in, ASSUMES we don't cross page boundary */
+ vaddr &= ~PAGE_MASK;
+ BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+ memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ kunmap_atomic(vaddr_new);
+ kunmap_atomic(vaddr_old);
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto unlock_out;
+
+ lock_page(new_page);
+ ret = __replace_page(vma, old_page, new_page);
+ unlock_page(new_page);
+
+unlock_out:
+ unlock_page(old_page);
+ page_cache_release(new_page);
+
+put_out:
+ put_page(old_page);
+
+ return ret;
+}
+
+/**
+ * read_opcode - read the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to read the opcode.
+ * @opcode: location to store the read opcode.
+ *
+ * Called with mm->mmap_sem held (for read and with a reference to
+ * mm.
+ *
+ * For mm @mm, read the opcode at @vaddr and store it in @opcode.
+ * Return 0 (success) or a negative errno.
+ */
+static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
+{
+ struct page *page;
+ void *vaddr_new;
+ int ret;
+
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+ if (ret <= 0)
+ return ret;
+
+ lock_page(page);
+ vaddr_new = kmap_atomic(page);
+ vaddr &= ~PAGE_MASK;
+ memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
+ kunmap_atomic(vaddr_new);
+ unlock_page(page);
+
+ put_page(page);
+
+ return 0;
+}
+
+static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t opcode;
+ int result;
+
+ result = read_opcode(mm, vaddr, &opcode);
+ if (result)
+ return result;
+
+ if (is_swbp_insn(&opcode))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ int result;
+
+ result = is_swbp_at_addr(mm, vaddr);
+ if (result == 1)
+ return -EEXIST;
+
+ if (result)
+ return result;
+
+ return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ * @verify: if true, verify existance of breakpoint instruction.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
+{
+ if (verify) {
+ int result;
+
+ result = is_swbp_at_addr(mm, vaddr);
+ if (!result)
+ return -EINVAL;
+
+ if (result != 1)
+ return result;
+ }
+ return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+ if (l->inode < r->inode)
+ return -1;
+
+ if (l->inode > r->inode)
+ return 1;
+
+ if (l->offset < r->offset)
+ return -1;
+
+ if (l->offset > r->offset)
+ return 1;
+
+ return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe u = { .inode = inode, .offset = offset };
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+ if (!match) {
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+ }
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe;
+ unsigned long flags;
+
+ spin_lock_irqsave(&uprobes_treelock, flags);
+ uprobe = __find_uprobe(inode, offset);
+ spin_unlock_irqrestore(&uprobes_treelock, flags);
+
+ return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+ struct rb_node **p = &uprobes_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct uprobe *u;
+ int match;
+
+ while (*p) {
+ parent = *p;
+ u = rb_entry(parent, struct uprobe, rb_node);
+ match = match_uprobe(uprobe, u);
+ if (!match) {
+ atomic_inc(&u->ref);
+ return u;
+ }
+
+ if (match < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ }
+
+ u = NULL;
+ rb_link_node(&uprobe->rb_node, parent, p);
+ rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+ /* get access + creation ref */
+ atomic_set(&uprobe->ref, 2);
+
+ return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ * increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ * get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+ unsigned long flags;
+ struct uprobe *u;
+
+ spin_lock_irqsave(&uprobes_treelock, flags);
+ u = __insert_uprobe(uprobe);
+ spin_unlock_irqrestore(&uprobes_treelock, flags);
+
+ /* For now assume that the instruction need not be single-stepped */
+ uprobe->flags |= UPROBE_SKIP_SSTEP;
+
+ return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe, *cur_uprobe;
+
+ uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+ if (!uprobe)
+ return NULL;
+
+ uprobe->inode = igrab(inode);
+ uprobe->offset = offset;
+ init_rwsem(&uprobe->consumer_rwsem);
+ INIT_LIST_HEAD(&uprobe->pending_list);
+
+ /* add to uprobes_tree, sorted on inode:offset */
+ cur_uprobe = insert_uprobe(uprobe);
+
+ /* a uprobe exists for this inode:offset combination */
+ if (cur_uprobe) {
+ kfree(uprobe);
+ uprobe = cur_uprobe;
+ iput(inode);
+ } else {
+ atomic_inc(&uprobe_events);
+ }
+
+ return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+
+ if (!(uprobe->flags & UPROBE_RUN_HANDLER))
+ return;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (!uc->filter || uc->filter(uc, current))
+ uc->handler(uc, regs);
+ }
+ up_read(&uprobe->consumer_rwsem);
+}
+
+/* Returns the previous consumer */
+static struct uprobe_consumer *
+consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ down_write(&uprobe->consumer_rwsem);
+ uc->next = uprobe->consumers;
+ uprobe->consumers = uc;
+ up_write(&uprobe->consumer_rwsem);
+
+ return uc->next;
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ struct uprobe_consumer **con;
+ bool ret = false;
+
+ down_write(&uprobe->consumer_rwsem);
+ for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+ if (*con == uc) {
+ *con = uc->next;
+ ret = true;
+ break;
+ }
+ }
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int
+__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
+ unsigned long nbytes, unsigned long offset)
+{
+ struct file *filp = vma->vm_file;
+ struct page *page;
+ void *vaddr;
+ unsigned long off1;
+ unsigned long idx;
+
+ if (!filp)
+ return -EINVAL;
+
+ idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
+ off1 = offset &= ~PAGE_MASK;
+
+ /*
+ * Ensure that the page that has the original instruction is
+ * populated and in page-cache.
+ */
+ page = read_mapping_page(mapping, idx, filp);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ vaddr = kmap_atomic(page);
+ memcpy(insn, vaddr + off1, nbytes);
+ kunmap_atomic(vaddr);
+ page_cache_release(page);
+
+ return 0;
+}
+
+static int
+copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct address_space *mapping;
+ unsigned long nbytes;
+ int bytes;
+
+ addr &= ~PAGE_MASK;
+ nbytes = PAGE_SIZE - addr;
+ mapping = uprobe->inode->i_mapping;
+
+ /* Instruction at end of binary; copy only available bytes */
+ if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
+ bytes = uprobe->inode->i_size - uprobe->offset;
+ else
+ bytes = MAX_UINSN_BYTES;
+
+ /* Instruction at the page-boundary; copy bytes in second page */
+ if (nbytes < bytes) {
+ if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
+ bytes - nbytes, uprobe->offset + nbytes))
+ return -ENOMEM;
+
+ bytes = nbytes;
+ }
+ return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
+}
+
+/*
+ * How mm->uprobes_state.count gets updated
+ * uprobe_mmap() increments the count if
+ * - it successfully adds a breakpoint.
+ * - it cannot add a breakpoint, but sees that there is a underlying
+ * breakpoint (via a is_swbp_at_addr()).
+ *
+ * uprobe_munmap() decrements the count if
+ * - it sees a underlying breakpoint, (via is_swbp_at_addr)
+ * (Subsequent uprobe_unregister wouldnt find the breakpoint
+ * unless a uprobe_mmap kicks in, since the old vma would be
+ * dropped just after uprobe_munmap.)
+ *
+ * uprobe_register increments the count if:
+ * - it successfully adds a breakpoint.
+ *
+ * uprobe_unregister decrements the count if:
+ * - it sees a underlying breakpoint and removes successfully.
+ * (via is_swbp_at_addr)
+ * (Subsequent uprobe_munmap wouldnt find the breakpoint
+ * since there is no underlying breakpoint after the
+ * breakpoint removal.)
+ */
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+ struct vm_area_struct *vma, loff_t vaddr)
+{
+ unsigned long addr;
+ int ret;
+
+ /*
+ * If probe is being deleted, unregister thread could be done with
+ * the vma-rmap-walk through. Adding a probe now can be fatal since
+ * nobody will be able to cleanup. Also we could be from fork or
+ * mremap path, where the probe might have already been inserted.
+ * Hence behave as if probe already existed.
+ */
+ if (!uprobe->consumers)
+ return -EEXIST;
+
+ addr = (unsigned long)vaddr;
+
+ if (!(uprobe->flags & UPROBE_COPY_INSN)) {
+ ret = copy_insn(uprobe, vma, addr);
+ if (ret)
+ return ret;
+
+ if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ return -EEXIST;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
+ if (ret)
+ return ret;
+
+ uprobe->flags |= UPROBE_COPY_INSN;
+ }
+
+ /*
+ * Ideally, should be updating the probe count after the breakpoint
+ * has been successfully inserted. However a thread could hit the
+ * breakpoint we just inserted even before the probe count is
+ * incremented. If this is the first breakpoint placed, breakpoint
+ * notifier might ignore uprobes and pass the trap to the thread.
+ * Hence increment before and decrement on failure.
+ */
+ atomic_inc(&mm->uprobes_state.count);
+ ret = set_swbp(&uprobe->arch, mm, addr);
+ if (ret)
+ atomic_dec(&mm->uprobes_state.count);
+
+ return ret;
+}
+
+static void
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
+{
+ if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
+ atomic_dec(&mm->uprobes_state.count);
+}
+
+/*
+ * There could be threads that have hit the breakpoint and are entering the
+ * notifier code and trying to acquire the uprobes_treelock. The thread
+ * calling delete_uprobe() that is removing the uprobe from the rb_tree can
+ * race with these threads and might acquire the uprobes_treelock compared
+ * to some of the breakpoint hit threads. In such a case, the breakpoint
+ * hit threads will not find the uprobe. The current unregistering thread
+ * waits till all other threads have hit a breakpoint, to acquire the
+ * uprobes_treelock before the uprobe is removed from the rbtree.
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+ unsigned long flags;
+
+ synchronize_srcu(&uprobes_srcu);
+ spin_lock_irqsave(&uprobes_treelock, flags);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ spin_unlock_irqrestore(&uprobes_treelock, flags);
+ iput(uprobe->inode);
+ put_uprobe(uprobe);
+ atomic_dec(&uprobe_events);
+}
+
+static struct vma_info *
+__find_next_vma_info(struct address_space *mapping, struct list_head *head,
+ struct vma_info *vi, loff_t offset, bool is_register)
+{
+ struct prio_tree_iter iter;
+ struct vm_area_struct *vma;
+ struct vma_info *tmpvi;
+ unsigned long pgoff;
+ int existing_vma;
+ loff_t vaddr;
+
+ pgoff = offset >> PAGE_SHIFT;
+
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ if (!valid_vma(vma, is_register))
+ continue;
+
+ existing_vma = 0;
+ vaddr = vma_address(vma, offset);
+
+ list_for_each_entry(tmpvi, head, probe_list) {
+ if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
+ existing_vma = 1;
+ break;
+ }
+ }
+
+ /*
+ * Another vma needs a probe to be installed. However skip
+ * installing the probe if the vma is about to be unlinked.
+ */
+ if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
+ vi->mm = vma->vm_mm;
+ vi->vaddr = vaddr;
+ list_add(&vi->probe_list, head);
+
+ return vi;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Iterate in the rmap prio tree and find a vma where a probe has not
+ * yet been inserted.
+ */
+static struct vma_info *
+find_next_vma_info(struct address_space *mapping, struct list_head *head,
+ loff_t offset, bool is_register)
+{
+ struct vma_info *vi, *retvi;
+
+ vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
+ if (!vi)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (!retvi)
+ kfree(vi);
+
+ return retvi;
+}
+
+static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+{
+ struct list_head try_list;
+ struct vm_area_struct *vma;
+ struct address_space *mapping;
+ struct vma_info *vi, *tmpvi;
+ struct mm_struct *mm;
+ loff_t vaddr;
+ int ret;
+
+ mapping = uprobe->inode->i_mapping;
+ INIT_LIST_HEAD(&try_list);
+
+ ret = 0;
+
+ for (;;) {
+ vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
+ if (!vi)
+ break;
+
+ if (IS_ERR(vi)) {
+ ret = PTR_ERR(vi);
+ break;
+ }
+
+ mm = vi->mm;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, (unsigned long)vi->vaddr);
+ if (!vma || !valid_vma(vma, is_register)) {
+ list_del(&vi->probe_list);
+ kfree(vi);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ continue;
+ }
+ vaddr = vma_address(vma, uprobe->offset);
+ if (vma->vm_file->f_mapping->host != uprobe->inode ||
+ vaddr != vi->vaddr) {
+ list_del(&vi->probe_list);
+ kfree(vi);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ continue;
+ }
+
+ if (is_register)
+ ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
+ else
+ remove_breakpoint(uprobe, mm, vi->vaddr);
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ if (is_register) {
+ if (ret && ret == -EEXIST)
+ ret = 0;
+ if (ret)
+ break;
+ }
+ }
+
+ list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
+ list_del(&vi->probe_list);
+ kfree(vi);
+ }
+
+ return ret;
+}
+
+static int __uprobe_register(struct uprobe *uprobe)
+{
+ return register_for_each_vma(uprobe, true);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe)
+{
+ if (!register_for_each_vma(uprobe, false))
+ delete_uprobe(uprobe);
+
+ /* TODO : cant unregister? schedule a worker thread */
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+ int ret;
+
+ if (!inode || !uc || uc->next)
+ return -EINVAL;
+
+ if (offset > i_size_read(inode))
+ return -EINVAL;
+
+ ret = 0;
+ mutex_lock(uprobes_hash(inode));
+ uprobe = alloc_uprobe(inode, offset);
+
+ if (uprobe && !consumer_add(uprobe, uc)) {
+ ret = __uprobe_register(uprobe);
+ if (ret) {
+ uprobe->consumers = NULL;
+ __uprobe_unregister(uprobe);
+ } else {
+ uprobe->flags |= UPROBE_RUN_HANDLER;
+ }
+ }
+
+ mutex_unlock(uprobes_hash(inode));
+ put_uprobe(uprobe);
+
+ return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ if (!inode || !uc)
+ return;
+
+ uprobe = find_uprobe(inode, offset);
+ if (!uprobe)
+ return;
+
+ mutex_lock(uprobes_hash(inode));
+
+ if (consumer_del(uprobe, uc)) {
+ if (!uprobe->consumers) {
+ __uprobe_unregister(uprobe);
+ uprobe->flags &= ~UPROBE_RUN_HANDLER;
+ }
+ }
+
+ mutex_unlock(uprobes_hash(inode));
+ if (uprobe)
+ put_uprobe(uprobe);
+}
+
+/*
+ * Of all the nodes that correspond to the given inode, return the node
+ * with the least offset.
+ */
+static struct rb_node *find_least_offset_node(struct inode *inode)
+{
+ struct uprobe u = { .inode = inode, .offset = 0};
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct rb_node *close_node = NULL;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+
+ if (uprobe->inode == inode)
+ close_node = n;
+
+ if (!match)
+ return close_node;
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+
+ return close_node;
+}
+
+/*
+ * For a given inode, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode, struct list_head *head)
+{
+ struct uprobe *uprobe;
+ unsigned long flags;
+ struct rb_node *n;
+
+ spin_lock_irqsave(&uprobes_treelock, flags);
+
+ n = find_least_offset_node(inode);
+
+ for (; n; n = rb_next(n)) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ if (uprobe->inode != inode)
+ break;
+
+ list_add(&uprobe->pending_list, head);
+ atomic_inc(&uprobe->ref);
+ }
+
+ spin_unlock_irqrestore(&uprobes_treelock, flags);
+}
+
+/*
+ * Called from mmap_region.
+ * called with mm->mmap_sem acquired.
+ *
+ * Return -ve no if we fail to insert probes and we cannot
+ * bail-out.
+ * Return 0 otherwise. i.e:
+ *
+ * - successful insertion of probes
+ * - (or) no possible probes to be inserted.
+ * - (or) insertion of probes failed but we can bail-out.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+ int ret, count;
+
+ if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+ return 0;
+
+ inode = vma->vm_file->f_mapping->host;
+ if (!inode)
+ return 0;
+
+ INIT_LIST_HEAD(&tmp_list);
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, &tmp_list);
+
+ ret = 0;
+ count = 0;
+
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ loff_t vaddr;
+
+ list_del(&uprobe->pending_list);
+ if (!ret) {
+ vaddr = vma_address(vma, uprobe->offset);
+
+ if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
+ put_uprobe(uprobe);
+ continue;
+ }
+
+ ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+
+ /* Ignore double add: */
+ if (ret == -EEXIST) {
+ ret = 0;
+
+ if (!is_swbp_at_addr(vma->vm_mm, vaddr))
+ continue;
+
+ /*
+ * Unable to insert a breakpoint, but
+ * breakpoint lies underneath. Increment the
+ * probe count.
+ */
+ atomic_inc(&vma->vm_mm->uprobes_state.count);
+ }
+
+ if (!ret)
+ count++;
+ }
+ put_uprobe(uprobe);
+ }
+
+ mutex_unlock(uprobes_mmap_hash(inode));
+
+ if (ret)
+ atomic_sub(count, &vma->vm_mm->uprobes_state.count);
+
+ return ret;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+
+ if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+ return;
+
+ if (!atomic_read(&vma->vm_mm->uprobes_state.count))
+ return;
+
+ inode = vma->vm_file->f_mapping->host;
+ if (!inode)
+ return;
+
+ INIT_LIST_HEAD(&tmp_list);
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, &tmp_list);
+
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ loff_t vaddr;
+
+ list_del(&uprobe->pending_list);
+ vaddr = vma_address(vma, uprobe->offset);
+
+ if (vaddr >= start && vaddr < end) {
+ /*
+ * An unregister could have removed the probe before
+ * unmap. So check before we decrement the count.
+ */
+ if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
+ atomic_dec(&vma->vm_mm->uprobes_state.count);
+ }
+ put_uprobe(uprobe);
+ }
+ mutex_unlock(uprobes_mmap_hash(inode));
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct xol_area *area)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ return -ENOMEM;
+
+ ret = -EALREADY;
+ mm = current->mm;
+
+ down_write(&mm->mmap_sem);
+ if (mm->uprobes_state.xol_area)
+ goto fail;
+
+ ret = -ENOMEM;
+
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
+
+ ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+ if (ret)
+ goto fail;
+
+ smp_wmb(); /* pairs with get_xol_area() */
+ mm->uprobes_state.xol_area = area;
+ ret = 0;
+
+fail:
+ up_write(&mm->mmap_sem);
+ if (ret)
+ __free_page(area->page);
+
+ return ret;
+}
+
+static struct xol_area *get_xol_area(struct mm_struct *mm)
+{
+ struct xol_area *area;
+
+ area = mm->uprobes_state.xol_area;
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+
+ return area;
+}
+
+/*
+ * xol_alloc_area - Allocate process's xol_area.
+ * This area will be used for storing instructions for execution out of
+ * line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *xol_alloc_area(void)
+{
+ struct xol_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ return NULL;
+
+ area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+
+ if (!area->bitmap)
+ goto fail;
+
+ init_waitqueue_head(&area->wq);
+ if (!xol_add_vma(area))
+ return area;
+
+fail:
+ kfree(area->bitmap);
+ kfree(area);
+
+ return get_xol_area(current->mm);
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ if (!area)
+ return;
+
+ put_page(area->page);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
+/*
+ * uprobe_reset_state - Free the area allocated for slots.
+ */
+void uprobe_reset_state(struct mm_struct *mm)
+{
+ mm->uprobes_state.xol_area = NULL;
+ atomic_set(&mm->uprobes_state.count, 0);
+}
+
+/*
+ * - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+ unsigned long slot_addr;
+ int slot_nr;
+
+ do {
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ break;
+
+ slot_nr = UINSNS_PER_PAGE;
+ continue;
+ }
+ wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ } while (slot_nr >= UINSNS_PER_PAGE);
+
+ slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+ atomic_inc(&area->slot_count);
+
+ return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - If was not allocated a slot, then
+ * allocate a slot.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+{
+ struct xol_area *area;
+ unsigned long offset;
+ void *vaddr;
+
+ area = get_xol_area(current->mm);
+ if (!area) {
+ area = xol_alloc_area();
+ if (!area)
+ return 0;
+ }
+ current->utask->xol_vaddr = xol_take_insn_slot(area);
+
+ /*
+ * Initialize the slot if xol_vaddr points to valid
+ * instruction slot.
+ */
+ if (unlikely(!current->utask->xol_vaddr))
+ return 0;
+
+ current->utask->vaddr = slot_addr;
+ offset = current->utask->xol_vaddr & ~PAGE_MASK;
+ vaddr = kmap_atomic(area->page);
+ memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
+ kunmap_atomic(vaddr);
+
+ return current->utask->xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+ struct xol_area *area;
+ unsigned long vma_end;
+ unsigned long slot_addr;
+
+ if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ return;
+
+ slot_addr = tsk->utask->xol_vaddr;
+
+ if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+ return;
+
+ area = tsk->mm->uprobes_state.xol_area;
+ vma_end = area->vaddr + PAGE_SIZE;
+ if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+ unsigned long offset;
+ int slot_nr;
+
+ offset = slot_addr - area->vaddr;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ if (slot_nr >= UINSNS_PER_PAGE)
+ return;
+
+ clear_bit(slot_nr, area->bitmap);
+ atomic_dec(&area->slot_count);
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+
+ tsk->utask->xol_vaddr = 0;
+ }
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+ struct uprobe_task *utask = t->utask;
+
+ if (t->uprobe_srcu_id != -1)
+ srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
+
+ if (!utask)
+ return;
+
+ if (utask->active_uprobe)
+ put_uprobe(utask->active_uprobe);
+
+ xol_free_insn_slot(t);
+ kfree(utask);
+ t->utask = NULL;
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t)
+{
+ t->utask = NULL;
+ t->uprobe_srcu_id = -1;
+}
+
+/*
+ * Allocate a uprobe_task object for the task.
+ * Called when the thread hits a breakpoint for the first time.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *add_utask(void)
+{
+ struct uprobe_task *utask;
+
+ utask = kzalloc(sizeof *utask, GFP_KERNEL);
+ if (unlikely(!utask))
+ return NULL;
+
+ utask->active_uprobe = NULL;
+ current->utask = utask;
+ return utask;
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+{
+ if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
+ return 0;
+
+ return -EFAULT;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+ struct task_struct *t = current;
+ struct uprobe_task *utask = t->utask;
+
+ if (likely(!utask || !utask->active_uprobe))
+ return false;
+
+ WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+ if (signal_pending(t)) {
+ spin_lock_irq(&t->sighand->siglock);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ spin_unlock_irq(&t->sighand->siglock);
+
+ if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+ utask->state = UTASK_SSTEP_TRAPPED;
+ set_tsk_thread_flag(t, TIF_UPROBE);
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Avoid singlestepping the original instruction if the original instruction
+ * is a NOP or can be emulated.
+ */
+static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ return true;
+
+ uprobe->flags &= ~UPROBE_SKIP_SSTEP;
+ return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+ struct uprobe_task *utask;
+ struct uprobe *uprobe;
+ struct mm_struct *mm;
+ unsigned long bp_vaddr;
+
+ uprobe = NULL;
+ bp_vaddr = uprobe_get_swbp_addr(regs);
+ mm = current->mm;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+
+ if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
+ struct inode *inode;
+ loff_t offset;
+
+ inode = vma->vm_file->f_mapping->host;
+ offset = bp_vaddr - vma->vm_start;
+ offset += (vma->vm_pgoff << PAGE_SHIFT);
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
+ current->uprobe_srcu_id = -1;
+ up_read(&mm->mmap_sem);
+
+ if (!uprobe) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ return;
+ }
+
+ utask = current->utask;
+ if (!utask) {
+ utask = add_utask();
+ /* Cannot allocate; re-execute the instruction. */
+ if (!utask)
+ goto cleanup_ret;
+ }
+ utask->active_uprobe = uprobe;
+ handler_chain(uprobe, regs);
+ if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
+ goto cleanup_ret;
+
+ utask->state = UTASK_SSTEP;
+ if (!pre_ssout(uprobe, regs, bp_vaddr)) {
+ user_enable_single_step(current);
+ return;
+ }
+
+cleanup_ret:
+ if (utask) {
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ }
+ if (uprobe) {
+ if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+
+ /*
+ * cannot singlestep; cannot skip instruction;
+ * re-execute the instruction.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+
+ put_uprobe(uprobe);
+ }
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+
+ uprobe = utask->active_uprobe;
+ if (utask->state == UTASK_SSTEP_ACK)
+ arch_uprobe_post_xol(&uprobe->arch, regs);
+ else if (utask->state == UTASK_SSTEP_TRAPPED)
+ arch_uprobe_abort_xol(&uprobe->arch, regs);
+ else
+ WARN_ON_ONCE(1);
+
+ put_uprobe(uprobe);
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ user_disable_single_step(current);
+ xol_free_insn_slot(current);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* see uprobe_deny_signal() */
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on
+ * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
+ * allows the thread to return from interrupt.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
+ * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
+ * interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ utask = current->utask;
+ if (!utask || utask->state == UTASK_BP_HIT)
+ handle_swbp(regs);
+ else
+ handle_singlestep(utask, regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
+ /* task is currently not uprobed */
+ return 0;
+
+ utask = current->utask;
+ if (utask)
+ utask->state = UTASK_BP_HIT;
+
+ set_thread_flag(TIF_UPROBE);
+ current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
+
+ return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (!current->mm || !utask || !utask->active_uprobe)
+ /* task is currently not uprobed */
+ return 0;
+
+ utask->state = UTASK_SSTEP_ACK;
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+ .notifier_call = arch_uprobe_exception_notify,
+ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
+};
+
+static int __init init_uprobes(void)
+{
+ int i;
+
+ for (i = 0; i < UPROBES_HASH_SZ; i++) {
+ mutex_init(&uprobes_mutex[i]);
+ mutex_init(&uprobes_mmap_mutex[i]);
+ }
+ init_srcu_struct(&uprobes_srcu);
+
+ return register_die_notifier(&uprobe_exception_nb);
+}
+module_init(init_uprobes);
+
+static void __exit exit_uprobes(void)
+{
+}
+module_exit(exit_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa7..910a0716e17a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = __task_cred(p)->uid;
+ uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
@@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = task_uid(p);
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b8241..fe35a634bf76 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
+/* Cleared by build time tools if the table is already sorted. */
+u32 __initdata main_extable_sort_needed = 1;
+
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- sort_extable(__start___ex_table, __stop___ex_table);
+ if (main_extable_sort_needed)
+ sort_extable(__start___ex_table, __stop___ex_table);
+ else
+ pr_notice("__ex_table already sorted, skipping sort\n");
}
/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..47b4e4f379f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -47,6 +48,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -67,6 +69,7 @@
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
+#include <linux/uprobes.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -111,32 +114,67 @@ int nr_processes(void)
return total;
}
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct_node(node) \
- kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
-# define free_task_struct(tsk) \
- kmem_cache_free(task_struct_cachep, (tsk))
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+}
+
+void __weak arch_release_task_struct(struct task_struct *tsk) { }
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ arch_release_task_struct(tsk);
+ kmem_cache_free(task_struct_cachep, tsk);
+}
#endif
-#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+void __weak arch_release_thread_info(struct thread_info *ti) { }
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
-#ifdef CONFIG_DEBUG_STACK_USAGE
- gfp_t mask = GFP_KERNEL | __GFP_ZERO;
-#else
- gfp_t mask = GFP_KERNEL;
-#endif
- struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
+ arch_release_thread_info(ti);
free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ arch_release_thread_info(ti);
+ kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -170,6 +208,7 @@ void free_task(struct task_struct *tsk)
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -203,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
-/*
- * macro override instead of weak attribute alias, to workaround
- * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
- */
-#ifndef arch_task_cache_init
-#define arch_task_cache_init()
-#endif
+void __init __weak arch_task_cache_init(void) { }
void __init fork_init(unsigned long mempages)
{
-#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
@@ -260,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
int node = tsk_fork_get_node(orig);
int err;
- prepare_to_copy(orig);
-
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
@@ -421,6 +452,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (retval)
goto out;
+
+ if (file && uprobe_mmap(tmp))
+ goto out;
}
/* a new mm has just been created */
arch_dup_mmap(oldmm, mm);
@@ -569,6 +603,7 @@ void mmput(struct mm_struct *mm)
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
+ uprobe_clear_state(mm);
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
@@ -747,6 +782,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
exit_pi_state_list(tsk);
#endif
+ uprobe_free_utask(tsk);
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
@@ -801,6 +838,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
+ uprobe_reset_state(mm);
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -1162,6 +1200,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1342,6 +1381,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
+ uprobe_copy_process(p);
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1464,6 +1504,8 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
+ if (unlikely(clone_flags & CLONE_NEWPID))
+ pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7ea..6b2588dd04ff 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
group_info->blocks[0] = group_info->small_block;
else {
for (i = 0; i < nblocks; i++) {
- gid_t *b;
+ kgid_t *b;
b = (void *)__get_free_page(GFP_USER);
if (!b)
goto out_undo_partial_alloc;
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free);
static int groups_to_user(gid_t __user *grouplist,
const struct group_info *group_info)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_to_user(grouplist, group_info->blocks[i], len))
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ if (put_user(gid, grouplist+i))
return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
}
return 0;
}
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist,
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
unsigned int count = group_info->ngroups;
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_from_user(group_info->blocks[i], grouplist, len))
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ kgid_t kgid;
+ if (get_user(gid, grouplist+i))
return -EFAULT;
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
+ kgid = make_kgid(user_ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
}
return 0;
}
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
for (base = 0; base < max; base++) {
int left = base;
int right = left + stride;
- gid_t tmp = GROUP_AT(group_info, right);
+ kgid_t tmp = GROUP_AT(group_info, right);
- while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+ while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
GROUP_AT(group_info, right) =
GROUP_AT(group_info, left);
right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
}
/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
+int groups_search(const struct group_info *group_info, kgid_t grp)
{
unsigned int left, right;
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- if (grp > GROUP_AT(group_info, mid))
+ if (gid_gt(grp, GROUP_AT(group_info, mid)))
left = mid + 1;
- else if (grp < GROUP_AT(group_info, mid))
+ else if (gid_lt(grp, GROUP_AT(group_info, mid)))
right = mid;
else
return 1;
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
/*
* Check whether we're fsgid/egid or in the supplemental group..
*/
-int in_group_p(gid_t grp)
+int in_group_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
- if (grp != cred->fsgid)
+ if (!gid_eq(grp, cred->fsgid))
retval = groups_search(cred->group_info, grp);
return retval;
}
EXPORT_SYMBOL(in_group_p);
-int in_egroup_p(gid_t grp)
+int in_egroup_p(kgid_t grp)
{
const struct cred *cred = current_cred();
int retval = 1;
- if (grp != cred->egid)
+ if (!gid_eq(grp, cred->egid))
retval = groups_search(cred->group_info, grp);
return retval;
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2a..6df614912b9d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
- if (sysctl_hung_task_panic)
+ if (sysctl_hung_task_panic) {
+ trigger_all_cpu_backtrace();
panic("hung_task: blocked tasks");
+ }
}
/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c33..fc275e4f629b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
* If its disabled or no action available
* keep it masked and get out of here
*/
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
out_unlock:
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL(handle_edge_irq);
#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
/**
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95eb..192a302d6cfd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569b..bb32326afe87 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
- chip ? (chip->name ? : "unknown") : "unknown");
+ pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
+ chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
ret = 0;
break;
default:
- pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
}
if (unmask)
@@ -837,8 +837,7 @@ void exit_irq_thread(void)
action = kthread_data(tsk);
- printk(KERN_ERR
- "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
desc = irq_to_desc(action->irq);
@@ -878,7 +877,6 @@ static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
- const char *old_name = NULL;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
cpumask_var_t mask;
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
- old_name = old->name;
+ ((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
- }
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* all existing action->thread_mask bits.
*/
new->thread_mask = 1 << ffz(thread_mask);
+
+ } else if (new->handler == irq_default_primary_handler) {
+ /*
+ * The interrupt was requested with handler = NULL, so
+ * we use the default primary handler for it. But it
+ * does not have the oneshot flag set. In combination
+ * with level interrupts this is deadly, because the
+ * default primary handler just wakes the thread, then
+ * the irq lines is reenabled, but the device still
+ * has the level irq asserted. Rinse and repeat....
+ *
+ * While this works for edge type interrupts, we play
+ * it safe and reject unconditionally because we can't
+ * say for sure which type this interrupt really
+ * has. The type flags are unreliable as the
+ * underlying chip implementation can override them.
+ */
+ pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
+ irq);
+ ret = -EINVAL;
+ goto out_mask;
}
if (!shared) {
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+ pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
irq, nmsk, omsk);
}
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
return 0;
mismatch:
-#ifdef CONFIG_DEBUG_SHIRQ
if (!(new->flags & IRQF_PROBE_SHARED)) {
- printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
- if (old_name)
- printk(KERN_ERR "current handler: %s\n", old_name);
+ pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
+ irq, new->flags, new->name, old->flags, old->name);
+#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
- }
#endif
+ }
ret = -EBUSY;
out_mask:
@@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
- /* Currently used only by UML, might disappear one day: */
-#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->irq_data.chip->release)
- desc->irq_data.chip->release(irq, dev_id);
-#endif
-
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action)
irq_shutdown(desc);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a6..cb228bf21760 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
+ /*
+ * Only interrupts which are marked as wakeup source
+ * and have not been disabled before the suspend check
+ * can abort suspend.
+ */
if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->istate & IRQS_PENDING)
+ if (desc->depth == 1 && desc->istate & IRQS_PENDING)
return -EBUSY;
continue;
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c9..6454db7b6a4d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
- * active.
+ * active. Clear the pending bit so suspend/resume does not
+ * get confused.
*/
- if (irq_settings_is_level(desc))
+ if (irq_settings_is_level(desc)) {
+ desc->istate &= ~IRQS_PENDING;
return;
+ }
if (desc->istate & IRQS_REPLAY)
return;
if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index c744b88c44e2..59dcf5b81d24 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
return max;
return len;
}
+EXPORT_SYMBOL(__kfifo_max_r);
#define __KFIFO_PEEK(data, out, mask) \
((data)[(out) & (mask)])
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e425..4edbd9c11aca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info,
goto free_hdr;
}
- if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
+ if (hdr->e_shoff >= len ||
+ hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
err = -ENOEXEC;
goto free_hdr;
}
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod,
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, NULL);
+ -32768, 32767, &ddebug_dyndbg_module_param_cb);
if (err < 0)
goto unlink;
diff --git a/kernel/params.c b/kernel/params.c
index f37d82631347..ed35345be536 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
static int parse_one(char *param,
char *val,
+ const char *doing,
const struct kernel_param *params,
unsigned num_params,
s16 min_level,
s16 max_level,
- int (*handle_unknown)(char *param, char *val))
+ int (*handle_unknown)(char *param, char *val,
+ const char *doing))
{
unsigned int i;
int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
if (!val && params[i].ops->set != param_set_bool
&& params[i].ops->set != param_set_bint)
return -EINVAL;
- pr_debug("They are equal! Calling %p\n",
- params[i].ops->set);
+ pr_debug("handling %s with %p\n", param,
+ params[i].ops->set);
mutex_lock(&param_lock);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
}
if (handle_unknown) {
- pr_debug("Unknown argument: calling %p\n", handle_unknown);
- return handle_unknown(param, val);
+ pr_debug("doing %s: %s='%s'\n", doing, param, val);
+ return handle_unknown(param, val, doing);
}
- pr_debug("Unknown argument `%s'\n", param);
+ pr_debug("Unknown argument '%s'\n", param);
return -ENOENT;
}
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
-int parse_args(const char *name,
+int parse_args(const char *doing,
char *args,
const struct kernel_param *params,
unsigned num,
s16 min_level,
s16 max_level,
- int (*unknown)(char *param, char *val))
+ int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
- pr_debug("Parsing ARGS: %s\n", args);
-
/* Chew leading spaces */
args = skip_spaces(args);
+ if (*args)
+ pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
+
while (*args) {
int ret;
int irq_was_disabled;
args = next_arg(args, &param, &val);
irq_was_disabled = irqs_disabled();
- ret = parse_one(param, val, params, num,
+ ret = parse_one(param, val, doing, params, num,
min_level, max_level, unknown);
- if (irq_was_disabled && !irqs_disabled()) {
- printk(KERN_WARNING "parse_args(): option '%s' enabled "
- "irq's!\n", param);
- }
+ if (irq_was_disabled && !irqs_disabled())
+ pr_warn("%s: option '%s' enabled irq's!\n",
+ doing, param);
+
switch (ret) {
case -ENOENT:
- printk(KERN_ERR "%s: Unknown parameter `%s'\n",
- name, param);
+ pr_err("%s: Unknown parameter `%s'\n", doing, param);
return ret;
case -ENOSPC:
- printk(KERN_ERR
- "%s: `%s' too large for parameter `%s'\n",
- name, val ?: "", param);
+ pr_err("%s: `%s' too large for parameter `%s'\n",
+ doing, val ?: "", param);
return ret;
case 0:
break;
default:
- printk(KERN_ERR
- "%s: `%s' invalid for parameter `%s'\n",
- name, val ?: "", param);
+ pr_err("%s: `%s' invalid for parameter `%s'\n",
+ doing, val ?: "", param);
return ret;
}
}
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {
- printk(KERN_ERR "%s: string parameter too long\n",
- kp->name);
+ pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
@@ -400,8 +399,7 @@ static int param_array(const char *name,
int len;
if (*num == max) {
- printk(KERN_ERR "%s: can only take %i arguments\n",
- name, max);
+ pr_err("%s: can only take %i arguments\n", name, max);
return -EINVAL;
}
len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
} while (save == ',');
if (*num < min) {
- printk(KERN_ERR "%s: needs at least %i arguments\n",
- name, min);
+ pr_err("%s: needs at least %i arguments\n", name, min);
return -EINVAL;
}
return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
const struct kparam_string *kps = kp->str;
if (strlen(val)+1 > kps->maxlen) {
- printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+ pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
#endif
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR
- "Module '%s' failed add to sysfs, error number %d\n",
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
name, err);
- printk(KERN_ERR
- "The system will be unstable now.\n");
return NULL;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 9f08dfabaf13..e86b291ad834 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -547,7 +547,8 @@ void __init pidhash_init(void)
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
- &pidhash_shift, NULL, 4096);
+ &pidhash_shift, NULL,
+ 0, 4096);
pidhash_size = 1U << pidhash_shift;
for (i = 0; i < pidhash_size; i++)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e3216..8f9b4eb974e0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP
select HOTPLUG
select HOTPLUG_CPU
+config PM_AUTOSLEEP
+ bool "Opportunistic sleep"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow the kernel to trigger a system transition into a global sleep
+ state automatically whenever there are no active wakeup sources.
+
+config PM_WAKELOCKS
+ bool "User space wakeup sources interface"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow user space to create, activate and deactivate wakeup source
+ objects with the help of a sysfs-based interface.
+
+config PM_WAKELOCKS_LIMIT
+ int "Maximum number of user space wakeup sources (0 = no limit)"
+ range 0 100000
+ default 100
+ depends on PM_WAKELOCKS
+
+config PM_WAKELOCKS_GC
+ bool "Garbage collector for user space wakeup sources"
+ depends on PM_WAKELOCKS
+ default y
+
config PM_RUNTIME
bool "Run-time PM core functionality"
depends on !IA64_HP_SIM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 66d808ec5252..29472bff11ef 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
+obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
+obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 000000000000..ca304046d9e2
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,127 @@
+/*
+ * kernel/power/autosleep.c
+ *
+ * Opportunistic sleep support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/pm_wakeup.h>
+
+#include "power.h"
+
+static suspend_state_t autosleep_state;
+static struct workqueue_struct *autosleep_wq;
+/*
+ * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
+ * is active, otherwise a deadlock with try_to_suspend() is possible.
+ * Alternatively mutex_lock_interruptible() can be used. This will then fail
+ * if an auto_sleep cycle tries to freeze processes.
+ */
+static DEFINE_MUTEX(autosleep_lock);
+static struct wakeup_source *autosleep_ws;
+
+static void try_to_suspend(struct work_struct *work)
+{
+ unsigned int initial_count, final_count;
+
+ if (!pm_get_wakeup_count(&initial_count, true))
+ goto out;
+
+ mutex_lock(&autosleep_lock);
+
+ if (!pm_save_wakeup_count(initial_count)) {
+ mutex_unlock(&autosleep_lock);
+ goto out;
+ }
+
+ if (autosleep_state == PM_SUSPEND_ON) {
+ mutex_unlock(&autosleep_lock);
+ return;
+ }
+ if (autosleep_state >= PM_SUSPEND_MAX)
+ hibernate();
+ else
+ pm_suspend(autosleep_state);
+
+ mutex_unlock(&autosleep_lock);
+
+ if (!pm_get_wakeup_count(&final_count, false))
+ goto out;
+
+ /*
+ * If the wakeup occured for an unknown reason, wait to prevent the
+ * system from trying to suspend and waking up in a tight loop.
+ */
+ if (final_count == initial_count)
+ schedule_timeout_uninterruptible(HZ / 2);
+
+ out:
+ queue_up_suspend_work();
+}
+
+static DECLARE_WORK(suspend_work, try_to_suspend);
+
+void queue_up_suspend_work(void)
+{
+ if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+ queue_work(autosleep_wq, &suspend_work);
+}
+
+suspend_state_t pm_autosleep_state(void)
+{
+ return autosleep_state;
+}
+
+int pm_autosleep_lock(void)
+{
+ return mutex_lock_interruptible(&autosleep_lock);
+}
+
+void pm_autosleep_unlock(void)
+{
+ mutex_unlock(&autosleep_lock);
+}
+
+int pm_autosleep_set_state(suspend_state_t state)
+{
+
+#ifndef CONFIG_HIBERNATION
+ if (state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+#endif
+
+ __pm_stay_awake(autosleep_ws);
+
+ mutex_lock(&autosleep_lock);
+
+ autosleep_state = state;
+
+ __pm_relax(autosleep_ws);
+
+ if (state > PM_SUSPEND_ON) {
+ pm_wakep_autosleep_enabled(true);
+ queue_up_suspend_work();
+ } else {
+ pm_wakep_autosleep_enabled(false);
+ }
+
+ mutex_unlock(&autosleep_lock);
+ return 0;
+}
+
+int __init pm_autosleep_init(void)
+{
+ autosleep_ws = wakeup_source_register("autosleep");
+ if (!autosleep_ws)
+ return -ENOMEM;
+
+ autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
+ if (autosleep_wq)
+ return 0;
+
+ wakeup_source_unregister(autosleep_ws);
+ return -ENOMEM;
+}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e09dfbfeecee..8b53db38a279 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,6 +25,8 @@
#include <linux/freezer.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
#include <scsi/scsi_scan.h>
#include "power.h"
@@ -722,6 +724,17 @@ static int software_resume(void)
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
+
+ /*
+ * name_to_dev_t is ineffective to verify parition if resume_file is in
+ * integer format. (e.g. major:minor)
+ */
+ if (isdigit(resume_file[0]) && resume_wait) {
+ int partno;
+ while (!get_gendisk(swsusp_resume_device, &partno))
+ msleep(10);
+ }
+
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c12581f1c62..428f8a034e96 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
return (s - buf);
}
-static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
+static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
#endif
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- /* First, check if we are requested to hibernate */
- if (len == 4 && !strncmp(buf, "disk", len)) {
- error = hibernate();
- goto Exit;
- }
+ /* Check hibernation first. */
+ if (len == 4 && !strncmp(buf, "disk", len))
+ return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
- error = pm_suspend(state);
- break;
- }
- }
+ for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
+ if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
+ return state;
#endif
- Exit:
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_state(buf, n);
+ if (state < PM_SUSPEND_MAX)
+ error = pm_suspend(state);
+ else if (state == PM_SUSPEND_MAX)
+ error = hibernate();
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
return error ? error : n;
}
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
{
unsigned int val;
- return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
+ return pm_get_wakeup_count(&val, true) ?
+ sprintf(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
const char *buf, size_t n)
{
unsigned int val;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+ error = -EINVAL;
if (sscanf(buf, "%u", &val) == 1) {
if (pm_save_wakeup_count(val))
- return n;
+ error = n;
}
- return -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error;
}
power_attr(wakeup_count);
+
+#ifdef CONFIG_PM_AUTOSLEEP
+static ssize_t autosleep_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ suspend_state_t state = pm_autosleep_state();
+
+ if (state == PM_SUSPEND_ON)
+ return sprintf(buf, "off\n");
+
+#ifdef CONFIG_SUSPEND
+ if (state < PM_SUSPEND_MAX)
+ return sprintf(buf, "%s\n", valid_state(state) ?
+ pm_states[state] : "error");
+#endif
+#ifdef CONFIG_HIBERNATION
+ return sprintf(buf, "disk\n");
+#else
+ return sprintf(buf, "error");
+#endif
+}
+
+static ssize_t autosleep_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state = decode_state(buf, n);
+ int error;
+
+ if (state == PM_SUSPEND_ON
+ && strcmp(buf, "off") && strcmp(buf, "off\n"))
+ return -EINVAL;
+
+ error = pm_autosleep_set_state(state);
+ return error ? error : n;
+}
+
+power_attr(autosleep);
+#endif /* CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+static ssize_t wake_lock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, true);
+}
+
+static ssize_t wake_lock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_lock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_lock);
+
+static ssize_t wake_unlock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, false);
+}
+
+static ssize_t wake_unlock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_unlock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_unlock);
+
+#endif /* CONFIG_PM_WAKELOCKS */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_TRACE
@@ -409,6 +521,13 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_PM_AUTOSLEEP
+ &autosleep_attr.attr,
+#endif
+#ifdef CONFIG_PM_WAKELOCKS
+ &wake_lock_attr.attr,
+ &wake_unlock_attr.attr,
+#endif
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
@@ -444,7 +563,10 @@ static int __init pm_init(void)
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- return sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return error;
+ return pm_autosleep_init();
}
core_initcall(pm_init);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98f3622d7407..b0bd4beaebfe 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void)
{
}
#endif
+
+#ifdef CONFIG_PM_AUTOSLEEP
+
+/* kernel/power/autosleep.c */
+extern int pm_autosleep_init(void);
+extern int pm_autosleep_lock(void);
+extern void pm_autosleep_unlock(void);
+extern suspend_state_t pm_autosleep_state(void);
+extern int pm_autosleep_set_state(suspend_state_t state);
+
+#else /* !CONFIG_PM_AUTOSLEEP */
+
+static inline int pm_autosleep_init(void) { return 0; }
+static inline int pm_autosleep_lock(void) { return 0; }
+static inline void pm_autosleep_unlock(void) {}
+static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
+
+#endif /* !CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+
+/* kernel/power/wakelock.c */
+extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
+extern int pm_wake_lock(const char *buf);
+extern int pm_wake_unlock(const char *buf);
+
+#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index eef311a58a64..11e22c068e8b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
*
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
+ * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
return -ENOSPC;
if (bio_chain) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- }
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
- if (error)
- goto out;
- handle->reqd_free_pages = reqd_free_pages();
+
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ /*
+ * Recalculate the number of required free pages, to
+ * make sure we never take more than half.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+ }
}
out:
return error;
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
/* Maximum number of threads for compression/decompression. */
#define LZO_THREADS 3
-/* Maximum number of pages for read buffering. */
-#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
+/* Minimum/maximum number of pages for read buffering. */
+#define LZO_MIN_RD_PAGES 1024
+#define LZO_MAX_RD_PAGES 8192
/**
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of free pages after all allocations have been done.
- * We don't want to run out of pages when writing.
- */
- handle->reqd_free_pages = reqd_free_pages();
-
- /*
* Start the CRC32 thread.
*/
init_waitqueue_head(&crc->go);
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
+ /*
+ * Adjust the number of required free pages after all allocations have
+ * been done. We don't want to run out of pages when writing.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+
printk(KERN_INFO
"PM: Using %u thread(s) for compression.\n"
"PM: Compressing and saving image data (%u pages) ... ",
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned i, thr, run_threads, nr_threads;
unsigned ring = 0, pg = 0, ring_size = 0,
have = 0, want, need, asked = 0;
- unsigned long read_pages;
+ unsigned long read_pages = 0;
unsigned char **page = NULL;
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of pages for read buffering, in case we are short.
+ * Set the number of pages for read buffering.
+ * This is complete guesswork, because we'll only know the real
+ * picture once prepare_image() is called, which is much later on
+ * during the image load phase. We'll assume the worst case and
+ * say that none of the image pages are from high memory.
*/
- read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
- read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
+ if (low_free_pages() > snapshot_get_image_size())
+ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
+ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
__GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT);
+ __GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 000000000000..c8fba3380076
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,259 @@
+/*
+ * kernel/power/wakelock.c
+ *
+ * User space wakeup sources support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This code is based on the analogous interface allowing user space to
+ * manipulate wakelocks on Android.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+static DEFINE_MUTEX(wakelocks_lock);
+
+struct wakelock {
+ char *name;
+ struct rb_node node;
+ struct wakeup_source ws;
+#ifdef CONFIG_PM_WAKELOCKS_GC
+ struct list_head lru;
+#endif
+};
+
+static struct rb_root wakelocks_tree = RB_ROOT;
+
+ssize_t pm_show_wakelocks(char *buf, bool show_active)
+{
+ struct rb_node *node;
+ struct wakelock *wl;
+ char *str = buf;
+ char *end = buf + PAGE_SIZE;
+
+ mutex_lock(&wakelocks_lock);
+
+ for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
+ wl = rb_entry(node, struct wakelock, node);
+ if (wl->ws.active == show_active)
+ str += scnprintf(str, end - str, "%s ", wl->name);
+ }
+ if (str > buf)
+ str--;
+
+ str += scnprintf(str, end - str, "\n");
+
+ mutex_unlock(&wakelocks_lock);
+ return (str - buf);
+}
+
+#if CONFIG_PM_WAKELOCKS_LIMIT > 0
+static unsigned int number_of_wakelocks;
+
+static inline bool wakelocks_limit_exceeded(void)
+{
+ return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
+}
+
+static inline void increment_wakelocks_number(void)
+{
+ number_of_wakelocks++;
+}
+
+static inline void decrement_wakelocks_number(void)
+{
+ number_of_wakelocks--;
+}
+#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
+static inline bool wakelocks_limit_exceeded(void) { return false; }
+static inline void increment_wakelocks_number(void) {}
+static inline void decrement_wakelocks_number(void) {}
+#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
+
+#ifdef CONFIG_PM_WAKELOCKS_GC
+#define WL_GC_COUNT_MAX 100
+#define WL_GC_TIME_SEC 300
+
+static LIST_HEAD(wakelocks_lru_list);
+static unsigned int wakelocks_gc_count;
+
+static inline void wakelocks_lru_add(struct wakelock *wl)
+{
+ list_add(&wl->lru, &wakelocks_lru_list);
+}
+
+static inline void wakelocks_lru_most_recent(struct wakelock *wl)
+{
+ list_move(&wl->lru, &wakelocks_lru_list);
+}
+
+static void wakelocks_gc(void)
+{
+ struct wakelock *wl, *aux;
+ ktime_t now;
+
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ now = ktime_get();
+ list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
+ u64 idle_time_ns;
+ bool active;
+
+ spin_lock_irq(&wl->ws.lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
+ active = wl->ws.active;
+ spin_unlock_irq(&wl->ws.lock);
+
+ if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
+ break;
+
+ if (!active) {
+ wakeup_source_remove(&wl->ws);
+ rb_erase(&wl->node, &wakelocks_tree);
+ list_del(&wl->lru);
+ kfree(wl->name);
+ kfree(wl);
+ decrement_wakelocks_number();
+ }
+ }
+ wakelocks_gc_count = 0;
+}
+#else /* !CONFIG_PM_WAKELOCKS_GC */
+static inline void wakelocks_lru_add(struct wakelock *wl) {}
+static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
+static inline void wakelocks_gc(void) {}
+#endif /* !CONFIG_PM_WAKELOCKS_GC */
+
+static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
+ bool add_if_not_found)
+{
+ struct rb_node **node = &wakelocks_tree.rb_node;
+ struct rb_node *parent = *node;
+ struct wakelock *wl;
+
+ while (*node) {
+ int diff;
+
+ parent = *node;
+ wl = rb_entry(*node, struct wakelock, node);
+ diff = strncmp(name, wl->name, len);
+ if (diff == 0) {
+ if (wl->name[len])
+ diff = -1;
+ else
+ return wl;
+ }
+ if (diff < 0)
+ node = &(*node)->rb_left;
+ else
+ node = &(*node)->rb_right;
+ }
+ if (!add_if_not_found)
+ return ERR_PTR(-EINVAL);
+
+ if (wakelocks_limit_exceeded())
+ return ERR_PTR(-ENOSPC);
+
+ /* Not found, we have to add a new one. */
+ wl = kzalloc(sizeof(*wl), GFP_KERNEL);
+ if (!wl)
+ return ERR_PTR(-ENOMEM);
+
+ wl->name = kstrndup(name, len, GFP_KERNEL);
+ if (!wl->name) {
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws.name = wl->name;
+ wakeup_source_add(&wl->ws);
+ rb_link_node(&wl->node, parent, node);
+ rb_insert_color(&wl->node, &wakelocks_tree);
+ wakelocks_lru_add(wl);
+ increment_wakelocks_number();
+ return wl;
+}
+
+int pm_wake_lock(const char *buf)
+{
+ const char *str = buf;
+ struct wakelock *wl;
+ u64 timeout_ns = 0;
+ size_t len;
+ int ret = 0;
+
+ while (*str && !isspace(*str))
+ str++;
+
+ len = str - buf;
+ if (!len)
+ return -EINVAL;
+
+ if (*str && *str != '\n') {
+ /* Find out if there's a valid timeout string appended. */
+ ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
+ if (ret)
+ return -EINVAL;
+ }
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, true);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ if (timeout_ns) {
+ u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
+
+ do_div(timeout_ms, NSEC_PER_MSEC);
+ __pm_wakeup_event(&wl->ws, timeout_ms);
+ } else {
+ __pm_stay_awake(&wl->ws);
+ }
+
+ wakelocks_lru_most_recent(wl);
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
+
+int pm_wake_unlock(const char *buf)
+{
+ struct wakelock *wl;
+ size_t len;
+ int ret = 0;
+
+ len = strlen(buf);
+ if (!len)
+ return -EINVAL;
+
+ if (buf[len-1] == '\n')
+ len--;
+
+ if (!len)
+ return -EINVAL;
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, false);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ __pm_relax(&wl->ws);
+
+ wakelocks_lru_most_recent(wl);
+ wakelocks_gc();
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d39..32462d2b364a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/rculist.h>
+#include <linux/poll.h>
#include <asm/uaccess.h>
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
{
}
-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
static int console_locked, console_suspended;
/*
- * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
- * It is also used in interesting ways to provide interlocking in
- * console_unlock();.
- */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-#define LOG_BUF_MASK (log_buf_len-1)
-#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
-
-/*
- * The indices into log_buf are not constrained to log_buf_len - they
- * must be masked before subscripting
- */
-static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
-static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
-static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
-
-/*
* If exclusive_console is non-NULL then only this console is to be printed to.
*/
static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+/*
+ * The printk log buffer consists of a chain of concatenated variable
+ * length records. Every record starts with a record header, containing
+ * the overall length of the record.
+ *
+ * The heads to the first and last entry in the buffer, as well as the
+ * sequence numbers of these both entries are maintained when messages
+ * are stored..
+ *
+ * If the heads indicate available messages, the length in the header
+ * tells the start next message. A length == 0 for the next message
+ * indicates a wrap-around to the beginning of the buffer.
+ *
+ * Every record carries the monotonic timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual
+ * kernel messages use LOG_KERN; userspace-injected messages always carry
+ * a matching syslog facility, by default LOG_USER. The origin of every
+ * message can be reliably determined that way.
+ *
+ * The human readable log message directly follows the message header. The
+ * length of the message text is stored in the header, the stored message
+ * is not terminated.
+ *
+ * Optionally, a message can carry a dictionary of properties (key/value pairs),
+ * to provide userspace with a machine-readable message context.
+ *
+ * Examples for well-defined, commonly used property names are:
+ * DEVICE=b12:8 device identifier
+ * b12:8 block dev_t
+ * c127:3 char dev_t
+ * n8 netdev ifindex
+ * +sound:card0 subsystem:devname
+ * SUBSYSTEM=pci driver-core subsystem name
+ *
+ * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
+ * follows directly after a '=' character. Every property is terminated by
+ * a '\0' character. The last property is not terminated.
+ *
+ * Example of a message structure:
+ * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
+ * 0008 34 00 record is 52 bytes long
+ * 000a 0b 00 text is 11 bytes long
+ * 000c 1f 00 dictionary is 23 bytes long
+ * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
+ * 0010 69 74 27 73 20 61 20 6c "it's a l"
+ * 69 6e 65 "ine"
+ * 001b 44 45 56 49 43 "DEVIC"
+ * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
+ * 52 49 56 45 52 3d 62 75 "RIVER=bu"
+ * 67 "g"
+ * 0032 00 00 00 padding to next message header
+ *
+ * The 'struct log' buffer header must never be directly exported to
+ * userspace, it is a kernel-private implementation detail that might
+ * need to be changed in the future, when the requirements change.
+ *
+ * /dev/kmsg exports the structured data in the following line format:
+ * "level,sequnum,timestamp;<message text>\n"
+ *
+ * The optional key/value pairs are attached as continuation lines starting
+ * with a space character and terminated by a newline. All possible
+ * non-prinatable characters are escaped in the "\xff" notation.
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
+ */
+
+struct log {
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 len; /* length of entire record */
+ u16 text_len; /* length of text buffer */
+ u16 dict_len; /* length of dictionary buffer */
+ u16 level; /* syslog level + facility */
+};
+
+/*
+ * The logbuf_lock protects kmsg buffer, indices, counters. It is also
+ * used in interesting ways to provide interlocking in console_unlock();
+ */
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
+
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
+static u64 syslog_seq;
+static u32 syslog_idx;
+
+/* index and sequence number of the first record stored in the buffer */
+static u64 log_first_seq;
+static u32 log_first_idx;
+
+/* index and sequence number of the next record to store in the buffer */
+static u64 log_next_seq;
#ifdef CONFIG_PRINTK
+static u32 log_next_idx;
+
+/* the next printk record to read after the last 'clear' command */
+static u64 clear_seq;
+static u32 clear_idx;
+
+#define LOG_LINE_MAX 1024
-static char __log_buf[__LOG_BUF_LEN];
+/* record buffer */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define LOG_ALIGN 4
+#else
+#define LOG_ALIGN 8
+#endif
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
-static int log_buf_len = __LOG_BUF_LEN;
-static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
-static int saved_console_loglevel = -1;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int logbuf_cpu = UINT_MAX;
+
+/* human readable text of the record */
+static char *log_text(const struct log *msg)
+{
+ return (char *)msg + sizeof(struct log);
+}
+
+/* optional key/value pair dictionary attached to the record */
+static char *log_dict(const struct log *msg)
+{
+ return (char *)msg + sizeof(struct log) + msg->text_len;
+}
+
+/* get record by index; idx must point to valid msg */
+static struct log *log_from_idx(u32 idx)
+{
+ struct log *msg = (struct log *)(log_buf + idx);
+
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer.
+ */
+ if (!msg->len)
+ return (struct log *)log_buf;
+ return msg;
+}
+
+/* get next record; idx must point to valid msg */
+static u32 log_next(u32 idx)
+{
+ struct log *msg = (struct log *)(log_buf + idx);
+
+ /* length == 0 indicates the end of the buffer; wrap */
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer as *this* one, and
+ * return the one after that.
+ */
+ if (!msg->len) {
+ msg = (struct log *)log_buf;
+ return msg->len;
+ }
+ return idx + msg->len;
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static void log_store(int facility, int level,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct log *msg;
+ u32 size, pad_len;
+
+ /* number of '\0' padding bytes to next message */
+ size = sizeof(struct log) + text_len + dict_len;
+ pad_len = (-size) & (LOG_ALIGN - 1);
+ size += pad_len;
+
+ while (log_first_seq < log_next_seq) {
+ u32 free;
+
+ if (log_next_idx > log_first_idx)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+
+ if (free > size + sizeof(struct log))
+ break;
+
+ /* drop old messages until we have enough contiuous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
+
+ if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+ /*
+ * This message + an additional empty header does not fit
+ * at the end of the buffer. Add an empty header with len == 0
+ * to signify a wrap around.
+ */
+ memset(log_buf + log_next_idx, 0, sizeof(struct log));
+ log_next_idx = 0;
+ }
+
+ /* fill message */
+ msg = (struct log *)(log_buf + log_next_idx);
+ memcpy(log_text(msg), text, text_len);
+ msg->text_len = text_len;
+ memcpy(log_dict(msg), dict, dict_len);
+ msg->dict_len = dict_len;
+ msg->level = (facility << 3) | (level & 7);
+ msg->ts_nsec = local_clock();
+ memset(log_dict(msg) + dict_len, 0, pad_len);
+ msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
+
+ /* insert message */
+ log_next_idx += msg->len;
+ log_next_seq++;
+}
+
+/* /dev/kmsg - userspace message inject/listen interface */
+struct devkmsg_user {
+ u64 seq;
+ u32 idx;
+ struct mutex lock;
+ char buf[8192];
+};
+
+static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ char *buf, *line;
+ int i;
+ int level = default_message_loglevel;
+ int facility = 1; /* LOG_USER */
+ size_t len = iov_length(iv, count);
+ ssize_t ret = len;
+
+ if (len > LOG_LINE_MAX)
+ return -EINVAL;
+ buf = kmalloc(len+1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ line = buf;
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
+ goto out;
+ line += iv[i].iov_len;
+ }
+
+ /*
+ * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
+ * the decimal value represents 32bit, the lower 3 bit are the log
+ * level, the rest are the log facility.
+ *
+ * If no prefix or no userspace facility is specified, we
+ * enforce LOG_USER, to be able to reliably distinguish
+ * kernel-generated messages from userspace-injected ones.
+ */
+ line = buf;
+ if (line[0] == '<') {
+ char *endp = NULL;
+
+ i = simple_strtoul(line+1, &endp, 10);
+ if (endp && endp[0] == '>') {
+ level = i & 7;
+ if (i >> 3)
+ facility = i >> 3;
+ endp++;
+ len -= endp - line;
+ line = endp;
+ }
+ }
+ line[len] = '\0';
+
+ printk_emit(facility, level, NULL, 0, "%s", line);
+out:
+ kfree(buf);
+ return ret;
+}
+
+static ssize_t devkmsg_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct devkmsg_user *user = file->private_data;
+ struct log *msg;
+ u64 ts_usec;
+ size_t i;
+ size_t len;
+ ssize_t ret;
+
+ if (!user)
+ return -EBADF;
+
+ mutex_lock(&user->lock);
+ raw_spin_lock(&logbuf_lock);
+ while (user->seq == log_next_seq) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ raw_spin_unlock(&logbuf_lock);
+ goto out;
+ }
+
+ raw_spin_unlock(&logbuf_lock);
+ ret = wait_event_interruptible(log_wait,
+ user->seq != log_next_seq);
+ if (ret)
+ goto out;
+ raw_spin_lock(&logbuf_lock);
+ }
+
+ if (user->seq < log_first_seq) {
+ /* our last seen message is gone, return error and reset */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ ret = -EPIPE;
+ raw_spin_unlock(&logbuf_lock);
+ goto out;
+ }
+
+ msg = log_from_idx(user->idx);
+ ts_usec = msg->ts_nsec;
+ do_div(ts_usec, 1000);
+ len = sprintf(user->buf, "%u,%llu,%llu;",
+ msg->level, user->seq, ts_usec);
+
+ /* escape non-printable characters */
+ for (i = 0; i < msg->text_len; i++) {
+ unsigned char c = log_text(msg)[i];
+
+ if (c < ' ' || c >= 128)
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ else
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+
+ if (msg->dict_len) {
+ bool line = true;
+
+ for (i = 0; i < msg->dict_len; i++) {
+ unsigned char c = log_dict(msg)[i];
+
+ if (line) {
+ user->buf[len++] = ' ';
+ line = false;
+ }
+
+ if (c == '\0') {
+ user->buf[len++] = '\n';
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 128) {
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ continue;
+ }
+
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+ }
+
+ user->idx = log_next(user->idx);
+ user->seq++;
+ raw_spin_unlock(&logbuf_lock);
+
+ if (len > count) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(buf, user->buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = len;
+out:
+ mutex_unlock(&user->lock);
+ return ret;
+}
+
+static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct devkmsg_user *user = file->private_data;
+ loff_t ret = 0;
+
+ if (!user)
+ return -EBADF;
+ if (offset)
+ return -ESPIPE;
+
+ raw_spin_lock(&logbuf_lock);
+ switch (whence) {
+ case SEEK_SET:
+ /* the first record */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ break;
+ case SEEK_DATA:
+ /*
+ * The first record after the last SYSLOG_ACTION_CLEAR,
+ * like issued by 'dmesg -c'. Reading /dev/kmsg itself
+ * changes no global state, and does not clear anything.
+ */
+ user->idx = clear_idx;
+ user->seq = clear_seq;
+ break;
+ case SEEK_END:
+ /* after the last record */
+ user->idx = log_next_idx;
+ user->seq = log_next_seq;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ raw_spin_unlock(&logbuf_lock);
+ return ret;
+}
+
+static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+{
+ struct devkmsg_user *user = file->private_data;
+ int ret = 0;
+
+ if (!user)
+ return POLLERR|POLLNVAL;
+
+ poll_wait(file, &log_wait, wait);
+
+ raw_spin_lock(&logbuf_lock);
+ if (user->seq < log_next_seq) {
+ /* return error when data has vanished underneath us */
+ if (user->seq < log_first_seq)
+ ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ ret = POLLIN|POLLRDNORM;
+ }
+ raw_spin_unlock(&logbuf_lock);
+
+ return ret;
+}
+
+static int devkmsg_open(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user;
+ int err;
+
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ return 0;
+
+ err = security_syslog(SYSLOG_ACTION_READ_ALL);
+ if (err)
+ return err;
+
+ user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ if (!user)
+ return -ENOMEM;
+
+ mutex_init(&user->lock);
+
+ raw_spin_lock(&logbuf_lock);
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ raw_spin_unlock(&logbuf_lock);
+
+ file->private_data = user;
+ return 0;
+}
+
+static int devkmsg_release(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user = file->private_data;
+
+ if (!user)
+ return 0;
+
+ mutex_destroy(&user->lock);
+ kfree(user);
+ return 0;
+}
+
+const struct file_operations kmsg_fops = {
+ .open = devkmsg_open,
+ .read = devkmsg_read,
+ .aio_write = devkmsg_writev,
+ .llseek = devkmsg_llseek,
+ .poll = devkmsg_poll,
+ .release = devkmsg_release,
+};
#ifdef CONFIG_KEXEC
/*
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
void log_buf_kexec_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_end);
VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(logged_chars);
+ VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(log_next_idx);
}
#endif
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
void __init setup_log_buf(int early)
{
unsigned long flags;
- unsigned start, dest_idx, offset;
char *new_log_buf;
int free;
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_end;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
-
- log_buf[dest_idx] = __log_buf[log_idx_mask];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
+ free = __LOG_BUF_LEN - log_next_idx;
+ memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
return 0;
}
+#if defined(CONFIG_PRINTK_TIME)
+static bool printk_time = 1;
+#else
+static bool printk_time;
+#endif
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
+{
+ size_t len = 0;
+
+ if (syslog) {
+ if (buf) {
+ len += sprintf(buf, "<%u>", msg->level);
+ } else {
+ len += 3;
+ if (msg->level > 9)
+ len++;
+ if (msg->level > 99)
+ len++;
+ }
+ }
+
+ if (printk_time) {
+ if (buf) {
+ unsigned long long ts = msg->ts_nsec;
+ unsigned long rem_nsec = do_div(ts, 1000000000);
+
+ len += sprintf(buf + len, "[%5lu.%06lu] ",
+ (unsigned long) ts, rem_nsec / 1000);
+ } else {
+ len += 15;
+ }
+ }
+
+ return len;
+}
+
+static size_t msg_print_text(const struct log *msg, bool syslog,
+ char *buf, size_t size)
+{
+ const char *text = log_text(msg);
+ size_t text_size = msg->text_len;
+ size_t len = 0;
+
+ do {
+ const char *next = memchr(text, '\n', text_size);
+ size_t text_len;
+
+ if (next) {
+ text_len = next - text;
+ next++;
+ text_size -= next - text;
+ } else {
+ text_len = text_size;
+ }
+
+ if (buf) {
+ if (print_prefix(msg, syslog, NULL) +
+ text_len + 1>= size - len)
+ break;
+
+ len += print_prefix(msg, syslog, buf + len);
+ memcpy(buf + len, text, text_len);
+ len += text_len;
+ buf[len++] = '\n';
+ } else {
+ /* SYSLOG_ACTION_* buffer size only calculation */
+ len += print_prefix(msg, syslog, NULL);
+ len += text_len + 1;
+ }
+
+ text = next;
+ } while (text);
+
+ return len;
+}
+
+static int syslog_print(char __user *buf, int size)
+{
+ char *text;
+ struct log *msg;
+ int len;
+
+ text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ }
+ msg = log_from_idx(syslog_idx);
+ len = msg_print_text(msg, true, text, LOG_LINE_MAX);
+ syslog_idx = log_next(syslog_idx);
+ syslog_seq++;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (len > 0 && copy_to_user(buf, text, len))
+ len = -EFAULT;
+
+ kfree(text);
+ return len;
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ char *text;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (buf) {
+ u64 next_seq;
+ u64 seq;
+ u32 idx;
+
+ if (clear_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = clear_seq;
+ idx = clear_idx;
+ while (seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ len += msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+ seq = clear_seq;
+ idx = clear_idx;
+ while (len > size && seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ len -= msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* last message in this dump */
+ next_seq = log_next_seq;
+
+ len = 0;
+ while (len >= 0 && seq < next_seq) {
+ struct log *msg = log_from_idx(idx);
+ int textlen;
+
+ textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+ idx = log_next(idx);
+ seq++;
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
+ raw_spin_lock_irq(&logbuf_lock);
+
+ if (seq < log_first_seq) {
+ /* messages are gone, move to next one */
+ seq = log_first_seq;
+ idx = log_first_idx;
+ }
+ }
+ }
+
+ if (clear) {
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ kfree(text);
+ return len;
+}
+
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
- unsigned i, j, limit, count;
- int do_clear = 0;
- char c;
+ bool clear = false;
+ static int saved_console_loglevel = -1;
int error;
error = check_syslog_permissions(type, from_file);
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
goto out;
}
error = wait_event_interruptible(log_wait,
- (log_start - log_end));
+ syslog_seq != log_next_seq);
if (error)
goto out;
- i = 0;
- raw_spin_lock_irq(&logbuf_lock);
- while (!error && (log_start != log_end) && i < len) {
- c = LOG_BUF(log_start);
- log_start++;
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,buf);
- buf++;
- i++;
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (!error)
- error = i;
+ error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
- do_clear = 1;
+ clear = true;
/* FALL THRU */
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
error = -EFAULT;
goto out;
}
- count = len;
- if (count > log_buf_len)
- count = log_buf_len;
- raw_spin_lock_irq(&logbuf_lock);
- if (count > logged_chars)
- count = logged_chars;
- if (do_clear)
- logged_chars = 0;
- limit = log_end;
- /*
- * __put_user() could sleep, and while we sleep
- * printk() could overwrite the messages
- * we try to copy to user space. Therefore
- * the messages are copied in reverse. <manfreds>
- */
- for (i = 0; i < count && !error; i++) {
- j = limit-1-i;
- if (j + log_buf_len < log_end)
- break;
- c = LOG_BUF(j);
- raw_spin_unlock_irq(&logbuf_lock);
- error = __put_user(c,&buf[count-1-i]);
- cond_resched();
- raw_spin_lock_irq(&logbuf_lock);
- }
- raw_spin_unlock_irq(&logbuf_lock);
- if (error)
- break;
- error = i;
- if (i != count) {
- int offset = count-error;
- /* buffer overflow during copy, correct user buffer. */
- for (i = 0; i < error; i++) {
- if (__get_user(c,&buf[i+offset]) ||
- __put_user(c,&buf[i])) {
- error = -EFAULT;
- break;
- }
- cond_resched();
- }
- }
+ error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- logged_chars = 0;
- break;
+ syslog_print_all(NULL, 0, true);
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- error = log_end - log_start;
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ }
+ if (from_file) {
+ /*
+ * Short-cut for poll(/"proc/kmsg") which simply checks
+ * for pending data, not the size; return the count of
+ * records, not the length.
+ */
+ error = log_next_idx - syslog_idx;
+ } else {
+ u64 seq;
+ u32 idx;
+
+ error = 0;
+ seq = syslog_seq;
+ idx = syslog_idx;
+ while (seq < log_next_seq) {
+ struct log *msg = log_from_idx(idx);
+
+ error += msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
{
syslog_data[0] = log_buf;
syslog_data[1] = log_buf + log_buf_len;
- syslog_data[2] = log_buf + log_end -
- (logged_chars < log_buf_len ? logged_chars : log_buf_len);
- syslog_data[3] = log_buf + log_end;
+ syslog_data[2] = log_buf + log_first_idx;
+ syslog_data[3] = log_buf + log_next_idx;
}
#endif /* CONFIG_KGDB_KDB */
-/*
- * Call the console drivers on a range of log_buf
- */
-static void __call_console_drivers(unsigned start, unsigned end)
-{
- struct console *con;
-
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if ((con->flags & CON_ENABLED) && con->write &&
- (cpu_online(smp_processor_id()) ||
- (con->flags & CON_ANYTIME)))
- con->write(con, &LOG_BUF(start), end - start);
- }
-}
-
static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
"print all kernel messages to the console.");
/*
- * Write out chars from start to end - 1 inclusive
- */
-static void _call_console_drivers(unsigned start,
- unsigned end, int msg_log_level)
-{
- trace_console(&LOG_BUF(0), start, end, log_buf_len);
-
- if ((msg_log_level < console_loglevel || ignore_loglevel) &&
- console_drivers && start != end) {
- if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
- /* wrapped write */
- __call_console_drivers(start & LOG_BUF_MASK,
- log_buf_len);
- __call_console_drivers(0, end & LOG_BUF_MASK);
- } else {
- __call_console_drivers(start, end);
- }
- }
-}
-
-/*
- * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
- * lower 3 bit are the log level, the rest are the log facility. In case
- * userspace passes usual userspace syslog messages to /dev/kmsg or
- * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
- * to extract the correct log level for in-kernel processing, and not mangle
- * the original value.
- *
- * If a prefix is found, the length of the prefix is returned. If 'level' is
- * passed, it will be filled in with the log level without a possible facility
- * value. If 'special' is passed, the special printk prefix chars are accepted
- * and returned. If no valid header is found, 0 is returned and the passed
- * variables are not touched.
- */
-static size_t log_prefix(const char *p, unsigned int *level, char *special)
-{
- unsigned int lev = 0;
- char sp = '\0';
- size_t len;
-
- if (p[0] != '<' || !p[1])
- return 0;
- if (p[2] == '>') {
- /* usual single digit level number or special char */
- switch (p[1]) {
- case '0' ... '7':
- lev = p[1] - '0';
- break;
- case 'c': /* KERN_CONT */
- case 'd': /* KERN_DEFAULT */
- sp = p[1];
- break;
- default:
- return 0;
- }
- len = 3;
- } else {
- /* multi digit including the level and facility number */
- char *endp = NULL;
-
- lev = (simple_strtoul(&p[1], &endp, 10) & 7);
- if (endp == NULL || endp[0] != '>')
- return 0;
- len = (endp + 1) - p;
- }
-
- /* do not accept special char if not asked for */
- if (sp && !special)
- return 0;
-
- if (special) {
- *special = sp;
- /* return special char, do not touch level */
- if (sp)
- return len;
- }
-
- if (level)
- *level = lev;
- return len;
-}
-
-/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(unsigned start, unsigned end)
+static void call_console_drivers(int level, const char *text, size_t len)
{
- unsigned cur_index, start_print;
- static int msg_level = -1;
+ struct console *con;
- BUG_ON(((int)(start - end)) > 0);
+ trace_console(text, 0, len, len);
- cur_index = start;
- start_print = start;
- while (cur_index != end) {
- if (msg_level < 0 && ((end - cur_index) > 2)) {
- /* strip log prefix */
- cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
- start_print = cur_index;
- }
- while (cur_index != end) {
- char c = LOG_BUF(cur_index);
-
- cur_index++;
- if (c == '\n') {
- if (msg_level < 0) {
- /*
- * printk() has already given us loglevel tags in
- * the buffer. This code is here in case the
- * log buffer has wrapped right round and scribbled
- * on those tags
- */
- msg_level = default_message_loglevel;
- }
- _call_console_drivers(start_print, cur_index, msg_level);
- msg_level = -1;
- start_print = cur_index;
- break;
- }
- }
- }
- _call_console_drivers(start_print, end, msg_level);
-}
+ if (level >= console_loglevel && !ignore_loglevel)
+ return;
+ if (!console_drivers)
+ return;
-static void emit_log_char(char c)
-{
- LOG_BUF(log_end) = c;
- log_end++;
- if (log_end - log_start > log_buf_len)
- log_start = log_end - log_buf_len;
- if (log_end - con_start > log_buf_len)
- con_start = log_end - log_buf_len;
- if (logged_chars < log_buf_len)
- logged_chars++;
+ for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (!con->write)
+ continue;
+ if (!cpu_online(smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ continue;
+ con->write(con, text, len);
+ }
}
/*
@@ -700,16 +1183,6 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time = 0;
-#endif
-module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-
-static bool always_kmsg_dump;
-module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
-
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
return 0;
}
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the output and
- * call the console drivers. If we fail to get the semaphore we place the output
- * into the log buffer and return. The current holder of the console_sem will
- * notice the new output in console_unlock(); and will send it to the
- * consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-
-asmlinkage int printk(const char *fmt, ...)
-{
- va_list args;
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
- va_start(args, fmt);
- r = vprintk(fmt, args);
- va_end(args);
-
- return r;
-}
-
-/* cpu currently holding logbuf_lock */
-static volatile unsigned int printk_cpu = UINT_MAX;
-
/*
* Can we actually use the console at this time on this cpu?
*
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
retval = 0;
}
}
- printk_cpu = UINT_MAX;
+ logbuf_cpu = UINT_MAX;
if (wake)
up(&console_sem);
raw_spin_unlock(&logbuf_lock);
return retval;
}
-static const char recursion_bug_msg [] =
- KERN_CRIT "BUG: recent printk recursion!\n";
-static int recursion_bug;
-static int new_text_line = 1;
-static char printk_buf[1024];
int printk_delay_msec __read_mostly;
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
}
}
-asmlinkage int vprintk(const char *fmt, va_list args)
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
{
- int printed_len = 0;
- int current_log_level = default_message_loglevel;
+ static int recursion_bug;
+ static char cont_buf[LOG_LINE_MAX];
+ static size_t cont_len;
+ static int cont_level;
+ static struct task_struct *cont_task;
+ static char textbuf[LOG_LINE_MAX];
+ char *text = textbuf;
+ size_t text_len;
unsigned long flags;
int this_cpu;
- char *p;
- size_t plen;
- char special;
+ bool newline = false;
+ bool prefix = false;
+ int printed_len = 0;
boot_delay_msec();
printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
/*
* Ouch, printk recursed into itself!
*/
- if (unlikely(printk_cpu == this_cpu)) {
+ if (unlikely(logbuf_cpu == this_cpu)) {
/*
* If a crash is occurring during printk() on this CPU,
* then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
lockdep_off();
raw_spin_lock(&logbuf_lock);
- printk_cpu = this_cpu;
+ logbuf_cpu = this_cpu;
if (recursion_bug) {
+ static const char recursion_msg[] =
+ "BUG: recent printk recursion!";
+
recursion_bug = 0;
- strcpy(printk_buf, recursion_bug_msg);
- printed_len = strlen(recursion_bug_msg);
+ printed_len += strlen(recursion_msg);
+ /* emit KERN_CRIT message */
+ log_store(0, 2, NULL, 0, recursion_msg, printed_len);
}
- /* Emit the output into the temporary buffer */
- printed_len += vscnprintf(printk_buf + printed_len,
- sizeof(printk_buf) - printed_len, fmt, args);
- p = printk_buf;
+ /*
+ * The printf needs to come first; we need the syslog
+ * prefix which might be passed-in as a parameter.
+ */
+ text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
- /* Read log level and handle special printk prefix */
- plen = log_prefix(p, &current_log_level, &special);
- if (plen) {
- p += plen;
+ /* mark and strip a trailing newline */
+ if (text_len && text[text_len-1] == '\n') {
+ text_len--;
+ newline = true;
+ }
- switch (special) {
- case 'c': /* Strip <c> KERN_CONT, continue line */
- plen = 0;
- break;
- case 'd': /* Strip <d> KERN_DEFAULT, start new line */
- plen = 0;
- default:
- if (!new_text_line) {
- emit_log_char('\n');
- new_text_line = 1;
- }
+ /* strip syslog prefix and extract log level or control flags */
+ if (text[0] == '<' && text[1] && text[2] == '>') {
+ switch (text[1]) {
+ case '0' ... '7':
+ if (level == -1)
+ level = text[1] - '0';
+ case 'd': /* KERN_DEFAULT */
+ prefix = true;
+ case 'c': /* KERN_CONT */
+ text += 3;
+ text_len -= 3;
}
}
- /*
- * Copy the output into log_buf. If the caller didn't provide
- * the appropriate log prefix, we insert them here
- */
- for (; *p; p++) {
- if (new_text_line) {
- new_text_line = 0;
-
- if (plen) {
- /* Copy original log prefix */
- int i;
-
- for (i = 0; i < plen; i++)
- emit_log_char(printk_buf[i]);
- printed_len += plen;
- } else {
- /* Add log prefix */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
- printed_len += 3;
- }
+ if (level == -1)
+ level = default_message_loglevel;
- if (printk_time) {
- /* Add the current time stamp */
- char tbuf[50], *tp;
- unsigned tlen;
- unsigned long long t;
- unsigned long nanosec_rem;
-
- t = cpu_clock(printk_cpu);
- nanosec_rem = do_div(t, 1000000000);
- tlen = sprintf(tbuf, "[%5lu.%06lu] ",
- (unsigned long) t,
- nanosec_rem / 1000);
-
- for (tp = tbuf; tp < tbuf + tlen; tp++)
- emit_log_char(*tp);
- printed_len += tlen;
- }
+ if (dict) {
+ prefix = true;
+ newline = true;
+ }
- if (!*p)
- break;
+ if (!newline) {
+ if (cont_len && (prefix || cont_task != current)) {
+ /*
+ * Flush earlier buffer, which is either from a
+ * different thread, or when we got a new prefix.
+ */
+ log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
}
- emit_log_char(*p);
- if (*p == '\n')
- new_text_line = 1;
+ if (!cont_len) {
+ cont_level = level;
+ cont_task = current;
+ }
+
+ /* buffer or append to earlier buffer from the same thread */
+ if (cont_len + text_len > sizeof(cont_buf))
+ text_len = sizeof(cont_buf) - cont_len;
+ memcpy(cont_buf + cont_len, text, text_len);
+ cont_len += text_len;
+ } else {
+ if (cont_len && cont_task == current) {
+ if (prefix) {
+ /*
+ * New prefix from the same thread; flush. We
+ * either got no earlier newline, or we race
+ * with an interrupt.
+ */
+ log_store(facility, cont_level,
+ NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
+ }
+
+ /* append to the earlier buffer and flush */
+ if (cont_len + text_len > sizeof(cont_buf))
+ text_len = sizeof(cont_buf) - cont_len;
+ memcpy(cont_buf + cont_len, text, text_len);
+ cont_len += text_len;
+ log_store(facility, cont_level,
+ NULL, 0, cont_buf, cont_len);
+ cont_len = 0;
+ cont_task = NULL;
+ printed_len = cont_len;
+ } else {
+ /* ordinary single and terminated line */
+ log_store(facility, level,
+ dict, dictlen, text, text_len);
+ printed_len = text_len;
+ }
}
/*
- * Try to acquire and then immediately release the
- * console semaphore. The release will do all the
- * actual magic (print out buffers, wake up klogd,
- * etc).
+ * Try to acquire and then immediately release the console semaphore.
+ * The release will print out buffers and wake up /dev/kmsg and syslog()
+ * users.
*
- * The console_trylock_for_printk() function
- * will release 'logbuf_lock' regardless of whether it
- * actually gets the semaphore or not.
+ * The console_trylock_for_printk() function will release 'logbuf_lock'
+ * regardless of whether it actually gets the console semaphore or not.
*/
if (console_trylock_for_printk(this_cpu))
console_unlock();
@@ -974,16 +1418,81 @@ out_restore_irqs:
return printed_len;
}
-EXPORT_SYMBOL(printk);
-EXPORT_SYMBOL(vprintk);
+EXPORT_SYMBOL(vprintk_emit);
-#else
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, -1, NULL, 0, fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
-static void call_console_drivers(unsigned start, unsigned end)
+asmlinkage int printk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, ...)
{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
+ va_end(args);
+
+ return r;
}
+EXPORT_SYMBOL(printk_emit);
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk(). It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_lock. If we succeed, it's easy - we log the
+ * output and call the console drivers. If we fail to get the semaphore, we
+ * place the output into the log buffer and return. The current holder of
+ * the console_sem will notice the new output in console_unlock(); and will
+ * send it to the consoles before releasing the lock.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+asmlinkage int printk(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ va_start(args, fmt);
+ r = vkdb_printf(fmt, args);
+ va_end(args);
+ return r;
+ }
#endif
+ va_start(args, fmt);
+ r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk);
+
+#else
+
+#define LOG_LINE_MAX 0
+static struct log *log_from_idx(u32 idx) { return NULL; }
+static u32 log_next(u32 idx) { return 0; }
+static void call_console_drivers(int level, const char *text, size_t len) {}
+static size_t msg_print_text(const struct log *msg, bool syslog,
+ char *buf, size_t size) { return 0; }
+
+#endif /* CONFIG_PRINTK */
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
}
/*
- * Delayed printk facility, for scheduler-internal messages:
+ * Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_BUF_SIZE 512
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
}
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+
/**
* console_unlock - unlock the console system
*
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
* by printk(). If this is the case, console_unlock(); emits
* the output prior to releasing the lock.
*
- * If there is output waiting for klogd, we wake it up.
+ * If there is output waiting, we wake /dev/kmsg and syslog() users.
*
* console_unlock(); may be called from any context.
*/
void console_unlock(void)
{
+ static u64 seen_seq;
unsigned long flags;
- unsigned _con_start, _log_end;
- unsigned wake_klogd = 0, retry = 0;
+ bool wake_klogd = false;
+ bool retry;
if (console_suspended) {
up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
console_may_schedule = 0;
again:
- for ( ; ; ) {
+ for (;;) {
+ struct log *msg;
+ static char text[LOG_LINE_MAX];
+ size_t len;
+ int level;
+
raw_spin_lock_irqsave(&logbuf_lock, flags);
- wake_klogd |= log_start - log_end;
- if (con_start == log_end)
- break; /* Nothing to print */
- _con_start = con_start;
- _log_end = log_end;
- con_start = log_end; /* Flush */
+ if (seen_seq != log_next_seq) {
+ wake_klogd = true;
+ seen_seq = log_next_seq;
+ }
+
+ if (console_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ }
+
+ if (console_seq == log_next_seq)
+ break;
+
+ msg = log_from_idx(console_idx);
+ level = msg->level & 7;
+
+ len = msg_print_text(msg, false, text, sizeof(text));
+
+ console_idx = log_next(console_idx);
+ console_seq++;
raw_spin_unlock(&logbuf_lock);
+
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(_con_start, _log_end);
+ call_console_drivers(level, text, len);
start_critical_timings();
local_irq_restore(flags);
}
@@ -1312,8 +1847,7 @@ again:
* flush, no worries.
*/
raw_spin_lock(&logbuf_lock);
- if (con_start != log_end)
- retry = 1;
+ retry = console_seq != log_next_seq;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
* for us.
*/
raw_spin_lock_irqsave(&logbuf_lock, flags);
- con_start = log_start;
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
+static bool always_kmsg_dump;
+module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
+
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
- unsigned long end;
- unsigned chars;
+ u64 idx;
struct kmsg_dumper *dumper;
const char *s1, *s2;
unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* Theoretically, the log could move on after we do this, but
there's not a lot we can do about that. The new messages
will overwrite the start of what we dump. */
+
raw_spin_lock_irqsave(&logbuf_lock, flags);
- end = log_end & LOG_BUF_MASK;
- chars = logged_chars;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (syslog_seq < log_first_seq)
+ idx = syslog_idx;
+ else
+ idx = log_first_idx;
- if (chars > end) {
- s1 = log_buf + log_buf_len - chars + end;
- l1 = chars - end;
+ if (idx > log_next_idx) {
+ s1 = log_buf;
+ l1 = log_next_idx;
- s2 = log_buf;
- l2 = end;
+ s2 = log_buf + idx;
+ l2 = log_buf_len - idx;
} else {
s1 = "";
l1 = 0;
- s2 = log_buf + end - chars;
- l2 = chars;
+ s2 = log_buf + idx;
+ l2 = log_next_idx - idx;
}
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee8d49b9c309..a232bb59d93f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return 0;
rcu_read_lock();
tcred = __task_cred(task);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->uid == tcred->euid &&
- cred->uid == tcred->suid &&
- cred->uid == tcred->uid &&
- cred->gid == tcred->egid &&
- cred->gid == tcred->sgid &&
- cred->gid == tcred->gid))
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user->user_ns, mode))
+ if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
#include "rcu.h"
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (likely(list_empty(&current->rcu_node_entry)))
+ return;
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ __rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
}
-/*
- * Check for a task exiting while in a preemptible -RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(n_barrier_cbs, int, 0444);
+MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
module_param(onoff_interval, int, 0444);
MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
static struct task_struct *onoff_task;
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
static long n_offline_successes;
static long n_online_attempts;
static long n_online_successes;
+static long n_barrier_attempts;
+static long n_barrier_successes;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
+static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
+ void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
.completed = rcu_torture_completed,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
+ .call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.completed = rcu_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
.completed = rcu_no_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
.cb_barrier = rcu_barrier_bh,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_bh,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu_bh_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
return srcu_batches_completed(&srcu_ctl);
}
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+ call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
static void srcu_torture_synchronize(void)
{
synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+ cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
}
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
+ .deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu"
};
+static struct rcu_torture_ops srcu_sync_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .call = NULL,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu_sync"
+};
+
static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
{
return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
.completed = srcu_torture_completed,
- .deferred_free = rcu_sync_torture_deferred_free,
+ .deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu_raw"
};
+static struct rcu_torture_ops srcu_raw_sync_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock_raw,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock_raw,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .call = NULL,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu_raw_sync"
+};
+
static void srcu_torture_synchronize_expedited(void)
{
synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
.completed = srcu_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
.sync = srcu_torture_synchronize_expedited,
+ .call = NULL,
.cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
"rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
"rtmbe: %d rtbke: %ld rtbre: %ld "
"rtbf: %ld rtb: %ld nt: %ld "
- "onoff: %ld/%ld:%ld/%ld",
+ "onoff: %ld/%ld:%ld/%ld "
+ "barrier: %ld/%ld:%ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
n_online_successes,
n_online_attempts,
n_offline_successes,
- n_offline_attempts);
+ n_offline_attempts,
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0)
- cnt += sprintf(&page[cnt], " !!!");
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- if (i > 1) {
+ n_rcu_torture_boost_failure != 0 ||
+ i > 1) {
cnt += sprintf(&page[cnt], "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
/* This must be outside of the mutex, otherwise deadlock! */
kthread_stop(t);
+ boost_tasks[cpu] = NULL;
}
static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
return;
VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
kthread_stop(onoff_task);
+ onoff_task = NULL;
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void
+static int
rcu_torture_onoff_init(void)
{
+ return 0;
}
static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
return;
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
kthread_stop(stall_task);
+ stall_task = NULL;
+}
+
+/* Callback function for RCU barrier testing. */
+void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+ atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+ long myid = (long)arg;
+ struct rcu_head rcu;
+
+ init_rcu_head_on_stack(&rcu);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, 19);
+ do {
+ wait_event(barrier_cbs_wq[myid],
+ atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+ kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP);
+ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ break;
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ if (atomic_dec_and_test(&barrier_cbs_count))
+ wake_up(&barrier_wq);
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(1);
+ cur_ops->cb_barrier();
+ destroy_rcu_head_on_stack(&rcu);
+ return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+ int i;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+ do {
+ atomic_set(&barrier_cbs_invoked, 0);
+ atomic_set(&barrier_cbs_count, n_barrier_cbs);
+ /* wake_up() path contains the required barriers. */
+ for (i = 0; i < n_barrier_cbs; i++)
+ wake_up(&barrier_cbs_wq[i]);
+ wait_event(barrier_wq,
+ atomic_read(&barrier_cbs_count) == 0 ||
+ kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP);
+ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ break;
+ n_barrier_attempts++;
+ cur_ops->cb_barrier();
+ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+ n_rcu_torture_barrier_error++;
+ WARN_ON_ONCE(1);
+ }
+ n_barrier_successes++;
+ schedule_timeout_interruptible(HZ / 10);
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(1);
+ return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+ int i;
+ int ret;
+
+ if (n_barrier_cbs == 0)
+ return 0;
+ if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
+ return 0;
+ }
+ atomic_set(&barrier_cbs_count, 0);
+ atomic_set(&barrier_cbs_invoked, 0);
+ barrier_cbs_tasks =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ GFP_KERNEL);
+ barrier_cbs_wq =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+ GFP_KERNEL);
+ if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+ return -ENOMEM;
+ for (i = 0; i < n_barrier_cbs; i++) {
+ init_waitqueue_head(&barrier_cbs_wq[i]);
+ barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ "rcu_torture_barrier_cbs");
+ if (IS_ERR(barrier_cbs_tasks[i])) {
+ ret = PTR_ERR(barrier_cbs_tasks[i]);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
+ barrier_cbs_tasks[i] = NULL;
+ return ret;
+ }
+ }
+ barrier_task = kthread_run(rcu_torture_barrier, NULL,
+ "rcu_torture_barrier");
+ if (IS_ERR(barrier_task)) {
+ ret = PTR_ERR(barrier_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
+ barrier_task = NULL;
+ }
+ return 0;
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+ int i;
+
+ if (barrier_task != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
+ kthread_stop(barrier_task);
+ barrier_task = NULL;
+ }
+ if (barrier_cbs_tasks != NULL) {
+ for (i = 0; i < n_barrier_cbs; i++) {
+ if (barrier_cbs_tasks[i] != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
+ kthread_stop(barrier_cbs_tasks[i]);
+ barrier_cbs_tasks[i] = NULL;
+ }
+ }
+ kfree(barrier_cbs_tasks);
+ barrier_cbs_tasks = NULL;
+ }
+ if (barrier_cbs_wq != NULL) {
+ kfree(barrier_cbs_wq);
+ barrier_cbs_wq = NULL;
+ }
}
static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
unregister_reboot_notifier(&rcutorture_shutdown_nb);
+ rcu_torture_barrier_cleanup();
rcu_torture_stall_cleanup();
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
kthread_stop(shutdown_task);
}
+ shutdown_task = NULL;
rcu_torture_onoff_cleanup();
/* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup)
cur_ops->cleanup();
- if (atomic_read(&n_rcu_torture_error))
+ if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else if (n_online_successes != n_online_attempts ||
n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
int i;
int cpu;
int firsterr = 0;
+ int retval;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
&rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
+ &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
+ &srcu_raw_sync_ops, &srcu_expedited_ops,
&sched_ops, &sched_sync_ops, &sched_expedited_ops, };
mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
test_boost_duration = 2;
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
- int retval;
boost_starttime = jiffies + test_boost_interval * HZ;
register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
goto unwind;
}
}
- rcu_torture_onoff_init();
+ i = rcu_torture_onoff_init();
+ if (i != 0) {
+ firsterr = i;
+ goto unwind;
+ }
register_reboot_notifier(&rcutorture_shutdown_nb);
- rcu_torture_stall_init();
+ i = rcu_torture_stall_init();
+ if (i != 0) {
+ firsterr = i;
+ goto unwind;
+ }
+ retval = rcu_torture_barrier_init();
+ if (retval != 0) {
+ firsterr = retval;
+ goto unwind;
+ }
rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab18..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+ .orphan_nxttail = &structname##_state.orphan_nxtlist, \
+ .orphan_donetail = &structname##_state.orphan_donelist, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
+/* State information for rcu_barrier() and friends. */
+
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
{
trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to online CPU's callback list.
- * Also record a quiescent state for this CPU for the current grace period.
- * Synchronization and interrupt disabling are not required because
- * this function executes in stop_machine() context. Therefore, cleanup
- * operations that might block must be done later from the CPU_DEAD
- * notifier.
- *
- * Note that the outgoing CPU's bit has already been cleared in the
- * cpu_online_mask. This allows us to randomly pick a callback
- * destination from the bits set in that mask.
+ * Send the specified CPU's RCU callbacks to the orphanage. The
+ * specified CPU must be offline, and the caller must hold the
+ * ->onofflock.
*/
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+ struct rcu_node *rnp, struct rcu_data *rdp)
{
int i;
- unsigned long mask;
- int receive_cpu = cpumask_any(cpu_online_mask);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
- /* First, adjust the counts. */
+ /*
+ * Orphan the callbacks. First adjust the counts. This is safe
+ * because ->onofflock excludes _rcu_barrier()'s adoption of
+ * the callbacks, thus no memory barrier is required.
+ */
if (rdp->nxtlist != NULL) {
- receive_rdp->qlen_lazy += rdp->qlen_lazy;
- receive_rdp->qlen += rdp->qlen;
+ rsp->qlen_lazy += rdp->qlen_lazy;
+ rsp->qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
rdp->qlen = 0;
}
/*
- * Next, move ready-to-invoke callbacks to be invoked on some
- * other CPU. These will not be required to pass through another
- * grace period: They are done, regardless of CPU.
+ * Next, move those callbacks still needing a grace period to
+ * the orphanage, where some other CPU will pick them up.
+ * Some of the callbacks might have gone partway through a grace
+ * period, but that is too bad. They get to start over because we
+ * cannot assume that grace periods are synchronized across CPUs.
+ * We don't bother updating the ->nxttail[] array yet, instead
+ * we just reset the whole thing later on.
*/
- if (rdp->nxtlist != NULL &&
- rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
- struct rcu_head *oldhead;
- struct rcu_head **oldtail;
- struct rcu_head **newtail;
-
- oldhead = rdp->nxtlist;
- oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
- *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
- newtail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
- if (receive_rdp->nxttail[i] == oldtail)
- receive_rdp->nxttail[i] = newtail;
- if (rdp->nxttail[i] == newtail)
- rdp->nxttail[i] = &rdp->nxtlist;
- }
+ if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+ *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+ rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
}
/*
- * Finally, put the rest of the callbacks at the end of the list.
- * The ones that made it partway through get to start over: We
- * cannot assume that grace periods are synchronized across CPUs.
- * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
- * this does not seem compelling. Not yet, anyway.)
+ * Then move the ready-to-invoke callbacks to the orphanage,
+ * where some other CPU will pick them up. These will not be
+ * required to pass though another grace period: They are done.
*/
if (rdp->nxtlist != NULL) {
- *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
- receive_rdp->nxttail[RCU_NEXT_TAIL] =
- rdp->nxttail[RCU_NEXT_TAIL];
- receive_rdp->n_cbs_adopted += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
-
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ *rsp->orphan_donetail = rdp->nxtlist;
+ rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
+ /* Finally, initialize the rcu_data structure's list to empty. */
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage. The caller must hold the ->onofflock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+ int i;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+
/*
- * Record a quiescent state for the dying CPU. This is safe
- * only because we have already cleared out the callbacks.
- * (Otherwise, the RCU core might try to schedule the invocation
- * of callbacks on this now-offline CPU, which would be bad.)
+ * If there is an rcu_barrier() operation in progress, then
+ * only the task doing that operation is permitted to adopt
+ * callbacks. To do otherwise breaks rcu_barrier() and friends
+ * by causing them to fail to wait for the callbacks in the
+ * orphanage.
*/
- mask = rdp->grpmask; /* rnp->grplo is constant. */
+ if (rsp->rcu_barrier_in_progress &&
+ rsp->rcu_barrier_in_progress != current)
+ return;
+
+ /* Do the accounting first. */
+ rdp->qlen_lazy += rsp->qlen_lazy;
+ rdp->qlen += rsp->qlen;
+ rdp->n_cbs_adopted += rsp->qlen;
+ rsp->qlen_lazy = 0;
+ rsp->qlen = 0;
+
+ /*
+ * We do not need a memory barrier here because the only way we
+ * can get here if there is an rcu_barrier() in flight is if
+ * we are the task doing the rcu_barrier().
+ */
+
+ /* First adopt the ready-to-invoke callbacks. */
+ if (rsp->orphan_donelist != NULL) {
+ *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+ for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = rsp->orphan_donetail;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+
+ /* And then adopt the callbacks that still need a grace period. */
+ if (rsp->orphan_nxtlist != NULL) {
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+ RCU_TRACE(unsigned long mask);
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+ RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
"cpuofl");
- rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
- /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
}
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
+ * this fact from process context. Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them, if there is no _rcu_barrier() instance running.
* There can only be one CPU hotplug operation at a time, so no other
* CPU can be attempting to update rcu_cpu_kthread_task.
*/
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
unsigned long mask;
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
rcu_stop_cpu_kthread(cpu);
rcu_node_kthread_setaffinity(rnp, -1);
- /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+ /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+ rcu_adopt_orphan_cbs(rsp);
+
/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_is_callbacks_kthread());
/* Update count, and requeue any remaining callbacks. */
- rdp->qlen_lazy -= count_lazy;
- rdp->qlen -= count;
- rdp->n_cbs_invoked += count;
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
else
break;
}
+ smp_mb(); /* List handling before counting for rcu_barrier(). */
+ rdp->qlen_lazy -= count_lazy;
+ rdp->qlen -= count;
+ rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
rdp->qlen++;
if (lazy)
rdp->qlen_lazy++;
+ else
+ rcu_idle_count_callbacks_posted();
+ smp_mb(); /* Count before adding callback for rcu_barrier(). */
+ *rdp->nxttail[RCU_NEXT_TAIL] = head;
+ rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ *
+ * Of course, sampling num_online_cpus() with preemption enabled can
+ * give erroneous results if there are concurrent CPU-hotplug operations.
+ * For example, given a demonic sequence of preemptions in num_online_cpus()
+ * and CPU-hotplug operations, there could be two or more CPUs online at
+ * all times, but num_online_cpus() might well return one (or even zero).
+ *
+ * However, all such demonic sequences require at least one CPU-offline
+ * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
+ * is only a problem if there is an RCU read-side critical section executing
+ * throughout. But RCU-sched and RCU-bh read-side critical sections
+ * disable either preemption or bh, which prevents a CPU from going offline.
+ * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
+ * that there is only one CPU when in fact there was more than one throughout
+ * is when there were no RCU readers in the system. If there are no
+ * RCU readers, the grace period by definition can be of zero length,
+ * regardless of the number of online CPUs.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ might_sleep(); /* Check for RCU read-side critical section. */
+ return num_online_cpus() <= 1;
+}
+
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
rcu_preempt_cpu_has_callbacks(cpu);
}
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
+/*
+ * RCU callback function for _rcu_barrier(). If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
void (*call_rcu_func)(struct rcu_head *head,
void (*func)(struct rcu_head *head)))
{
- BUG_ON(in_interrupt());
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_head rh;
+
+ init_rcu_head_on_stack(&rh);
+
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rcu_barrier_mutex);
- init_completion(&rcu_barrier_completion);
+
+ smp_mb(); /* Prevent any prior operations from leaking in. */
+
/*
- * Initialize rcu_barrier_cpu_count to 1, then invoke
- * rcu_barrier_func() on each CPU, so that each CPU also has
- * incremented rcu_barrier_cpu_count. Only then is it safe to
- * decrement rcu_barrier_cpu_count -- otherwise the first CPU
- * might complete its grace period before all of the other CPUs
- * did their increment, causing this function to return too
- * early. Note that on_each_cpu() disables irqs, which prevents
- * any CPUs from coming online or going offline until each online
- * CPU has queued its RCU-barrier callback.
+ * Initialize the count to one rather than to zero in order to
+ * avoid a too-soon return to zero in case of a short grace period
+ * (or preemption of this task). Also flag this task as doing
+ * an rcu_barrier(). This will prevent anyone else from adopting
+ * orphaned callbacks, which could cause otherwise failure if a
+ * CPU went offline and quickly came back online. To see this,
+ * consider the following sequence of events:
+ *
+ * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
+ * 2. CPU 1 goes offline, orphaning its callbacks.
+ * 3. CPU 0 adopts CPU 1's orphaned callbacks.
+ * 4. CPU 1 comes back online.
+ * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
+ * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
+ * us -- but before CPU 1's orphaned callbacks are invoked!!!
*/
+ init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 1);
- on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rsp->rcu_barrier_in_progress = current;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+
+ /*
+ * Force every CPU with callbacks to register a new callback
+ * that will tell us when all the preceding callbacks have
+ * been invoked. If an offline CPU has callbacks, wait for
+ * it to either come back online or to finish orphaning those
+ * callbacks.
+ */
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_is_offline(cpu)) {
+ preempt_enable();
+ while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
+ schedule_timeout_interruptible(1);
+ } else if (ACCESS_ONCE(rdp->qlen)) {
+ smp_call_function_single(cpu, rcu_barrier_func,
+ (void *)call_rcu_func, 1);
+ preempt_enable();
+ } else {
+ preempt_enable();
+ }
+ }
+
+ /*
+ * Now that all online CPUs have rcu_barrier_callback() callbacks
+ * posted, we can adopt all of the orphaned callbacks and place
+ * an rcu_barrier_callback() callback after them. When that is done,
+ * we are guaranteed to have an rcu_barrier_callback() callback
+ * following every callback that could possibly have been
+ * registered before _rcu_barrier() was called.
+ */
+ raw_spin_lock_irqsave(&rsp->onofflock, flags);
+ rcu_adopt_orphan_cbs(rsp);
+ rsp->rcu_barrier_in_progress = NULL;
+ raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ atomic_inc(&rcu_barrier_cpu_count);
+ smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
+ call_rcu_func(&rh, rcu_barrier_callback);
+
+ /*
+ * Now that we have an rcu_barrier_callback() callback on each
+ * CPU, and thus each counted, remove the initial count.
+ */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
+
+ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rcu_barrier_completion);
+
+ /* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_barrier_mutex);
+
+ destroy_rcu_head_on_stack(&rh);
}
/**
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
#include <linux/seqlock.h>
/*
- * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
* In theory, it should be possible to add more levels straightforwardly.
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#if CONFIG_RCU_FANOUT > 16
-#define RCU_FANOUT_LEAF 16
-#else /* #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
-#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
-#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
raw_spinlock_t onofflock; /* exclude on/offline and */
/* starting new GP. */
+ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ /* need a grace period. */
+ struct rcu_head **orphan_nxttail; /* Tail of above. */
+ struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ /* are ready to invoke. */
+ struct rcu_head **orphan_donetail; /* Tail of above. */
+ long qlen_lazy; /* Number of lazy callbacks. */
+ long qlen; /* Total number of callbacks. */
+ struct task_struct *rcu_barrier_in_progress;
+ /* Task doing rcu_barrier(), */
+ /* or NULL if no barrier. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
+static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = __this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* means that we continue to block the current grace period.
*/
local_irq_save(flags);
- rcu_preempt_qs(cpu);
+ rcu_preempt_qs(smp_processor_id());
local_irq_restore(flags);
}
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting == 0)
- return;
- t->rcu_read_lock_nesting = 1;
- __rcu_read_unlock();
-}
-
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
- */
-static void rcu_preempt_note_context_switch(int cpu)
-{
-}
-
-/*
* Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
{
}
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+/* Loop counter for rcu_prepare_for_idle(). */
static DEFINE_PER_CPU(int, rcu_dyntick_drain);
+/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
-static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
-static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
+/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
+static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
+/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
+static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
+/* Enable special processing on first attempt to enter dyntick-idle mode. */
+static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
+/* Running count of non-lazy callbacks posted, never decremented. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
+/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
+static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
/*
* Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
*/
int rcu_needs_cpu(int cpu)
{
+ /* Flag a new idle sojourn to the idle-entry state machine. */
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(cpu))
return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
}
/*
+ * Handler for smp_call_function_single(). The only point of this
+ * handler is to wake the CPU up, so the handler does only tracing.
+ */
+void rcu_idle_demigrate(void *unused)
+{
+ trace_rcu_prep_idle("Demigrate");
+}
+
+/*
* Timer handler used to force CPU to start pushing its remaining RCU
* callbacks in the case where it entered dyntick-idle mode with callbacks
* pending. The hander doesn't really need to do anything because the
* real work is done upon re-entry to idle, or by the next scheduling-clock
* interrupt should idle not be re-entered.
+ *
+ * One special case: the timer gets migrated without awakening the CPU
+ * on which the timer was scheduled on. In this case, we must wake up
+ * that CPU. We do so with smp_call_function_single().
*/
-static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+static void rcu_idle_gp_timer_func(unsigned long cpu_in)
{
+ int cpu = (int)cpu_in;
+
trace_rcu_prep_idle("Timer");
- return HRTIMER_NORESTART;
+ if (cpu != smp_processor_id())
+ smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
+ else
+ WARN_ON_ONCE(1); /* Getting here can hang the system... */
}
/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
*/
static void rcu_prepare_for_idle_init(int cpu)
{
- static int firsttime = 1;
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
-
- hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtp->function = rcu_idle_gp_timer_func;
- if (firsttime) {
- unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
-
- rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
- upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
- rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
- firsttime = 0;
- }
+ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+ setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
+ rcu_idle_gp_timer_func, cpu);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
+ per_cpu(rcu_idle_first_pass, cpu) = 1;
}
/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
- hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+ del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
+ trace_rcu_prep_idle("Cleanup after idle");
}
/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
*/
static void rcu_prepare_for_idle(int cpu)
{
+ struct timer_list *tp;
+
+ /*
+ * If this is an idle re-entry, for example, due to use of
+ * RCU_NONIDLE() or the new idle-loop tracing API within the idle
+ * loop, then don't take any state-machine actions, unless the
+ * momentary exit from idle queued additional non-lazy callbacks.
+ * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
+ * pending.
+ */
+ if (!per_cpu(rcu_idle_first_pass, cpu) &&
+ (per_cpu(rcu_nonlazy_posted, cpu) ==
+ per_cpu(rcu_nonlazy_posted_snap, cpu))) {
+ if (rcu_cpu_has_callbacks(cpu)) {
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ }
+ return;
+ }
+ per_cpu(rcu_idle_first_pass, cpu) = 0;
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu) - 1;
+
/*
* If there are no callbacks on this CPU, enter dyntick-idle mode.
* Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
if (rcu_cpu_has_nonlazy_callbacks(cpu))
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_GP_DELAY;
else
- hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
- rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
+ per_cpu(rcu_idle_gp_timer_expires, cpu) =
+ jiffies + RCU_IDLE_LAZY_GP_DELAY;
+ tp = &per_cpu(rcu_idle_gp_timer, cpu);
+ mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+ per_cpu(rcu_nonlazy_posted_snap, cpu) =
+ per_cpu(rcu_nonlazy_posted, cpu);
return; /* Nothing more to do immediately. */
} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
/* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
trace_rcu_prep_idle("Callbacks drained");
}
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU. This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+ __this_cpu_add(rcu_nonlazy_posted, 1);
+}
+
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
- struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+ struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
- sprintf(cp, "drain=%d %c timer=%lld",
+ sprintf(cp, "drain=%d %c timer=%lu",
per_cpu(rcu_dyntick_drain, cpu),
per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
- hrtimer_active(hrtp)
- ? ktime_to_us(hrtimer_get_remaining(hrtp))
- : -1);
+ timer_pending(tltp) ? tltp->expires - jiffies : -1);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->completed, gpnum, rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh);
+ rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b3..bebe2b170d49 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
counter->parent = parent;
}
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
+ bool force)
{
+ int ret = 0;
+
if (counter->usage + val > counter->limit) {
counter->failcnt++;
- return -ENOMEM;
+ ret = -ENOMEM;
+ if (!force)
+ return ret;
}
counter->usage += val;
if (counter->usage > counter->max_usage)
counter->max_usage = counter->usage;
- return 0;
+ return ret;
}
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
+static int __res_counter_charge(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at, bool force)
{
- int ret;
+ int ret, r;
unsigned long flags;
struct res_counter *c, *u;
+ r = ret = 0;
*limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
- ret = res_counter_charge_locked(c, val);
+ r = res_counter_charge_locked(c, val, force);
spin_unlock(&c->lock);
- if (ret < 0) {
+ if (r < 0 && !ret) {
+ ret = r;
*limit_fail_at = c;
- goto undo;
+ if (!force)
+ break;
}
}
- ret = 0;
- goto done;
-undo:
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
+
+ if (ret < 0 && !force) {
+ for (u = counter; u != c; u = u->parent) {
+ spin_lock(&u->lock);
+ res_counter_uncharge_locked(u, val);
+ spin_unlock(&u->lock);
+ }
}
-done:
local_irq_restore(flags);
+
return ret;
}
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at)
+{
+ return __res_counter_charge(counter, val, limit_fail_at, false);
+}
+
int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
struct res_counter **limit_fail_at)
{
- int ret, r;
- unsigned long flags;
- struct res_counter *c;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val);
- if (r)
- c->usage += val;
- spin_unlock(&c->lock);
- if (r < 0 && ret == 0) {
- *limit_fail_at = c;
- ret = r;
- }
- }
- local_irq_restore(flags);
-
- return ret;
+ return __res_counter_charge(counter, val, limit_fail_at, true);
}
+
void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
{
if (WARN_ON(counter->usage < val))
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
-
-
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533a688ce22..39eb6011bc38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
#include "sched.h"
#include "../workqueue_sched.h"
+#include "../smpboot.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-void update_cpu_load(struct rq *this_rq);
-
static void set_load_weight(struct task_struct *p)
{
int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
+ rcu_switch_from(prev);
switch_to(prev, next, prev);
barrier();
@@ -2486,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
* every tick. We fix it up based on jiffies.
*/
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
{
- unsigned long this_load = this_rq->load.weight;
- unsigned long curr_jiffies = jiffies;
- unsigned long pending_updates;
int i, scale;
this_rq->nr_load_updates++;
- /* Avoid repeated calls on same jiffy, when moving in and out of idle */
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
/* Update our load: */
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq)
sched_avg_update(this_rq);
}
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = jiffies;
+ unsigned long load = this_rq->load.weight;
+ unsigned long pending_updates;
+
+ /*
+ * Bloody broken means of dealing with nohz, but better than nothing..
+ * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+ * update and see 0 difference the one time and 2 the next, even though
+ * we ticked at roughtly the same rate.
+ *
+ * Hence we only use this from nohz_idle_balance() and skip this
+ * nonsense when called from the scheduler_tick() since that's
+ * guaranteed a stable rate.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
static void update_cpu_load_active(struct rq *this_rq)
{
- update_cpu_load(this_rq);
+ /*
+ * See the mess in update_idle_cpu_load().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, this_rq->load.weight, 1);
calc_load_account_active(this_rq);
}
@@ -3113,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
+ add_taint(TAINT_WARN);
}
/*
@@ -4042,11 +4070,8 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- if (cred->user->user_ns == pcred->user->user_ns)
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
- else
- match = false;
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
rcu_read_unlock();
return match;
}
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
@@ -5898,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
@@ -6020,6 +5958,7 @@ struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
int flags;
+ int numa_level;
struct sd_data data;
};
@@ -6211,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
}
SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
#ifdef CONFIG_SCHED_SMT
SD_INIT_FUNC(SIBLING)
#endif
@@ -6336,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = {
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+ if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+ return 0;
+
+ return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int level = tl->numa_level;
+ int sd_weight = cpumask_weight(
+ sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+ .cache_nice_tries = 2,
+ .busy_idx = 3,
+ .idle_idx = 2,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 0*SD_BALANCE_EXEC
+ | 0*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 0*SD_WAKE_AFFINE
+ | 0*SD_PREFER_LOCAL
+ | 0*SD_SHARE_CPUPOWER
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 1*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | sd_local_flags(level)
+ ,
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ };
+ SD_INIT_NAME(sd, NUMA);
+ sd->private = &tl->data;
+
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+
+ return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_scale = curr_distance;
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ *
+ * XXX: could be optimized to O(n log n) by using sort()
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_nume_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; default_topology[i].init; i++)
+ tl[i] = default_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .init = sd_numa_init,
+ .mask = sd_numa_mask,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ };
+ }
+
+ sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
@@ -6382,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sg)
return -ENOMEM;
+ sg->next = sg;
+
*per_cpu_ptr(sdd->sg, j) = sg;
sgp = kzalloc_node(sizeof(struct sched_group_power),
@@ -6710,97 +6816,6 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
- get_online_cpus();
-
- /* Destroy domains first to force the rebuild */
- partition_sched_domains(0, NULL, NULL);
-
- rebuild_sched_domains();
- put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
- unsigned int level = 0;
-
- if (sscanf(buf, "%u", &level) != 1)
- return -EINVAL;
-
- /*
- * level is always be positive so don't check for
- * level < POWERSAVINGS_BALANCE_NONE which is 0
- * What happens on 0 or 1 byte write,
- * need to check for count as well?
- */
-
- if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
- return -EINVAL;
-
- if (smt)
- sched_smt_power_savings = level;
- else
- sched_mc_power_savings = level;
-
- reinit_sched_domains();
-
- return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
- sched_smt_power_savings_show,
- sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
- return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6838,6 +6853,8 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ sched_init_numa();
+
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
@@ -7059,6 +7076,7 @@ void __init sched_init(void)
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
#endif
init_sched_fair_class();
@@ -7980,13 +7998,9 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ { } /* terminate */
};
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
@@ -7994,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
+ .base_cftypes = cpu_files,
.early_init = 1,
};
@@ -8180,13 +8194,9 @@ static struct cftype files[] = {
.name = "stat",
.read_map = cpuacct_stats_show,
},
+ { } /* terminate */
};
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
/*
* charge this task's execution time to its accounting group.
*
@@ -8218,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,
.destroy = cpuacct_destroy,
- .populate = cpuacct_populate,
.subsys_id = cpuacct_subsys_id,
+ .base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
- SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, "\ncpu#%d\n", cpu);
#endif
-#define P(x) \
- SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define P(x) \
+do { \
+ if (sizeof(rq->x) == 4) \
+ SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ else \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
+} while (0)
+
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.
*/
- if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+ if (tmp->flags & (SD_PREFER_LOCAL)) {
unsigned long power = 0;
unsigned long nr_running = 0;
unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
- if (tmp->flags & SD_POWERSAVINGS_BALANCE)
- nr_running /= 2;
-
if (nr_running < capacity)
want_sd = 0;
}
@@ -3082,7 +3079,7 @@ struct lb_env {
struct rq *dst_rq;
enum cpu_idle_type idle;
- long load_move;
+ long imbalance;
unsigned int flags;
unsigned int loop;
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to load_move weighted load from busiest to
+ * move_tasks tries to move up to imbalance weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
* Returns 1 if successful and 0 otherwise.
*
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
unsigned long load;
int pulled = 0;
- if (env->load_move <= 0)
+ if (env->imbalance <= 0)
return 0;
while (!list_empty(tasks)) {
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
- if ((load / 2) > env->load_move)
+ if ((load / 2) > env->imbalance)
goto next;
if (!can_migrate_task(p, env))
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
move_task(p, env);
pulled++;
- env->load_move -= load;
+ env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
* We only want to steal up to the prescribed amount of
* weighted load.
*/
- if (env->load_move <= 0)
+ if (env->imbalance <= 0)
break;
continue;
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance; /* Is powersave balance needed for this sd */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-#endif
};
/*
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- sds->power_savings_balance = 0;
- else {
- sds->power_savings_balance = 1;
- sds->min_nr_running = ULONG_MAX;
- sds->leader_nr_running = 0;
- }
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
- if (!sds->power_savings_balance)
- return;
-
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- !sds->this_nr_running))
- sds->power_savings_balance = 0;
-
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!sds->power_savings_balance ||
- sgs->sum_nr_running >= sgs->group_capacity ||
- !sgs->sum_nr_running)
- return;
-
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sgs->sum_nr_running < sds->min_nr_running) ||
- (sgs->sum_nr_running == sds->min_nr_running &&
- group_first_cpu(group) > group_first_cpu(sds->group_min))) {
- sds->group_min = group;
- sds->min_nr_running = sgs->sum_nr_running;
- sds->min_load_per_task = sgs->sum_weighted_load /
- sgs->sum_nr_running;
- }
-
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sgs->sum_nr_running + 1 > sgs->group_capacity)
- return;
-
- if (sgs->sum_nr_running > sds->leader_nr_running ||
- (sgs->sum_nr_running == sds->leader_nr_running &&
- group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
- sds->group_leader = group;
- sds->leader_nr_running = sgs->sum_nr_running;
- }
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- * under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- if (!sds->power_savings_balance)
- return 0;
-
- if (sds->this != sds->group_leader ||
- sds->group_leader == sds->group_min)
- return 0;
-
- *imbalance = sds->min_load_per_task;
- sds->busiest = sds->group_min;
-
- return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
- return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_POWER_SCALE;
@@ -3765,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
* @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
-static inline void update_sg_lb_stats(struct sched_domain *sd,
- struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx,
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
- int i;
+ unsigned long nr_running, max_nr_running, min_nr_running;
+ unsigned long load, max_cpu_load, min_cpu_load;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
+ int i;
if (local_group)
balance_cpu = group_first_cpu(group);
@@ -3791,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
max_cpu_load = 0;
min_cpu_load = ~0UL;
max_nr_running = 0;
+ min_nr_running = ~0UL;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
+ nr_running = rq->nr_running;
+
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load) {
+ if (load > max_cpu_load)
max_cpu_load = load;
- max_nr_running = rq->nr_running;
- }
if (min_cpu_load > load)
min_cpu_load = load;
+
+ if (nr_running > max_nr_running)
+ max_nr_running = nr_running;
+ if (min_nr_running > nr_running)
+ min_nr_running = nr_running;
}
sgs->group_load += load;
- sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_nr_running += nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
@@ -3827,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* to do the newly idle load balance.
*/
if (local_group) {
- if (idle != CPU_NEWLY_IDLE) {
- if (balance_cpu != this_cpu) {
+ if (env->idle != CPU_NEWLY_IDLE) {
+ if (balance_cpu != env->dst_cpu) {
*balance = 0;
return;
}
- update_group_power(sd, this_cpu);
+ update_group_power(env->sd, env->dst_cpu);
} else if (time_after_eq(jiffies, group->sgp->next_update))
- update_group_power(sd, this_cpu);
+ update_group_power(env->sd, env->dst_cpu);
}
/* Adjust by relative CPU power of the group */
@@ -3852,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
+ if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
+ (max_nr_running - min_nr_running) > 1)
sgs->group_imb = 1;
sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
- sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_capacity = fix_small_capacity(env->sd, group);
sgs->group_weight = group->group_weight;
if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3876,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* Determine if @sg is a busier group than the previously selected
* busiest group.
*/
-static bool update_sd_pick_busiest(struct sched_domain *sd,
+static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
- struct sg_lb_stats *sgs,
- int this_cpu)
+ struct sg_lb_stats *sgs)
{
if (sgs->avg_load <= sds->max_load)
return false;
@@ -3896,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- this_cpu < group_first_cpu(sg)) {
+ if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -3917,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env,
+ const struct cpumask *cpus,
+ int *balance, struct sd_lb_stats *sds)
{
- struct sched_domain *child = sd->child;
- struct sched_group *sg = sd->groups;
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
- init_sd_power_savings_stats(sd, sds, idle);
- load_idx = get_sd_load_idx(sd, idle);
+ load_idx = get_sd_load_idx(env->sd, env->idle);
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
- local_group, cpus, balance, &sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group,
+ cpus, balance, &sgs);
if (local_group && !(*balance))
return;
@@ -3966,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
sds->this_idle_cpus = sgs.idle_cpus;
- } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
+ } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3978,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(sg, sds, local_group, &sgs);
sg = sg->next;
- } while (sg != sd->groups);
+ } while (sg != env->sd->groups);
}
/**
@@ -4008,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
* @imbalance: returns amount of imbalanced due to packing.
*/
-static int check_asym_packing(struct sched_domain *sd,
- struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
int busiest_cpu;
- if (!(sd->flags & SD_ASYM_PACKING))
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = group_first_cpu(sds->busiest);
- if (this_cpu > busiest_cpu)
+ if (env->dst_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
- SCHED_POWER_SCALE);
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+
return 1;
}
@@ -4037,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
* @imbalance: Variable to store the imbalance.
*/
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
@@ -4049,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
- } else
+ } else {
sds->this_load_per_task =
- cpu_avg_load_per_task(this_cpu);
+ cpu_avg_load_per_task(env->dst_cpu);
+ }
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_POWER_SCALE;
@@ -4059,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
- *imbalance = sds->busiest_load_per_task;
+ env->imbalance = sds->busiest_load_per_task;
return;
}
@@ -4096,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
/* Move if we gain throughput */
if (pwr_move > pwr_now)
- *imbalance = sds->busiest_load_per_task;
+ env->imbalance = sds->busiest_load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
+ * @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
*/
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
- unsigned long *imbalance)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
@@ -4123,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* its cpu_power, while calculating max_load..)
*/
if (sds->max_load < sds->avg_load) {
- *imbalance = 0;
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
}
if (!sds->group_imb) {
@@ -4152,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->sgp->power,
+ env->imbalance = min(max_pull * sds->busiest->sgp->power,
(sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
@@ -4162,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < sds->busiest_load_per_task)
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ if (env->imbalance < sds->busiest_load_per_task)
+ return fix_small_imbalance(env, sds);
}
@@ -4194,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
@@ -4206,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+ update_sd_lb_stats(env, cpus, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!(*balance))
goto ret;
- if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(sd, &sds, this_cpu, imbalance))
+ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
@@ -4252,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (idle == CPU_IDLE) {
+ if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
@@ -4267,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
- calculate_imbalance(&sds, this_cpu, imbalance);
+ calculate_imbalance(env, &sds);
return sds.busiest;
out_balanced:
- /*
- * There is no obvious imbalance. But check if we can do some balancing
- * to save power.
- */
- if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
- return sds.busiest;
ret:
- *imbalance = 0;
+ env->imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
- enum cpu_idle_type idle, unsigned long imbalance,
- const struct cpumask *cpus)
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -4307,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
unsigned long wl;
if (!capacity)
- capacity = fix_small_capacity(sd, group);
+ capacity = fix_small_capacity(env->sd, group);
if (!cpumask_test_cpu(i, cpus))
continue;
@@ -4319,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu power.
*/
- if (capacity && rq->nr_running == 1 && wl > imbalance)
+ if (capacity && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
@@ -4348,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int idle,
- int busiest_cpu, int this_cpu)
+static int need_active_balance(struct lb_env *env)
{
- if (idle == CPU_NEWLY_IDLE) {
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* higher numbered CPUs in order to pack all tasks in the
* lowest numbered CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
return 1;
-
- /*
- * The only task running in a non-idle cpu can be moved to this
- * cpu in an attempt to completely freeup the other CPU
- * package.
- *
- * The package power saving logic comes from
- * find_busiest_group(). If there are no imbalance, then
- * f_b_g() will return NULL. However when sched_mc={1,2} then
- * f_b_g() will select a group from which a running task may be
- * pulled to this cpu in order to make the other package idle.
- * If there is no opportunity to make a package idle and if
- * there are no imbalance, then f_b_g() will return NULL and no
- * action will be taken in load_balance_newidle().
- *
- * Under normal task pull operation due to imbalance, there
- * will be more than one task in the source run queue and
- * move_tasks() will succeed. ld_moved will be true and this
- * active balance code will not be triggered.
- */
- if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- return 0;
}
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4399,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
{
int ld_moved, active_balance = 0;
struct sched_group *group;
- unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle,
- cpus, balance);
+ group = find_busiest_group(&env, cpus, balance);
if (*balance == 0)
goto out_balanced;
@@ -4428,7 +4243,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(&env, group, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -4436,7 +4251,7 @@ redo:
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[idle], imbalance);
+ schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
@@ -4447,10 +4262,9 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.load_move = imbalance;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
local_irq_save(flags);
@@ -4492,7 +4306,7 @@ more_balance:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+ if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4333,11 @@ more_balance:
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
- if (active_balance)
+ if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ }
/*
* We've kicked active balancing, reset the failure
@@ -4703,104 +4518,15 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu: The cpu whose lowest level of sched domain is to
- * be returned.
- * @flag: The flag to check for the lowest sched_domain
- * for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd)
- if (sd->flags & flag)
- break;
-
- return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu: The cpu whose domains we're iterating over.
- * @sd: variable holding the value of the power_savings_sd
- * for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu: The cpu which is nominating a new idle_load_balancer.
- *
- * Returns: Returns the id of the idle load balancer if it exists,
- * Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
- struct sched_group *ilbg;
- struct sched_domain *sd;
-
- /*
- * Have idle load balancer selection from semi-idle packages only
- * when power-aware load balancing is enabled
- */
- if (!(sched_smt_power_savings || sched_mc_power_savings))
- goto out_done;
-
- /*
- * Optimize for the case when we have no idle CPUs or only one
- * idle CPU. Don't walk the sched_domain hierarchy in such cases
- */
- if (cpumask_weight(nohz.idle_cpus_mask) < 2)
- goto out_done;
-
- rcu_read_lock();
- for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilbg = sd->groups;
-
- do {
- if (ilbg->group_weight !=
- atomic_read(&ilbg->sgp->nr_busy_cpus)) {
- ilb = cpumask_first_and(nohz.idle_cpus_mask,
- sched_group_cpus(ilbg));
- goto unlock;
- }
-
- ilbg = ilbg->next;
-
- } while (ilbg != sd->groups);
- }
-unlock:
- rcu_read_unlock();
-out_done:
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
return nr_cpu_ids;
}
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
- return nr_cpu_ids;
-}
-#endif
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5023,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
- update_cpu_load(this_rq);
+ update_idle_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
- * handled in sched_fair.c)
+ * handled in sched/fair.c)
*/
#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_rt(struct task_struct *p,
const struct cpumask *new_mask)
{
- int weight = cpumask_weight(new_mask);
+ struct rq *rq;
+ int weight;
BUG_ON(!rt_task(p));
- /*
- * Update the migration status of the RQ if we have an RT task
- * which is running AND changing its weight value.
- */
- if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
- struct rq *rq = task_rq(p);
-
- if (!task_current(rq, p)) {
- /*
- * Make sure we dequeue this task from the pushable list
- * before going further. It will either remain off of
- * the list because we are no longer pushable, or it
- * will be requeued.
- */
- if (p->rt.nr_cpus_allowed > 1)
- dequeue_pushable_task(rq, p);
+ if (!p->on_rq)
+ return;
- /*
- * Requeue if our weight is changing and still > 1
- */
- if (weight > 1)
- enqueue_pushable_task(rq, p);
+ weight = cpumask_weight(new_mask);
- }
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
+ return;
- if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
- rq->rt.rt_nr_migratory++;
- } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- }
+ rq = task_rq(p);
- update_rt_migration(&rq->rt);
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_task(rq, p);
+ BUG_ON(!rq->rt.rt_nr_migratory);
+ rq->rt.rt_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_task(rq, p);
+ rq->rt.rt_nr_migratory++;
}
+
+ update_rt_migration(&rq->rt);
}
/* Assumes rq->lock is held */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running, h_nr_running;
+ unsigned int nr_running, h_nr_running;
u64 exec_clock;
u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
- unsigned long rt_nr_running;
+ unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
- unsigned long nr_running;
+ unsigned int nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-extern void update_cpu_load(struct rq *this_rq);
+extern void update_idle_cpu_load(struct rq *this_rq);
#ifdef CONFIG_CGROUP_CPUACCT
#include <linux/cgroup.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..ee376beedaf9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
*/
+#include <linux/atomic.h>
#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insns: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ unsigned short len; /* Instruction count */
+ struct sock_filter insns[];
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+/**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+ * @index: 0 or 1 to return the first or second 32-bits
+ *
+ * This inline exists to hide the length of unsigned long. If a 32-bit
+ * unsigned long is passed in, it will be extended and the top 32-bits will be
+ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
+ * properly returned.
+ *
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static inline u32 get_u32(u64 data, int index)
+{
+ return ((u32 *)&data)[index];
+}
+
+/* Helper for bpf_load below. */
+#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
+/**
+ * bpf_load: checks and returns a pointer to the requested offset
+ * @off: offset into struct seccomp_data to load from
+ *
+ * Returns the requested 32-bits of data.
+ * seccomp_check_filter() should assure that @off is 32-bit aligned
+ * and not out of bounds. Failure to do so is a BUG.
+ */
+u32 seccomp_bpf_load(int off)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ if (off == BPF_DATA(nr))
+ return syscall_get_nr(current, regs);
+ if (off == BPF_DATA(arch))
+ return syscall_get_arch(current, regs);
+ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
+ unsigned long value;
+ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
+ int index = !!(off % sizeof(u64));
+ syscall_get_arguments(current, regs, arg, 1, &value);
+ return get_u32(value, index);
+ }
+ if (off == BPF_DATA(instruction_pointer))
+ return get_u32(KSTK_EIP(current), 0);
+ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
+ return get_u32(KSTK_EIP(current), 1);
+ /* seccomp_check_filter should make this impossible. */
+ BUG();
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_S_LD_W_ABS:
+ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_S_LD_W_LEN:
+ ftest->code = BPF_S_LD_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_S_LDX_W_LEN:
+ ftest->code = BPF_S_LDX_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_S_RET_K:
+ case BPF_S_RET_A:
+ case BPF_S_ALU_ADD_K:
+ case BPF_S_ALU_ADD_X:
+ case BPF_S_ALU_SUB_K:
+ case BPF_S_ALU_SUB_X:
+ case BPF_S_ALU_MUL_K:
+ case BPF_S_ALU_MUL_X:
+ case BPF_S_ALU_DIV_X:
+ case BPF_S_ALU_AND_K:
+ case BPF_S_ALU_AND_X:
+ case BPF_S_ALU_OR_K:
+ case BPF_S_ALU_OR_X:
+ case BPF_S_ALU_LSH_K:
+ case BPF_S_ALU_LSH_X:
+ case BPF_S_ALU_RSH_K:
+ case BPF_S_ALU_RSH_X:
+ case BPF_S_ALU_NEG:
+ case BPF_S_LD_IMM:
+ case BPF_S_LDX_IMM:
+ case BPF_S_MISC_TAX:
+ case BPF_S_MISC_TXA:
+ case BPF_S_ALU_DIV_K:
+ case BPF_S_LD_MEM:
+ case BPF_S_LDX_MEM:
+ case BPF_S_ST:
+ case BPF_S_STX:
+ case BPF_S_JMP_JA:
+ case BPF_S_JMP_JEQ_K:
+ case BPF_S_JMP_JEQ_X:
+ case BPF_S_JMP_JGE_K:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JSET_K:
+ case BPF_S_JMP_JSET_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+ struct seccomp_filter *f;
+ u32 ret = SECCOMP_RET_ALLOW;
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (WARN_ON(current->seccomp.filter == NULL))
+ return SECCOMP_RET_KILL;
+
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority (ignoring the DATA).
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+ u32 cur_ret = sk_run_filter(NULL, f->insns);
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ ret = cur_ret;
+ }
+ return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+ unsigned long total_insns = fprog->len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ for (filter = current->seccomp.filter; filter; filter = filter->prev)
+ total_insns += filter->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * Installing a seccomp filter requires that the task have
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!current->no_new_privs &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return -EACCES;
+
+ /* Allocate a new seccomp_filter */
+ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ return -ENOMEM;
+ atomic_set(&filter->usage, 1);
+ filter->len = fprog->len;
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(filter->insns, fprog->filter, fp_size))
+ goto fail;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = sk_chk_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ return 0;
+fail:
+ kfree(filter);
+ return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+long seccomp_attach_user_filter(char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ ret = seccomp_attach_filter(&fprog);
+out:
+ return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ kfree(freeme);
+ }
+}
+
+/**
+ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+static void seccomp_send_sigsys(int syscall, int reason)
+{
+ struct siginfo info;
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch(current, task_pt_regs(current));
+ info.si_syscall = syscall;
+ force_sig_info(SIGSYS, &info, current);
+}
+#endif /* CONFIG_SECCOMP_FILTER */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
};
#endif
-void __secure_computing(int this_syscall)
+int __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
- int * syscall;
+ int exit_sig = 0;
+ int *syscall;
+ u32 ret;
switch (mode) {
- case 1:
+ case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
#endif
do {
if (*syscall == this_syscall)
- return;
+ return 0;
} while (*++syscall);
+ exit_sig = SIGKILL;
+ ret = SECCOMP_RET_KILL;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER: {
+ int data;
+ ret = seccomp_run_filters(this_syscall);
+ data = ret & SECCOMP_RET_DATA;
+ ret &= SECCOMP_RET_ACTION;
+ switch (ret) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+ case SECCOMP_RET_TRACE:
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+ goto skip;
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ break;
+ return 0;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+ default:
+ break;
+ }
+ exit_sig = SIGSYS;
break;
+ }
+#endif
default:
BUG();
}
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall);
- do_exit(SIGKILL);
+ audit_seccomp(this_syscall, exit_sig, ret);
+ do_exit(exit_sig);
+#ifdef CONFIG_SECCOMP_FILTER
+skip:
+ audit_seccomp(this_syscall, exit_sig, ret);
+#endif
+ return -1;
}
long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
return current->seccomp.mode;
}
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters. Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- long ret;
+ long ret = -EINVAL;
- /* can set it only once to be even more secure */
- ret = -EPERM;
- if (unlikely(current->seccomp.mode))
+ if (current->seccomp.mode &&
+ current->seccomp.mode != seccomp_mode)
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ ret = 0;
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ ret = seccomp_attach_user_filter(filter);
+ if (ret)
+ goto out;
+ break;
+#endif
+ default:
+ goto out;
}
- out:
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+out:
return ret;
}
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 60636a4e25c3..4567fc020fe3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
* down_trylock - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
- * Try to acquire the semaphore atomically. Returns 0 if the mutex has
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
* been acquired successfully or 1 if it it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..f7b418217633 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -29,6 +29,7 @@
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
+#include <linux/uprobes.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -160,7 +161,7 @@ void recalc_sigpending(void)
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
- sigmask(SIGTRAP) | sigmask(SIGFPE))
+ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
int next_signal(struct sigpending *pending, sigset_t *mask)
{
@@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t)
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->euid == tcred->suid ||
- cred->euid == tcred->uid ||
- cred->uid == tcred->suid ||
- cred->uid == tcred->uid))
+ if (uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid))
return 1;
- if (ns_capable(tcred->user->user_ns, CAP_KILL))
+ if (ns_capable(tcred->user_ns, CAP_KILL))
return 1;
return 0;
@@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-/*
- * map the uid in struct cred into user namespace *ns
- */
-static inline uid_t map_cred_ns(const struct cred *cred,
- struct user_namespace *ns)
-{
- return user_ns_map_uid(ns, cred, cred->uid);
-}
-
#ifdef CONFIG_USER_NS
static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
{
@@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
if (SI_FROMKERNEL(info))
return;
- info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
- current_cred(), info->si_uid);
+ rcu_read_lock();
+ info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+ make_kuid(current_user_ns(), info->si_uid));
+ rcu_read_unlock();
}
#else
static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_code = SI_USER;
q->info.si_pid = task_tgid_nr_ns(current,
task_active_pid_ns(t));
- q->info.si_uid = current_uid();
+ q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
break;
case (unsigned long) SEND_SIG_PRIV:
q->info.si_signo = sig;
@@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred,
struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
- if (cred->user_ns != pcred->user_ns)
- return 0;
- if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
- cred->uid != pcred->suid && cred->uid != pcred->uid)
+ if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
+ !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
return 0;
return 1;
}
@@ -1678,8 +1669,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
- info.si_uid = map_cred_ns(__task_cred(tsk),
- task_cred_xxx(tsk->parent, user_ns));
+ info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
+ task_uid(tsk));
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1762,8 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
- info.si_uid = map_cred_ns(__task_cred(tsk),
- task_cred_xxx(parent, user_ns));
+ info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1973,7 +1963,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
ptrace_stop(exit_code, why, 1, &info);
@@ -2181,8 +2171,8 @@ static int ptrace_signal(int signr, siginfo_t *info,
info->si_code = SI_USER;
rcu_read_lock();
info->si_pid = task_pid_vnr(current->parent);
- info->si_uid = map_cred_ns(__task_cred(current->parent),
- current_user_ns());
+ info->si_uid = from_kuid_munged(current_user_ns(),
+ task_uid(current->parent));
rcu_read_unlock();
}
@@ -2202,6 +2192,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct signal_struct *signal = current->signal;
int signr;
+ if (unlikely(uprobe_deny_signal()))
+ return 0;
+
relock:
/*
* We'll jump back here after any time we were stopped in TASK_STOPPED.
@@ -2706,6 +2699,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
+#ifdef __ARCH_SIGSYS
+ case __SI_SYS:
+ err |= __put_user(from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
+#endif
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
@@ -2828,7 +2828,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
info.si_errno = 0;
info.si_code = SI_USER;
info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return kill_something_info(sig, &info, pid);
}
@@ -2871,7 +2871,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
info.si_errno = 0;
info.si_code = SI_TKILL;
info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return do_send_specific(tgid, pid, sig, &info);
}
@@ -3236,6 +3236,21 @@ SYSCALL_DEFINE0(pause)
#endif
+#ifdef HAVE_SET_RESTORE_SIGMASK
+int sigsuspend(sigset_t *set)
+{
+ sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(set);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+ return -ERESTARTNOHAND;
+}
+#endif
+
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3253,15 +3268,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
if (copy_from_user(&newset, unewset, sizeof(newset)))
return -EFAULT;
- sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
- current->saved_sigmask = current->blocked;
- set_current_blocked(&newset);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
+ return sigsuspend(&newset);
}
#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf759..d0ae5b24875e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
#include <linux/smp.h>
#include <linux/cpu.h>
+#include "smpboot.h"
+
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
static struct {
struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
{
unsigned int cpu;
+ idle_threads_init();
+
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
}
EXPORT_SYMBOL(on_each_cpu_cond);
+
+static void do_nothing(void *unused)
+{
+}
+
+/**
+ * kick_all_cpus_sync - Force all cpus out of idle
+ *
+ * Used to synchronize the update of pm_idle function pointer. It's
+ * called after the pointer is updated and returns after the dummy
+ * callback function has been executed on all cpus. The execution of
+ * the function can only happen on the remote cpus after they have
+ * left the idle function which had been called via pm_idle function
+ * pointer. So it's guaranteed that nothing uses the previous pointer
+ * anymore.
+ */
+void kick_all_cpus_sync(void)
+{
+ /* Make sure the change is visible before we kick the cpus */
+ smp_mb();
+ smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 000000000000..e1a797e028a3
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,62 @@
+/*
+ * Common SMP CPU bringup/teardown functions
+ */
+#include <linux/err.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include "smpboot.h"
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+/*
+ * For the hotplug case we keep the task structs around and reuse
+ * them.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+
+struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk)
+ return ERR_PTR(-ENOMEM);
+ init_idle(tsk, cpu);
+ return tsk;
+}
+
+void __init idle_thread_set_boot_cpu(void)
+{
+ per_cpu(idle_threads, smp_processor_id()) = current;
+}
+
+static inline void idle_init(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk) {
+ tsk = fork_idle(cpu);
+ if (IS_ERR(tsk))
+ pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
+ else
+ per_cpu(idle_threads, cpu) = tsk;
+ }
+}
+
+/**
+ * idle_thread_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
+void __init idle_threads_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != smp_processor_id())
+ idle_init(cpu);
+ }
+}
+#endif
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 000000000000..80c0acfb8472
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
+#ifndef SMPBOOT_H
+#define SMPBOOT_H
+
+struct task_struct;
+
+int smpboot_prepare(unsigned int cpu);
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+struct task_struct *idle_thread_get(unsigned int cpu);
+void idle_thread_set_boot_cpu(void);
+void idle_threads_init(void);
+#else
+static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
+static inline void idle_thread_set_boot_cpu(void) { }
+static inline void idle_threads_init(void) { }
+#endif
+
+#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
#include <linux/delay.h>
#include <linux/srcu.h>
+/*
+ * Initialize an rcu_batch structure to empty.
+ */
+static inline void rcu_batch_init(struct rcu_batch *b)
+{
+ b->head = NULL;
+ b->tail = &b->head;
+}
+
+/*
+ * Enqueue a callback onto the tail of the specified rcu_batch structure.
+ */
+static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
+{
+ *b->tail = head;
+ b->tail = &head->next;
+}
+
+/*
+ * Is the specified rcu_batch structure empty?
+ */
+static inline bool rcu_batch_empty(struct rcu_batch *b)
+{
+ return b->tail == &b->head;
+}
+
+/*
+ * Remove the callback at the head of the specified rcu_batch structure
+ * and return a pointer to it, or return NULL if the structure is empty.
+ */
+static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
+{
+ struct rcu_head *head;
+
+ if (rcu_batch_empty(b))
+ return NULL;
+
+ head = b->head;
+ b->head = head->next;
+ if (b->tail == &head->next)
+ rcu_batch_init(b);
+
+ return head;
+}
+
+/*
+ * Move all callbacks from the rcu_batch structure specified by "from" to
+ * the structure specified by "to".
+ */
+static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
+{
+ if (!rcu_batch_empty(from)) {
+ *to->tail = from->head;
+ to->tail = from->tail;
+ rcu_batch_init(from);
+ }
+}
+
+/* single-thread state-machine */
+static void process_srcu(struct work_struct *work);
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->completed = 0;
- mutex_init(&sp->mutex);
+ spin_lock_init(&sp->queue_lock);
+ sp->running = false;
+ rcu_batch_init(&sp->batch_queue);
+ rcu_batch_init(&sp->batch_check0);
+ rcu_batch_init(&sp->batch_check1);
+ rcu_batch_init(&sp->batch_done);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
return sp->per_cpu_ref ? 0 : -ENOMEM;
}
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
*/
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate number of readers active on the specified rank
+ * of the per-CPU ->c[] counters.
+ */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
{
int cpu;
- int sum;
+ unsigned long sum = 0;
+ unsigned long t;
- sum = 0;
- for_each_possible_cpu(cpu)
- sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ sum += t;
+ }
return sum;
}
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero. An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement. This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long seq;
+
+ seq = srcu_readers_seq_idx(sp, idx);
+
+ /*
+ * The following smp_mb() A pairs with the smp_mb() B located in
+ * __srcu_read_lock(). This pairing ensures that if an
+ * __srcu_read_lock() increments its counter after the summation
+ * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+ * critical section will see any changes made prior to the start
+ * of the current SRCU grace period.
+ *
+ * Also, if the above call to srcu_readers_seq_idx() saw the
+ * increment of ->seq[], then the call to srcu_readers_active_idx()
+ * must see the increment of ->c[].
+ */
+ smp_mb(); /* A */
+
+ /*
+ * Note that srcu_readers_active_idx() can incorrectly return
+ * zero even though there is a pre-existing reader throughout.
+ * To see this, suppose that task A is in a very long SRCU
+ * read-side critical section that started on CPU 0, and that
+ * no other reader exists, so that the sum of the counters
+ * is equal to one. Then suppose that task B starts executing
+ * srcu_readers_active_idx(), summing up to CPU 1, and then that
+ * task C starts reading on CPU 0, so that its increment is not
+ * summed, but finishes reading on CPU 2, so that its decrement
+ * -is- summed. Then when task B completes its sum, it will
+ * incorrectly get zero, despite the fact that task A has been
+ * in its SRCU read-side critical section the whole time.
+ *
+ * We therefore do a validation step should srcu_readers_active_idx()
+ * return zero.
+ */
+ if (srcu_readers_active_idx(sp, idx) != 0)
+ return false;
+
+ /*
+ * The remainder of this function is the validation step.
+ * The following smp_mb() D pairs with the smp_mb() C in
+ * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
+ * by srcu_readers_active_idx() above, then any destructive
+ * operation performed after the grace period will happen after
+ * the corresponding SRCU read-side critical section.
+ *
+ * Note that there can be at most NR_CPUS worth of readers using
+ * the old index, which is not enough to overflow even a 32-bit
+ * integer. (Yes, this does mean that systems having more than
+ * a billion or so CPUs need to be 64-bit systems.) Therefore,
+ * the sum of the ->seq[] counters cannot possibly overflow.
+ * Therefore, the only way that the return values of the two
+ * calls to srcu_readers_seq_idx() can be equal is if there were
+ * no increments of the corresponding rank of ->seq[] counts
+ * in the interim. But the missed-increment scenario laid out
+ * above includes an increment of the ->seq[] counter by
+ * the corresponding __srcu_read_lock(). Therefore, if this
+ * scenario occurs, the return values from the two calls to
+ * srcu_readers_seq_idx() will differ, and thus the validation
+ * step below suffices.
+ */
+ smp_mb(); /* D */
+
+ return srcu_readers_seq_idx(sp, idx) == seq;
+}
+
/**
* srcu_readers_active - returns approximate number of readers.
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
*/
static int srcu_readers_active(struct srcu_struct *sp)
{
- return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ }
+ return sum;
}
/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
preempt_disable();
- idx = sp->completed & 0x1;
- barrier(); /* ensure compiler looks -once- at sp->completed. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ idx = rcu_dereference_index_check(sp->completed,
+ rcu_read_lock_sched_held()) & 0x1;
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
preempt_enable();
return idx;
}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
preempt_disable();
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
* we repeatedly block for 1-millisecond time periods. This approach
* has done well in testing, so there is no need for a config parameter.
*/
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SRCU_RETRY_CHECK_DELAY 5
+#define SYNCHRONIZE_SRCU_TRYCOUNT 2
+#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ * @@@ Wait until all pre-existing readers complete. Such readers
+ * will have used the index specified by "idx".
+ * the caller should ensures the ->completed is not changed while checking
+ * and idx = (->completed & 1) ^ 1
*/
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
{
- int idx;
-
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
-
- idx = sp->completed;
- mutex_lock(&sp->mutex);
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
- /*
- * Check to see if someone else did the work for us while we were
- * waiting to acquire the lock. We need -two- advances of
- * the counter, not just one. If there was but one, we might have
- * shown up -after- our helper's first synchronize_sched(), thus
- * having failed to prevent CPU-reordering races with concurrent
- * srcu_read_unlock()s on other CPUs (see comment below). So we
- * either (1) wait for two or (2) supply the second ourselves.
- */
+/*
+ * Increment the ->completed counter so that future SRCU readers will
+ * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ sp->completed++;
+}
- if ((sp->completed - idx) >= 2) {
- mutex_unlock(&sp->mutex);
- return;
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ unsigned long flags;
+
+ head->next = NULL;
+ head->func = func;
+ spin_lock_irqsave(&sp->queue_lock, flags);
+ rcu_batch_queue(&sp->batch_queue, head);
+ if (!sp->running) {
+ sp->running = true;
+ queue_delayed_work(system_nrt_wq, &sp->work, 0);
}
+ spin_unlock_irqrestore(&sp->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
- sync_func(); /* Force memory barrier on all CPUs. */
+struct rcu_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
- /*
- * The preceding synchronize_sched() ensures that any CPU that
- * sees the new value of sp->completed will also see any preceding
- * changes to data structures made by this CPU. This prevents
- * some other CPU from reordering the accesses in its SRCU
- * read-side critical section to precede the corresponding
- * srcu_read_lock() -- ensuring that such references will in
- * fact be protected.
- *
- * So it is now safe to do the flip.
- */
+/*
+ * Awaken the corresponding synchronize_srcu() instance now that a
+ * grace period has elapsed.
+ */
+static void wakeme_after_rcu(struct rcu_head *head)
+{
+ struct rcu_synchronize *rcu;
- idx = sp->completed & 0x1;
- sp->completed++;
+ rcu = container_of(head, struct rcu_synchronize, head);
+ complete(&rcu->completion);
+}
- sync_func(); /* Force memory barrier on all CPUs. */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
+static void srcu_reschedule(struct srcu_struct *sp);
- /*
- * At this point, because of the preceding synchronize_sched(),
- * all srcu_read_lock() calls using the old counters have completed.
- * Their corresponding critical sections might well be still
- * executing, but the srcu_read_lock() primitives themselves
- * will have finished executing. We initially give readers
- * an arbitrarily chosen 10 microseconds to get out of their
- * SRCU read-side critical sections, then loop waiting 1/HZ
- * seconds per iteration. The 10-microsecond value has done
- * very well in testing.
- */
-
- if (srcu_readers_active_idx(sp, idx))
- udelay(SYNCHRONIZE_SRCU_READER_DELAY);
- while (srcu_readers_active_idx(sp, idx))
- schedule_timeout_interruptible(1);
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
+{
+ struct rcu_synchronize rcu;
+ struct rcu_head *head = &rcu.head;
+ bool done = false;
- sync_func(); /* Force memory barrier on all CPUs. */
+ rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
- /*
- * The preceding synchronize_sched() forces all srcu_read_unlock()
- * primitives that were executing concurrently with the preceding
- * for_each_possible_cpu() loop to have completed by this point.
- * More importantly, it also forces the corresponding SRCU read-side
- * critical sections to have also completed, and the corresponding
- * references to SRCU-protected data items to be dropped.
- *
- * Note:
- *
- * Despite what you might think at first glance, the
- * preceding synchronize_sched() -must- be within the
- * critical section ended by the following mutex_unlock().
- * Otherwise, a task taking the early exit can race
- * with a srcu_read_unlock(), which might have executed
- * just before the preceding srcu_readers_active() check,
- * and whose CPU might have reordered the srcu_read_unlock()
- * with the preceding critical section. In this case, there
- * is nothing preventing the synchronize_sched() task that is
- * taking the early exit from freeing a data structure that
- * is still being referenced (out of order) by the task
- * doing the srcu_read_unlock().
- *
- * Alternatively, the comparison with "2" on the early exit
- * could be changed to "3", but this increases synchronize_srcu()
- * latency for bulk loads. So the current code is preferred.
- */
+ init_completion(&rcu.completion);
+
+ head->next = NULL;
+ head->func = wakeme_after_rcu;
+ spin_lock_irq(&sp->queue_lock);
+ if (!sp->running) {
+ /* steal the processing owner */
+ sp->running = true;
+ rcu_batch_queue(&sp->batch_check0, head);
+ spin_unlock_irq(&sp->queue_lock);
+
+ srcu_advance_batches(sp, trycount);
+ if (!rcu_batch_empty(&sp->batch_done)) {
+ BUG_ON(sp->batch_done.head != head);
+ rcu_batch_dequeue(&sp->batch_done);
+ done = true;
+ }
+ /* give the processing owner to work_struct */
+ srcu_reschedule(sp);
+ } else {
+ rcu_batch_queue(&sp->batch_queue, head);
+ spin_unlock_irq(&sp->queue_lock);
+ }
- mutex_unlock(&sp->mutex);
+ if (!done)
+ wait_for_completion(&rcu.completion);
}
/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched);
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
* synchronize_srcu_expedited - Brute-force SRCU grace period
* @sp: srcu_struct with which to synchronize.
*
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
*
* Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock. It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier. It is also illegal to call
* synchronize_srcu_expedited() from the corresponding SRCU read-side
* critical section; doing so will result in deadlock. However, it is
* perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched_expedited);
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
* srcu_batches_completed - return batches completed.
* @sp: srcu_struct on which to report batch completion.
*
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-
long srcu_batches_completed(struct srcu_struct *sp)
{
return sp->completed;
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+#define SRCU_CALLBACK_BATCH 10
+#define SRCU_INTERVAL 1
+
+/*
+ * Move any new SRCU callbacks to the first stage of the SRCU grace
+ * period pipeline.
+ */
+static void srcu_collect_new(struct srcu_struct *sp)
+{
+ if (!rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+}
+
+/*
+ * Core SRCU state machine. Advance callbacks from ->batch_check0 to
+ * ->batch_check1 and then to ->batch_done as readers drain.
+ */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
+{
+ int idx = 1 ^ (sp->completed & 1);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->completed for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ */
+
+ if (rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_check1))
+ return; /* no callbacks need to be advanced */
+
+ if (!try_check_zero(sp, idx, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have already done with their
+ * first zero check and flip back when they were enqueued on
+ * ->batch_check0 in a previous invocation of srcu_advance_batches().
+ * (Presumably try_check_zero() returned false during that
+ * invocation, leaving the callbacks stranded on ->batch_check1.)
+ * They are therefore ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+
+ if (rcu_batch_empty(&sp->batch_check0))
+ return; /* no callbacks need to be advanced */
+ srcu_flip(sp);
+
+ /*
+ * The callbacks in ->batch_check0 just finished their
+ * first check zero and flip, so move them to ->batch_check1
+ * for future checking on the other idx.
+ */
+ rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
+
+ /*
+ * SRCU read-side critical sections are normally short, so check
+ * at least twice in quick succession after a flip.
+ */
+ trycount = trycount < 2 ? 2 : trycount;
+ if (!try_check_zero(sp, idx^1, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have now waited for all
+ * pre-existing readers using both idx values. They are therefore
+ * ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue.
+ */
+static void srcu_invoke_callbacks(struct srcu_struct *sp)
+{
+ int i;
+ struct rcu_head *head;
+
+ for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
+ head = rcu_batch_dequeue(&sp->batch_done);
+ if (!head)
+ break;
+ local_bh_disable();
+ head->func(head);
+ local_bh_enable();
+ }
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp)
+{
+ bool pending = true;
+
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ sp->running = false;
+ pending = false;
+ }
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (pending)
+ queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+static void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_collect_new(sp);
+ srcu_advance_batches(sp, 1);
+ srcu_invoke_callbacks(sp);
+ srcu_reschedule(sp);
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..6df42624e454 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -93,10 +93,8 @@
int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;
-#ifdef CONFIG_UID16
EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);
-#endif
/*
* the same as above, but for filesystems which can only store a 16-bit
@@ -133,11 +131,10 @@ static bool set_one_prio_perm(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred = __task_cred(p);
- if (pcred->user->user_ns == cred->user->user_ns &&
- (pcred->uid == cred->euid ||
- pcred->euid == cred->euid))
+ if (uid_eq(pcred->uid, cred->euid) ||
+ uid_eq(pcred->euid, cred->euid))
return true;
- if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+ if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
return true;
return false;
}
@@ -177,6 +174,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
const struct cred *cred = current_cred();
int error = -EINVAL;
struct pid *pgrp;
+ kuid_t uid;
if (which > PRIO_USER || which < PRIO_PROCESS)
goto out;
@@ -209,18 +207,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = (struct user_struct *) cred->user;
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
if (!who)
- who = cred->uid;
- else if ((who != cred->uid) &&
- !(user = find_user(who)))
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid) &&
+ !(user = find_user(uid)))
goto out_unlock; /* No processes for this user */
do_each_thread(g, p) {
- if (__task_cred(p)->uid == who)
+ if (uid_eq(task_uid(p), uid))
error = set_one_prio(p, niceval, error);
} while_each_thread(g, p);
- if (who != cred->uid)
+ if (!uid_eq(uid, cred->uid))
free_uid(user); /* For find_user() */
break;
}
@@ -244,6 +243,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
const struct cred *cred = current_cred();
long niceval, retval = -ESRCH;
struct pid *pgrp;
+ kuid_t uid;
if (which > PRIO_USER || which < PRIO_PROCESS)
return -EINVAL;
@@ -274,21 +274,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = (struct user_struct *) cred->user;
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
if (!who)
- who = cred->uid;
- else if ((who != cred->uid) &&
- !(user = find_user(who)))
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid) &&
+ !(user = find_user(uid)))
goto out_unlock; /* No processes for this user */
do_each_thread(g, p) {
- if (__task_cred(p)->uid == who) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = 20 - task_nice(p);
if (niceval > retval)
retval = niceval;
}
} while_each_thread(g, p);
- if (who != cred->uid)
+ if (!uid_eq(uid, cred->uid))
free_uid(user); /* for find_user() */
break;
}
@@ -553,9 +554,19 @@ void ctrl_alt_del(void)
*/
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t krgid, kegid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -564,25 +575,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
retval = -EPERM;
if (rgid != (gid_t) -1) {
- if (old->gid == rgid ||
- old->egid == rgid ||
+ if (gid_eq(old->gid, krgid) ||
+ gid_eq(old->egid, krgid) ||
nsown_capable(CAP_SETGID))
- new->gid = rgid;
+ new->gid = krgid;
else
goto error;
}
if (egid != (gid_t) -1) {
- if (old->gid == egid ||
- old->egid == egid ||
- old->sgid == egid ||
+ if (gid_eq(old->gid, kegid) ||
+ gid_eq(old->egid, kegid) ||
+ gid_eq(old->sgid, kegid) ||
nsown_capable(CAP_SETGID))
- new->egid = egid;
+ new->egid = kegid;
else
goto error;
}
if (rgid != (gid_t) -1 ||
- (egid != (gid_t) -1 && egid != old->gid))
+ (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
new->sgid = new->egid;
new->fsgid = new->egid;
@@ -600,9 +611,15 @@ error:
*/
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t kgid;
+
+ kgid = make_kgid(ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -611,9 +628,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
retval = -EPERM;
if (nsown_capable(CAP_SETGID))
- new->gid = new->egid = new->sgid = new->fsgid = gid;
- else if (gid == old->gid || gid == old->sgid)
- new->egid = new->fsgid = gid;
+ new->gid = new->egid = new->sgid = new->fsgid = kgid;
+ else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
+ new->egid = new->fsgid = kgid;
else
goto error;
@@ -631,7 +648,7 @@ static int set_user(struct cred *new)
{
struct user_struct *new_user;
- new_user = alloc_uid(current_user_ns(), new->uid);
+ new_user = alloc_uid(new->uid);
if (!new_user)
return -EAGAIN;
@@ -670,9 +687,19 @@ static int set_user(struct cred *new)
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kruid, keuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -681,29 +708,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
retval = -EPERM;
if (ruid != (uid_t) -1) {
- new->uid = ruid;
- if (old->uid != ruid &&
- old->euid != ruid &&
+ new->uid = kruid;
+ if (!uid_eq(old->uid, kruid) &&
+ !uid_eq(old->euid, kruid) &&
!nsown_capable(CAP_SETUID))
goto error;
}
if (euid != (uid_t) -1) {
- new->euid = euid;
- if (old->uid != euid &&
- old->euid != euid &&
- old->suid != euid &&
+ new->euid = keuid;
+ if (!uid_eq(old->uid, keuid) &&
+ !uid_eq(old->euid, keuid) &&
+ !uid_eq(old->suid, keuid) &&
!nsown_capable(CAP_SETUID))
goto error;
}
- if (new->uid != old->uid) {
+ if (!uid_eq(new->uid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
if (ruid != (uid_t) -1 ||
- (euid != (uid_t) -1 && euid != old->uid))
+ (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
new->suid = new->euid;
new->fsuid = new->euid;
@@ -731,9 +758,15 @@ error:
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kuid;
+
+ kuid = make_kuid(ns, uid);
+ if (!uid_valid(kuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -742,17 +775,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
retval = -EPERM;
if (nsown_capable(CAP_SETUID)) {
- new->suid = new->uid = uid;
- if (uid != old->uid) {
+ new->suid = new->uid = kuid;
+ if (!uid_eq(kuid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
- } else if (uid != old->uid && uid != new->suid) {
+ } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
goto error;
}
- new->fsuid = new->euid = uid;
+ new->fsuid = new->euid = kuid;
retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
if (retval < 0)
@@ -772,9 +805,24 @@ error:
*/
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kuid_t kruid, keuid, ksuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+ ksuid = make_kuid(ns, suid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
+
+ if ((suid != (uid_t) -1) && !uid_valid(ksuid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -784,29 +832,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
retval = -EPERM;
if (!nsown_capable(CAP_SETUID)) {
- if (ruid != (uid_t) -1 && ruid != old->uid &&
- ruid != old->euid && ruid != old->suid)
+ if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
- if (euid != (uid_t) -1 && euid != old->uid &&
- euid != old->euid && euid != old->suid)
+ if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
goto error;
- if (suid != (uid_t) -1 && suid != old->uid &&
- suid != old->euid && suid != old->suid)
+ if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
goto error;
}
if (ruid != (uid_t) -1) {
- new->uid = ruid;
- if (ruid != old->uid) {
+ new->uid = kruid;
+ if (!uid_eq(kruid, old->uid)) {
retval = set_user(new);
if (retval < 0)
goto error;
}
}
if (euid != (uid_t) -1)
- new->euid = euid;
+ new->euid = keuid;
if (suid != (uid_t) -1)
- new->suid = suid;
+ new->suid = ksuid;
new->fsuid = new->euid;
retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -820,14 +868,19 @@ error:
return retval;
}
-SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
const struct cred *cred = current_cred();
int retval;
+ uid_t ruid, euid, suid;
- if (!(retval = put_user(cred->uid, ruid)) &&
- !(retval = put_user(cred->euid, euid)))
- retval = put_user(cred->suid, suid);
+ ruid = from_kuid_munged(cred->user_ns, cred->uid);
+ euid = from_kuid_munged(cred->user_ns, cred->euid);
+ suid = from_kuid_munged(cred->user_ns, cred->suid);
+
+ if (!(retval = put_user(ruid, ruidp)) &&
+ !(retval = put_user(euid, euidp)))
+ retval = put_user(suid, suidp);
return retval;
}
@@ -837,9 +890,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
*/
SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
{
+ struct user_namespace *ns = current_user_ns();
const struct cred *old;
struct cred *new;
int retval;
+ kgid_t krgid, kegid, ksgid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+ ksgid = make_kgid(ns, sgid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
+ if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
+ return -EINVAL;
new = prepare_creds();
if (!new)
@@ -848,23 +914,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
retval = -EPERM;
if (!nsown_capable(CAP_SETGID)) {
- if (rgid != (gid_t) -1 && rgid != old->gid &&
- rgid != old->egid && rgid != old->sgid)
+ if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
goto error;
- if (egid != (gid_t) -1 && egid != old->gid &&
- egid != old->egid && egid != old->sgid)
+ if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
goto error;
- if (sgid != (gid_t) -1 && sgid != old->gid &&
- sgid != old->egid && sgid != old->sgid)
+ if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
goto error;
}
if (rgid != (gid_t) -1)
- new->gid = rgid;
+ new->gid = krgid;
if (egid != (gid_t) -1)
- new->egid = egid;
+ new->egid = kegid;
if (sgid != (gid_t) -1)
- new->sgid = sgid;
+ new->sgid = ksgid;
new->fsgid = new->egid;
return commit_creds(new);
@@ -874,14 +940,19 @@ error:
return retval;
}
-SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
int retval;
+ gid_t rgid, egid, sgid;
+
+ rgid = from_kgid_munged(cred->user_ns, cred->gid);
+ egid = from_kgid_munged(cred->user_ns, cred->egid);
+ sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(cred->gid, rgid)) &&
- !(retval = put_user(cred->egid, egid)))
- retval = put_user(cred->sgid, sgid);
+ if (!(retval = put_user(rgid, rgidp)) &&
+ !(retval = put_user(egid, egidp)))
+ retval = put_user(sgid, sgidp);
return retval;
}
@@ -898,18 +969,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
const struct cred *old;
struct cred *new;
uid_t old_fsuid;
+ kuid_t kuid;
+
+ old = current_cred();
+ old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
+
+ kuid = make_kuid(old->user_ns, uid);
+ if (!uid_valid(kuid))
+ return old_fsuid;
new = prepare_creds();
if (!new)
- return current_fsuid();
- old = current_cred();
- old_fsuid = old->fsuid;
+ return old_fsuid;
- if (uid == old->uid || uid == old->euid ||
- uid == old->suid || uid == old->fsuid ||
+ if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
+ uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
nsown_capable(CAP_SETUID)) {
- if (uid != old_fsuid) {
- new->fsuid = uid;
+ if (!uid_eq(kuid, old->fsuid)) {
+ new->fsuid = kuid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
goto change_okay;
}
@@ -931,18 +1008,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
const struct cred *old;
struct cred *new;
gid_t old_fsgid;
+ kgid_t kgid;
+
+ old = current_cred();
+ old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
+
+ kgid = make_kgid(old->user_ns, gid);
+ if (!gid_valid(kgid))
+ return old_fsgid;
new = prepare_creds();
if (!new)
- return current_fsgid();
- old = current_cred();
- old_fsgid = old->fsgid;
+ return old_fsgid;
- if (gid == old->gid || gid == old->egid ||
- gid == old->sgid || gid == old->fsgid ||
+ if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
+ gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
nsown_capable(CAP_SETGID)) {
- if (gid != old_fsgid) {
- new->fsgid = gid;
+ if (!gid_eq(kgid, old->fsgid)) {
+ new->fsgid = kgid;
goto change_okay;
}
}
@@ -1498,15 +1581,14 @@ static int check_prlimit_permission(struct task_struct *task)
return 0;
tcred = __task_cred(task);
- if (cred->user->user_ns == tcred->user->user_ns &&
- (cred->uid == tcred->euid &&
- cred->uid == tcred->suid &&
- cred->uid == tcred->uid &&
- cred->gid == tcred->egid &&
- cred->gid == tcred->sgid &&
- cred->gid == tcred->gid))
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
return 0;
- if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+ if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
return 0;
return -EPERM;
@@ -1908,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = prctl_get_seccomp();
break;
case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2);
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);
@@ -1979,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = put_user(me->signal->is_child_subreaper,
(int __user *) arg2);
break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
default:
error = -EINVAL;
break;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7b..aa27d391bfc8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* If one has not already been chosen, it checks to see if a
* functional rtc device is available.
*/
-static struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-static inline struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
return NULL;
}
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..6ec7e7e0db43 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
*
* mod_timer_pinned() is a way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
- * and not allow the timer to be migrated to a different CPU.
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline. If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
*
* mod_timer_pinned(timer, expires) is equivalent to:
*
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
* warnings as well as problems when looking into
* timer->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = timer->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
/*
* Couple the lock chain with the lock chain at
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid)
SYSCALL_DEFINE0(getuid)
{
/* Only we change this so SMP safe */
- return current_uid();
+ return from_kuid_munged(current_user_ns(), current_uid());
}
SYSCALL_DEFINE0(geteuid)
{
/* Only we change this so SMP safe */
- return current_euid();
+ return from_kuid_munged(current_user_ns(), current_euid());
}
SYSCALL_DEFINE0(getgid)
{
/* Only we change this so SMP safe */
- return current_gid();
+ return from_kgid_munged(current_user_ns(), current_gid());
}
SYSCALL_DEFINE0(getegid)
{
/* Only we change this so SMP safe */
- return current_egid();
+ return from_kgid_munged(current_user_ns(), current_egid());
}
#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..8c4c07071cc5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,6 @@ if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES
bool "Trace likely/unlikely profiler"
select TRACE_BRANCH_PROFILING
help
- This tracer profiles all the the likely and unlikely macros
+ This tracer profiles all likely and unlikely macros
in the kernel. It will display the results in:
/sys/kernel/debug/tracing/trace_stat/branch_annotated
@@ -373,6 +372,7 @@ config KPROBE_EVENT
depends on HAVE_REGS_AND_STACK_ACCESS_API
bool "Enable kprobes-based dynamic events"
select TRACING
+ select PROBE_EVENTS
default y
help
This allows the user to add tracing events (similar to tracepoints)
@@ -385,6 +385,25 @@ config KPROBE_EVENT
This option is also required by perf-probe subcommand of perf tools.
If you want to use perf tools, this option is strongly recommended.
+config UPROBE_EVENT
+ bool "Enable uprobes-based dynamic events"
+ depends on ARCH_SUPPORTS_UPROBES
+ depends on MMU
+ select UPROBES
+ select PROBE_EVENTS
+ select TRACING
+ default n
+ help
+ This allows the user to add tracing events on top of userspace
+ dynamic events (similar to tracepoints) on the fly via the trace
+ events interface. Those events can be inserted wherever uprobes
+ can probe, and record various registers.
+ This option is required if you plan to use perf-probe subcommand
+ of perf tools on user space applications.
+
+config PROBE_EVENTS
+ def_bool n
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..b831087c8200 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
obj-$(CONFIG_EVENT_TRACING) += blktrace.o
@@ -61,5 +60,7 @@ endif
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0fa92f677c92..a008663d86c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
static int ftrace_cmp_recs(const void *a, const void *b)
{
- const struct dyn_ftrace *reca = a;
- const struct dyn_ftrace *recb = b;
+ const struct dyn_ftrace *key = a;
+ const struct dyn_ftrace *rec = b;
- if (reca->ip > recb->ip)
- return 1;
- if (reca->ip < recb->ip)
+ if (key->flags < rec->ip)
return -1;
+ if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
+ return 1;
return 0;
}
-/**
- * ftrace_location - return true if the ip giving is a traced location
- * @ip: the instruction pointer to check
- *
- * Returns 1 if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
- */
-int ftrace_location(unsigned long ip)
+static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
- key.ip = ip;
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
rec = bsearch(&key, pg->records, pg->index,
sizeof(struct dyn_ftrace),
ftrace_cmp_recs);
if (rec)
- return 1;
+ return rec->ip;
}
return 0;
}
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns rec->ip if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+unsigned long ftrace_location(unsigned long ip)
+{
+ return ftrace_location_range(ip, ip);
+}
+
+/**
+ * ftrace_text_reserved - return true if range contains an ftrace location
+ * @start: start of range to search
+ * @end: end of range to search (inclusive). @end points to the last byte to check.
+ *
+ * Returns 1 if @start and @end contains a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_text_reserved(void *start, void *end)
+{
+ unsigned long ret;
+
+ ret = ftrace_location_range((unsigned long)start,
+ (unsigned long)end);
+
+ return (int)!!ret;
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
-static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
-{
- if (ftrace_pages->index == ftrace_pages->size) {
- /* We should have allocated enough */
- if (WARN_ON(!ftrace_pages->next))
- return NULL;
- ftrace_pages = ftrace_pages->next;
- }
-
- return &ftrace_pages->records[ftrace_pages->index++];
-}
-
-static struct dyn_ftrace *
-ftrace_record_ip(unsigned long ip)
-{
- struct dyn_ftrace *rec;
-
- if (ftrace_disabled)
- return NULL;
-
- rec = ftrace_alloc_dyn_node(ip);
- if (!rec)
- return NULL;
-
- rec->ip = ip;
-
- return rec;
-}
-
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)
}
}
-
-/* Return 1 if the address range is reserved for ftrace */
-int ftrace_text_reserved(void *start, void *end)
-{
- struct dyn_ftrace *rec;
- struct ftrace_page *pg;
-
- do_for_each_ftrace_rec(pg, rec) {
- if (rec->ip <= (unsigned long)end &&
- rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
- return 1;
- } while_for_each_ftrace_rec();
- return 0;
-}
-
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return -1; /* unknow ftrace bug */
}
-static void ftrace_replace_code(int update)
+void __weak ftrace_replace_code(int enable)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)
return;
do_for_each_ftrace_rec(pg, rec) {
- failed = __ftrace_replace_code(rec, update);
+ failed = __ftrace_replace_code(rec, enable);
if (failed) {
ftrace_bug(failed, rec->ip);
/* Stop processing */
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
return 0;
}
-static int __ftrace_modify_code(void *data)
+void ftrace_modify_all_code(int command)
{
- int *command = data;
-
- if (*command & FTRACE_UPDATE_CALLS)
+ if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
- else if (*command & FTRACE_DISABLE_CALLS)
+ else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
- if (*command & FTRACE_UPDATE_TRACE_FUNC)
+ if (command & FTRACE_UPDATE_TRACE_FUNC)
ftrace_update_ftrace_func(ftrace_trace_function);
- if (*command & FTRACE_START_FUNC_RET)
+ if (command & FTRACE_START_FUNC_RET)
ftrace_enable_ftrace_graph_caller();
- else if (*command & FTRACE_STOP_FUNC_RET)
+ else if (command & FTRACE_STOP_FUNC_RET)
ftrace_disable_ftrace_graph_caller();
+}
+
+static int __ftrace_modify_code(void *data)
+{
+ int *command = data;
+
+ ftrace_modify_all_code(*command);
return 0;
}
@@ -2469,57 +2459,35 @@ static int
ftrace_avail_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- iter->pg = ftrace_pages_start;
- iter->ops = &global_ops;
-
- ret = seq_open(file, &show_ftrace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = iter;
- } else {
- kfree(iter);
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
}
- return ret;
+ return iter ? 0 : -ENOMEM;
}
static int
ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- iter->pg = ftrace_pages_start;
- iter->flags = FTRACE_ITER_ENABLED;
- iter->ops = &global_ops;
-
- ret = seq_open(file, &show_ftrace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = iter;
- } else {
- kfree(iter);
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
}
- return ret;
+ return iter ? 0 : -ENOMEM;
}
static void ftrace_filter_reset(struct ftrace_hash *hash)
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
return 0;
}
-static void ftrace_swap_recs(void *a, void *b, int size)
+static int ftrace_cmp_ips(const void *a, const void *b)
+{
+ const unsigned long *ipa = a;
+ const unsigned long *ipb = b;
+
+ if (*ipa > *ipb)
+ return 1;
+ if (*ipa < *ipb)
+ return -1;
+ return 0;
+}
+
+static void ftrace_swap_ips(void *a, void *b, int size)
{
- struct dyn_ftrace *reca = a;
- struct dyn_ftrace *recb = b;
- struct dyn_ftrace t;
+ unsigned long *ipa = a;
+ unsigned long *ipb = b;
+ unsigned long t;
- t = *reca;
- *reca = *recb;
- *recb = t;
+ t = *ipa;
+ *ipa = *ipb;
+ *ipb = t;
}
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *start_pg;
struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
unsigned long count;
unsigned long *p;
unsigned long addr;
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
- pg = ftrace_allocate_pages(count);
- if (!pg)
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, ftrace_swap_ips);
+
+ start_pg = ftrace_allocate_pages(count);
+ if (!start_pg)
return -ENOMEM;
mutex_lock(&ftrace_lock);
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,
if (!mod) {
WARN_ON(ftrace_pages || ftrace_pages_start);
/* First initialization */
- ftrace_pages = ftrace_pages_start = pg;
+ ftrace_pages = ftrace_pages_start = start_pg;
} else {
if (!ftrace_pages)
goto out;
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,
ftrace_pages = ftrace_pages->next;
}
- ftrace_pages->next = pg;
- ftrace_pages = pg;
+ ftrace_pages->next = start_pg;
}
p = start;
+ pg = start_pg;
while (p < end) {
addr = ftrace_call_adjust(*p++);
/*
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!addr)
continue;
- if (!ftrace_record_ip(addr))
- break;
+
+ if (pg->index == pg->size) {
+ /* We should have allocated enough */
+ if (WARN_ON(!pg->next))
+ break;
+ pg = pg->next;
+ }
+
+ rec = &pg->records[pg->index++];
+ rec->ip = addr;
}
- /* These new locations need to be initialized */
- ftrace_new_pgs = pg;
+ /* We should have used all pages */
+ WARN_ON(pg->next);
+
+ /* Assign the last page to ftrace_pages */
+ ftrace_pages = pg;
- /* Make each individual set of pages sorted by ips */
- for (; pg; pg = pg->next)
- sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
- ftrace_cmp_recs, ftrace_swap_recs);
+ /* These new locations need to be initialized */
+ ftrace_new_pgs = start_pg;
/*
* We only need to disable interrupts on start up
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cf8d11e91efd..6420cda62336 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
#include <asm/local.h>
#include "trace.h"
+static void update_pages_handler(struct work_struct *work);
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
+ unsigned int nr_pages;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
+ /* ring buffer pages to update, > 0 to add, < 0 to remove */
+ int nr_pages_to_update;
+ struct list_head new_pages; /* new pages to add */
+ struct work_struct update_pages_work;
+ struct completion update_done;
};
struct ring_buffer {
- unsigned pages;
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resize_disabled;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ /* Reset the head page if it exists */
+ if (cpu_buffer->head_page)
+ rb_set_head_page(cpu_buffer);
+
rb_head_page_deactivate(cpu_buffer);
if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
{
+ int i;
struct buffer_page *bpage, *tmp;
- LIST_HEAD(pages);
- unsigned i;
-
- WARN_ON(!nr_pages);
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu_buffer->cpu));
+ cpu_to_node(cpu));
if (!bpage)
goto free_pages;
- rb_check_bpage(cpu_buffer, bpage);
+ list_add(&bpage->list, pages);
- list_add(&bpage->list, &pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
goto free_pages;
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}
+ return 0;
+
+free_pages:
+ list_for_each_entry_safe(bpage, tmp, pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+
+ return -ENOMEM;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ LIST_HEAD(pages);
+
+ WARN_ON(!nr_pages);
+
+ if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ return -ENOMEM;
+
/*
* The ring buffer page list is a circular list that does not
* start and end with a list head. All page list items point to
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->pages = pages.next;
list_del(&pages);
+ cpu_buffer->nr_pages = nr_pages;
+
rb_check_pages(cpu_buffer);
return 0;
-
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- return -ENOMEM;
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
raw_spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
+ init_completion(&cpu_buffer->update_done);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
- ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ ret = rb_allocate_pages(cpu_buffer, nr_pages);
if (ret < 0)
goto fail_free_reader;
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
{
struct ring_buffer *buffer;
int bsize;
- int cpu;
+ int cpu, nr_pages;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
/* need at least two pages */
- if (buffer->pages < 2)
- buffer->pages = 2;
+ if (nr_pages < 2)
+ nr_pages = 2;
/*
* In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
for_each_buffer_cpu(buffer, cpu) {
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
}
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-static void
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
+{
+ return local_read(&bpage->write) & RB_WRITE_MASK;
+}
+
+static int
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+{
+ struct list_head *tail_page, *to_remove, *next_page;
+ struct buffer_page *to_remove_page, *tmp_iter_page;
+ struct buffer_page *last_page, *first_page;
+ unsigned int nr_removed;
+ unsigned long head_bit;
+ int page_entries;
+
+ head_bit = 0;
raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
+ atomic_inc(&cpu_buffer->record_disabled);
+ /*
+ * We don't race with the readers since we have acquired the reader
+ * lock. We also don't race with writers after disabling recording.
+ * This makes it easy to figure out the first and the last page to be
+ * removed from the list. We unlink all the pages in between including
+ * the first and last pages. This is done in a busy loop so that we
+ * lose the least number of traces.
+ * The pages are freed after we restart recording and unlock readers.
+ */
+ tail_page = &cpu_buffer->tail_page->list;
- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;
- p = cpu_buffer->pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
+ /*
+ * tail page might be on reader page, we remove the next page
+ * from the ring buffer
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ tail_page = rb_list_head(tail_page->next);
+ to_remove = tail_page;
+
+ /* start of pages to remove */
+ first_page = list_entry(rb_list_head(to_remove->next),
+ struct buffer_page, list);
+
+ for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
+ to_remove = rb_list_head(to_remove)->next;
+ head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
- if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
- goto out;
- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);
+ next_page = rb_list_head(to_remove)->next;
-out:
+ /*
+ * Now we remove all pages between tail_page and next_page.
+ * Make sure that we have head_bit value preserved for the
+ * next page
+ */
+ tail_page->next = (struct list_head *)((unsigned long)next_page |
+ head_bit);
+ next_page = rb_list_head(next_page);
+ next_page->prev = tail_page;
+
+ /* make sure pages points to a valid page in the ring buffer */
+ cpu_buffer->pages = next_page;
+
+ /* update head page */
+ if (head_bit)
+ cpu_buffer->head_page = list_entry(next_page,
+ struct buffer_page, list);
+
+ /*
+ * change read pointer to make sure any read iterators reset
+ * themselves
+ */
+ cpu_buffer->read = 0;
+
+ /* pages are removed, resume tracing and then free the pages */
+ atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
+
+ /* last buffer page to remove */
+ last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
+ list);
+ tmp_iter_page = first_page;
+
+ do {
+ to_remove_page = tmp_iter_page;
+ rb_inc_page(cpu_buffer, &tmp_iter_page);
+
+ /* update the counters */
+ page_entries = rb_page_entries(to_remove_page);
+ if (page_entries) {
+ /*
+ * If something was added to this page, it was full
+ * since it is not the tail page. So we deduct the
+ * bytes consumed in ring buffer from here.
+ * No need to update overruns, since this page is
+ * deleted from ring buffer and its entries are
+ * already accounted for.
+ */
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ }
+
+ /*
+ * We have already removed references to this list item, just
+ * free up the buffer_page and its page
+ */
+ free_buffer_page(to_remove_page);
+ nr_removed--;
+
+ } while (to_remove_page != last_page);
+
+ RB_WARN_ON(cpu_buffer, nr_removed);
+
+ return nr_removed == 0;
}
-static void
-rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *pages, unsigned nr_pages)
+static int
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *bpage;
- struct list_head *p;
- unsigned i;
+ struct list_head *pages = &cpu_buffer->new_pages;
+ int retries, success;
raw_spin_lock_irq(&cpu_buffer->reader_lock);
- rb_head_page_deactivate(cpu_buffer);
+ /*
+ * We are holding the reader lock, so the reader page won't be swapped
+ * in the ring buffer. Now we are racing with the writer trying to
+ * move head page and the tail page.
+ * We are going to adapt the reader page update process where:
+ * 1. We first splice the start and end of list of new pages between
+ * the head page and its previous page.
+ * 2. We cmpxchg the prev_page->next to point from head page to the
+ * start of new pages list.
+ * 3. Finally, we update the head->prev to the end of new list.
+ *
+ * We will try this process 10 times, to make sure that we don't keep
+ * spinning.
+ */
+ retries = 10;
+ success = 0;
+ while (retries--) {
+ struct list_head *head_page, *prev_page, *r;
+ struct list_head *last_page, *first_page;
+ struct list_head *head_page_with_bit;
- for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
- goto out;
- p = pages->next;
- bpage = list_entry(p, struct buffer_page, list);
- list_del_init(&bpage->list);
- list_add_tail(&bpage->list, cpu_buffer->pages);
+ head_page = &rb_set_head_page(cpu_buffer)->list;
+ prev_page = head_page->prev;
+
+ first_page = pages->next;
+ last_page = pages->prev;
+
+ head_page_with_bit = (struct list_head *)
+ ((unsigned long)head_page | RB_PAGE_HEAD);
+
+ last_page->next = head_page_with_bit;
+ first_page->prev = prev_page;
+
+ r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
+
+ if (r == head_page_with_bit) {
+ /*
+ * yay, we replaced the page pointer to our new list,
+ * now, we just have to update to head page's prev
+ * pointer to point to end of list
+ */
+ head_page->prev = last_page;
+ success = 1;
+ break;
+ }
}
- rb_reset_cpu(cpu_buffer);
- rb_check_pages(cpu_buffer);
-out:
+ if (success)
+ INIT_LIST_HEAD(pages);
+ /*
+ * If we weren't successful in adding in new pages, warn and stop
+ * tracing
+ */
+ RB_WARN_ON(cpu_buffer, !success);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ /* free pages if they weren't inserted */
+ if (!success) {
+ struct buffer_page *bpage, *tmp;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+ return success;
+}
+
+static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int success;
+
+ if (cpu_buffer->nr_pages_to_update > 0)
+ success = rb_insert_pages(cpu_buffer);
+ else
+ success = rb_remove_pages(cpu_buffer,
+ -cpu_buffer->nr_pages_to_update);
+
+ if (success)
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+}
+
+static void update_pages_handler(struct work_struct *work)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
+ struct ring_buffer_per_cpu, update_pages_work);
+ rb_update_pages(cpu_buffer);
+ complete(&cpu_buffer->update_done);
}
/**
@@ -1283,16 +1471,14 @@ out:
*
* Minimum size is 2 * BUF_PAGE_SIZE.
*
- * Returns -1 on failure.
+ * Returns 0 on success and < 0 on failure.
*/
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+ int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages, rm_pages, new_pages;
- struct buffer_page *bpage, *tmp;
- unsigned long buffer_size;
- LIST_HEAD(pages);
- int i, cpu;
+ unsigned nr_pages;
+ int cpu, err = 0;
/*
* Always succeed at resizing a non-existent buffer:
@@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
- buffer_size = buffer->pages * BUF_PAGE_SIZE;
/* we need a minimum of two pages */
if (size < BUF_PAGE_SIZE * 2)
size = BUF_PAGE_SIZE * 2;
- if (size == buffer_size)
- return size;
-
- atomic_inc(&buffer->record_disabled);
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- /* Make sure all writers are done with this buffer. */
- synchronize_sched();
+ /*
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ if (atomic_read(&buffer->resize_disabled))
+ return -EBUSY;
+ /* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
- get_online_cpus();
-
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- if (size < buffer_size) {
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /* calculate the pages to update */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
- /* easy case, just free pages */
- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
- goto out_fail;
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+ /*
+ * nothing more to do for removing pages or no update
+ */
+ if (cpu_buffer->nr_pages_to_update <= 0)
+ continue;
+ /*
+ * to add pages, make sure all new pages can be
+ * allocated without receiving ENOMEM
+ */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
- rm_pages = buffer->pages - nr_pages;
+ get_online_cpus();
+ /*
+ * Fire off all the required work handlers
+ * We can't schedule on offline CPUs, but it's not necessary
+ * since we can change their buffer sizes without any race.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ if (cpu_online(cpu))
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ else
+ rb_update_pages(cpu_buffer);
+ }
+ /* wait for all the updates to complete */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- rb_remove_pages(cpu_buffer, rm_pages);
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ if (cpu_online(cpu))
+ wait_for_completion(&cpu_buffer->update_done);
+ cpu_buffer->nr_pages_to_update = 0;
}
- goto out;
- }
- /*
- * This is a bit more difficult. We only want to add pages
- * when we can allocate enough for all CPUs. We do this
- * by allocating all the pages and storing them on a local
- * link list. If we succeed in our allocation, then we
- * add these pages to the cpu_buffers. Otherwise we just free
- * them all and return -ENOMEM;
- */
- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
- goto out_fail;
+ put_online_cpus();
+ } else {
+ cpu_buffer = buffer->buffers[cpu_id];
- new_pages = nr_pages - buffer->pages;
+ if (nr_pages == cpu_buffer->nr_pages)
+ goto out;
- for_each_buffer_cpu(buffer, cpu) {
- for (i = 0; i < new_pages; i++) {
- struct page *page;
- /*
- * __GFP_NORETRY flag makes sure that the allocation
- * fails gracefully without invoking oom-killer and
- * the system is not destabilized.
- */
- bpage = kzalloc_node(ALIGN(sizeof(*bpage),
- cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
- cpu_to_node(cpu));
- if (!bpage)
- goto free_pages;
- list_add(&bpage->list, &pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (cpu_buffer->nr_pages_to_update > 0 &&
+ __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu_id)) {
+ err = -ENOMEM;
+ goto out_err;
}
- }
- for_each_buffer_cpu(buffer, cpu) {
- cpu_buffer = buffer->buffers[cpu];
- rb_insert_pages(cpu_buffer, &pages, new_pages);
- }
+ get_online_cpus();
- if (RB_WARN_ON(buffer, !list_empty(&pages)))
- goto out_fail;
+ if (cpu_online(cpu_id)) {
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ } else
+ rb_update_pages(cpu_buffer);
+
+ cpu_buffer->nr_pages_to_update = 0;
+ put_online_cpus();
+ }
out:
- buffer->pages = nr_pages;
- put_online_cpus();
+ /*
+ * The ring buffer resize can happen with the ring buffer
+ * enabled, so that the update disturbs the tracing as little
+ * as possible. But if the buffer is disabled, we do not need
+ * to worry about that, and we can take the time to verify
+ * that the buffer is not corrupt.
+ */
+ if (atomic_read(&buffer->record_disabled)) {
+ atomic_inc(&buffer->record_disabled);
+ /*
+ * Even though the buffer was disabled, we must make sure
+ * that it is truly disabled before calling rb_check_pages.
+ * There could have been a race between checking
+ * record_disable and incrementing it.
+ */
+ synchronize_sched();
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_check_pages(cpu_buffer);
+ }
+ atomic_dec(&buffer->record_disabled);
+ }
+
mutex_unlock(&buffer->mutex);
+ return size;
- atomic_dec(&buffer->record_disabled);
+ out_err:
+ for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_page *bpage, *tmp;
- return size;
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->nr_pages_to_update = 0;
- free_pages:
- list_for_each_entry_safe(bpage, tmp, &pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- put_online_cpus();
- mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -ENOMEM;
+ if (list_empty(&cpu_buffer->new_pages))
+ continue;
- /*
- * Something went totally wrong, and we are too paranoid
- * to even clean up the mess.
- */
- out_fail:
- put_online_cpus();
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
mutex_unlock(&buffer->mutex);
- atomic_dec(&buffer->record_disabled);
- return -1;
+ return err;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned long rb_page_write(struct buffer_page *bpage)
-{
- return local_read(&bpage->write) & RB_WRITE_MASK;
-}
-
static inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
-static inline unsigned long rb_page_entries(struct buffer_page *bpage)
-{
- return local_read(&bpage->entries) & RB_WRITE_MASK;
-}
-
/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
@@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
- max_count = cpu_buffer->buffer->pages * 100;
+ max_count = cpu_buffer->nr_pages * 100;
while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
iter->cpu_buffer = cpu_buffer;
+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
return iter;
@@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ /*
+ * Ring buffer is disabled from recording, here's a good place
+ * to check the integrity of the ring buffer.
+ */
+ rb_check_pages(cpu_buffer);
+
atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->buffer->resize_disabled);
kfree(iter);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
*/
-unsigned long ring_buffer_size(struct ring_buffer *buffer)
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
{
- return BUF_PAGE_SIZE * buffer->pages;
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->nr_pages
+ * Since the nr_pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
@@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&buffer->resize_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
!cpumask_test_cpu(cpu, buffer_b->cpumask))
goto out;
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
/* At least make sure the two buffers are somewhat the same */
- if (buffer_a->pages != buffer_b->pages)
+ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
ret = -EAGAIN;
@@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
if (atomic_read(&buffer_b->record_disabled))
goto out;
- cpu_buffer_a = buffer_a->buffers[cpu];
- cpu_buffer_b = buffer_b->buffers[cpu];
-
if (atomic_read(&cpu_buffer_a->record_disabled))
goto out;
@@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
+ int cpu_i, nr_pages_same;
+ unsigned int nr_pages;
switch (action) {
case CPU_UP_PREPARE:
@@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self,
if (cpumask_test_cpu(cpu, buffer->cpumask))
return NOTIFY_OK;
+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
+ }
+ }
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
buffer->buffers[cpu] =
- rb_allocate_cpu_buffer(buffer, cpu);
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
WARN(1, "failed to allocate ring buffer on CPU %ld\n",
cpu);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a22255c1010..68032c6177db 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,18 +87,6 @@ static int tracing_disabled = 1;
DEFINE_PER_CPU(int, ftrace_cpu_disabled);
-static inline void ftrace_disable_cpu(void)
-{
- preempt_disable();
- __this_cpu_inc(ftrace_cpu_disabled);
-}
-
-static inline void ftrace_enable_cpu(void)
-{
- __this_cpu_dec(ftrace_cpu_disabled);
- preempt_enable();
-}
-
cpumask_var_t __read_mostly tracing_buffer_mask;
/*
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- void *ret;
if (s->len <= s->readpos)
return -EBUSY;
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
len = s->len - s->readpos;
if (cnt > len)
cnt = len;
- ret = memcpy(buf, s->buffer + s->readpos, cnt);
- if (!ret)
- return -EFAULT;
+ memcpy(buf, s->buffer + s->readpos, cnt);
s->readpos += cnt;
return cnt;
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&ftrace_max_lock);
- ftrace_disable_cpu();
-
ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
if (ret == -EBUSY) {
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
"Failed to swap buffers due to commit in progress\n");
}
- ftrace_enable_cpu();
-
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* Register a new plugin tracer.
*/
int register_tracer(struct tracer *type)
-__releases(kernel_lock)
-__acquires(kernel_lock)
{
struct tracer *t;
int ret = 0;
@@ -841,7 +820,8 @@ __acquires(kernel_lock)
/* If we expanded the buffers, make sure the max is expanded too */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, trace_buf_size);
+ ring_buffer_resize(max_tr.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
@@ -857,7 +837,8 @@ __acquires(kernel_lock)
/* Shrink the max buffer again */
if (ring_buffer_expanded && type->use_max_tr)
- ring_buffer_resize(max_tr.buffer, 1);
+ ring_buffer_resize(max_tr.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
printk(KERN_CONT "PASSED\n");
}
@@ -917,13 +898,6 @@ out:
mutex_unlock(&trace_types_lock);
}
-static void __tracing_reset(struct ring_buffer *buffer, int cpu)
-{
- ftrace_disable_cpu();
- ring_buffer_reset_cpu(buffer, cpu);
- ftrace_enable_cpu();
-}
-
void tracing_reset(struct trace_array *tr, int cpu)
{
struct ring_buffer *buffer = tr->buffer;
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
/* Make sure all commits have finished */
synchronize_sched();
- __tracing_reset(buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ring_buffer_record_enable(buffer);
}
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- __tracing_reset(buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ring_buffer_record_enable(buffer);
}
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
#endif /* CONFIG_STACKTRACE */
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ char buffer[TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct *trace_percpu_sirq_buffer;
+static struct trace_buffer_struct *trace_percpu_irq_buffer;
+static struct trace_buffer_struct *trace_percpu_nmi_buffer;
+
+/*
+ * The buffer used is dependent on the context. There is a per cpu
+ * buffer for normal context, softirq contex, hard irq context and
+ * for NMI context. Thise allows for lockless recording.
+ *
+ * Note, if the buffers failed to be allocated, then this returns NULL
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *percpu_buffer;
+ struct trace_buffer_struct *buffer;
+
+ /*
+ * If we have allocated per cpu buffers, then we do not
+ * need to do any locking.
+ */
+ if (in_nmi())
+ percpu_buffer = trace_percpu_nmi_buffer;
+ else if (in_irq())
+ percpu_buffer = trace_percpu_irq_buffer;
+ else if (in_softirq())
+ percpu_buffer = trace_percpu_sirq_buffer;
+ else
+ percpu_buffer = trace_percpu_buffer;
+
+ if (!percpu_buffer)
+ return NULL;
+
+ buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
+
+ return buffer->buffer;
+}
+
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct *sirq_buffers;
+ struct trace_buffer_struct *irq_buffers;
+ struct trace_buffer_struct *nmi_buffers;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!buffers)
+ goto err_warn;
+
+ sirq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!sirq_buffers)
+ goto err_sirq;
+
+ irq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!irq_buffers)
+ goto err_irq;
+
+ nmi_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!nmi_buffers)
+ goto err_nmi;
+
+ trace_percpu_buffer = buffers;
+ trace_percpu_sirq_buffer = sirq_buffers;
+ trace_percpu_irq_buffer = irq_buffers;
+ trace_percpu_nmi_buffer = nmi_buffers;
+
+ return 0;
+
+ err_nmi:
+ free_percpu(irq_buffers);
+ err_irq:
+ free_percpu(sirq_buffers);
+ err_sirq:
+ free_percpu(buffers);
+ err_warn:
+ WARN(1, "Could not allocate percpu trace_printk buffer");
+ return -ENOMEM;
+}
+
+void trace_printk_init_buffers(void)
+{
+ static int buffers_allocated;
+
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ pr_info("ftrace: Allocated trace_printk buffers\n");
+
+ buffers_allocated = 1;
+}
+
/**
* trace_vbprintk - write binary msg to tracing buffer
*
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- static arch_spinlock_t trace_buf_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
- static u32 trace_buf[TRACE_BUF_SIZE];
-
struct ftrace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
struct bprint_entry *entry;
unsigned long flags;
- int disable;
- int cpu, len = 0, size, pc;
+ char *tbuffer;
+ int len = 0, size, pc;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
pc = preempt_count();
preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disable = atomic_inc_return(&data->disabled);
- if (unlikely(disable != 1))
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
goto out;
+ }
- /* Lockdep uses trace_printk for lock tracing */
- local_irq_save(flags);
- arch_spin_lock(&trace_buf_lock);
- len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+ len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
- if (len > TRACE_BUF_SIZE || len < 0)
- goto out_unlock;
+ if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
+ goto out;
+ local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
flags, pc);
if (!event)
- goto out_unlock;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->fmt = fmt;
- memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!filter_check_discard(call, entry, buffer, event)) {
ring_buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
}
-out_unlock:
- arch_spin_unlock(&trace_buf_lock);
- local_irq_restore(flags);
-
out:
- atomic_dec_return(&data->disabled);
preempt_enable_notrace();
unpause_graph_tracing();
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr,
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
- static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
- static char trace_buf[TRACE_BUF_SIZE];
-
struct ftrace_event_call *call = &event_print;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- struct trace_array_cpu *data;
- int cpu, len = 0, size, pc;
+ int len = 0, size, pc;
struct print_entry *entry;
- unsigned long irq_flags;
- int disable;
+ unsigned long flags;
+ char *tbuffer;
if (tracing_disabled || tracing_selftest_running)
return 0;
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
pc = preempt_count();
preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disable = atomic_inc_return(&data->disabled);
- if (unlikely(disable != 1))
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
goto out;
+ }
- pause_graph_tracing();
- raw_local_irq_save(irq_flags);
- arch_spin_lock(&trace_buf_lock);
- len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+ len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
+ if (len > TRACE_BUF_SIZE)
+ goto out;
+ local_save_flags(flags);
size = sizeof(*entry) + len + 1;
buffer = tr->buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, pc);
+ flags, pc);
if (!event)
- goto out_unlock;
+ goto out;
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, trace_buf, len);
+ memcpy(&entry->buf, tbuffer, len);
entry->buf[len] = '\0';
if (!filter_check_discard(call, entry, buffer, event)) {
ring_buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 6, pc);
+ ftrace_trace_stack(buffer, flags, 6, pc);
}
-
- out_unlock:
- arch_spin_unlock(&trace_buf_lock);
- raw_local_irq_restore(irq_flags);
- unpause_graph_tracing();
out:
- atomic_dec_return(&data->disabled);
preempt_enable_notrace();
+ unpause_graph_tracing();
return len;
}
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
static void trace_iterator_increment(struct trace_iterator *iter)
{
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
-
iter->idx++;
if (iter->buffer_iter[iter->cpu])
ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-
- ftrace_enable_cpu();
}
static struct trace_entry *
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
-
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
lost_events);
- ftrace_enable_cpu();
-
if (event) {
iter->ent_size = ring_buffer_event_length(event);
return ring_buffer_event_data(event);
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
static void trace_consume(struct trace_iterator *iter)
{
- /* Don't allow ftrace to trace into the ring buffers */
- ftrace_disable_cpu();
ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
&iter->lost_events);
- ftrace_enable_cpu();
}
static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
iter->cpu = 0;
iter->idx = -1;
- ftrace_disable_cpu();
-
if (cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu)
tracing_iter_reset(iter, cpu);
} else
tracing_iter_reset(iter, cpu_file);
- ftrace_enable_cpu();
-
iter->leftover = 0;
for (p = iter; p && l < *pos; p = s_next(m, p, &l))
;
@@ -2332,15 +2371,13 @@ static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file)
{
long cpu_file = (long) inode->i_private;
- void *fail_ret = ERR_PTR(-ENOMEM);
struct trace_iterator *iter;
- struct seq_file *m;
- int cpu, ret;
+ int cpu;
if (tracing_disabled)
return ERR_PTR(-ENODEV);
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
if (!iter)
return ERR_PTR(-ENOMEM);
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file)
tracing_iter_reset(iter, cpu);
}
- ret = seq_open(file, &tracer_seq_ops);
- if (ret < 0) {
- fail_ret = ERR_PTR(ret);
- goto fail_buffer;
- }
-
- m = file->private_data;
- m->private = iter;
-
mutex_unlock(&trace_types_lock);
return iter;
- fail_buffer:
- for_each_tracing_cpu(cpu) {
- if (iter->buffer_iter[cpu])
- ring_buffer_read_finish(iter->buffer_iter[cpu]);
- }
- free_cpumask_var(iter->started);
- tracing_start();
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
- kfree(iter);
-
- return fail_ret;
+ seq_release_private(inode, file);
+ return ERR_PTR(-ENOMEM);
}
int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file)
tracing_start();
mutex_unlock(&trace_types_lock);
- seq_release(inode, file);
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
kfree(iter->trace);
- kfree(iter);
+ seq_release_private(inode, file);
return 0;
}
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (cpumask_test_cpu(cpu, tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
}
}
arch_spin_unlock(&ftrace_max_lock);
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}
-static int __tracing_resize_ring_buffer(unsigned long size)
+static void set_buffer_entries(struct trace_array *tr, unsigned long val)
+{
+ int cpu;
+ for_each_tracing_cpu(cpu)
+ tr->data[cpu]->entries = val;
+}
+
+static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
{
int ret;
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
*/
ring_buffer_expanded = 1;
- ret = ring_buffer_resize(global_trace.buffer, size);
+ ret = ring_buffer_resize(global_trace.buffer, size, cpu);
if (ret < 0)
return ret;
if (!current_trace->use_max_tr)
goto out;
- ret = ring_buffer_resize(max_tr.buffer, size);
+ ret = ring_buffer_resize(max_tr.buffer, size, cpu);
if (ret < 0) {
- int r;
+ int r = 0;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ int i;
+ for_each_tracing_cpu(i) {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[i]->entries,
+ i);
+ if (r < 0)
+ break;
+ }
+ } else {
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ }
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.entries);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size)
return ret;
}
- max_tr.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&max_tr, size);
+ else
+ max_tr.data[cpu]->entries = size;
+
out:
- global_trace.entries = size;
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&global_trace, size);
+ else
+ global_trace.data[cpu]->entries = size;
return ret;
}
-static ssize_t tracing_resize_ring_buffer(unsigned long size)
+static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
{
- int cpu, ret = size;
+ int ret = size;
mutex_lock(&trace_types_lock);
- tracing_stop();
-
- /* disable all cpu buffers */
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_inc(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_inc(&max_tr.data[cpu]->disabled);
+ if (cpu_id != RING_BUFFER_ALL_CPUS) {
+ /* make sure, this cpu is enabled in the mask */
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- if (size != global_trace.entries)
- ret = __tracing_resize_ring_buffer(size);
-
+ ret = __tracing_resize_ring_buffer(size, cpu_id);
if (ret < 0)
ret = -ENOMEM;
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_dec(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_dec(&max_tr.data[cpu]->disabled);
- }
-
- tracing_start();
+out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
return ret;
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded) {
- ret = __tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
ret = 0;
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
- ring_buffer_resize(max_tr.buffer, 1);
- max_tr.entries = 1;
+ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&max_tr, 1);
}
destroy_trace_option_files(topts);
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf)
topts = create_trace_option_files(current_trace);
if (current_trace->use_max_tr) {
- ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
- if (ret < 0)
- goto out;
- max_tr.entries = global_trace.entries;
+ int cpu;
+ /* we need to make per cpu buffer sizes equivalent */
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(max_tr.buffer,
+ global_trace.data[cpu]->entries,
+ cpu);
+ if (ret < 0)
+ goto out;
+ max_tr.data[cpu]->entries =
+ global_trace.data[cpu]->entries;
+ }
}
if (t->init) {
@@ -3642,30 +3688,82 @@ out_err:
goto out;
}
+struct ftrace_entries_info {
+ struct trace_array *tr;
+ int cpu;
+};
+
+static int tracing_entries_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->tr = &global_trace;
+ info->cpu = (unsigned long)inode->i_private;
+
+ filp->private_data = info;
+
+ return 0;
+}
+
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_array *tr = filp->private_data;
- char buf[96];
- int r;
+ struct ftrace_entries_info *info = filp->private_data;
+ struct trace_array *tr = info->tr;
+ char buf[64];
+ int r = 0;
+ ssize_t ret;
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- r = sprintf(buf, "%lu (expanded: %lu)\n",
- tr->entries >> 10,
- trace_buf_size >> 10);
- else
- r = sprintf(buf, "%lu\n", tr->entries >> 10);
+
+ if (info->cpu == RING_BUFFER_ALL_CPUS) {
+ int cpu, buf_size_same;
+ unsigned long size;
+
+ size = 0;
+ buf_size_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_tracing_cpu(cpu) {
+ /* fill in the size from first enabled cpu */
+ if (size == 0)
+ size = tr->data[cpu]->entries;
+ if (size != tr->data[cpu]->entries) {
+ buf_size_same = 0;
+ break;
+ }
+ }
+
+ if (buf_size_same) {
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ size >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", size >> 10);
+ } else
+ r = sprintf(buf, "X\n");
+ } else
+ r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
+
mutex_unlock(&trace_types_lock);
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ return ret;
}
static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct ftrace_entries_info *info = filp->private_data;
unsigned long val;
int ret;
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;
- ret = tracing_resize_ring_buffer(val);
+ ret = tracing_resize_ring_buffer(val, info->cpu);
if (ret < 0)
return ret;
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static int
+tracing_entries_release(struct inode *inode, struct file *filp)
+{
+ struct ftrace_entries_info *info = filp->private_data;
+
+ kfree(info);
+
+ return 0;
+}
+
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += tr->entries >> 10;
+ size += tr->data[cpu]->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
tracing_off();
/* resize the ring buffer to 0 */
- tracing_resize_ring_buffer(0);
+ tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
return 0;
}
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct print_entry *entry;
unsigned long irq_flags;
struct page *pages[2];
+ void *map_page[2];
int nr_pages = 1;
ssize_t written;
- void *page1;
- void *page2;
int offset;
int size;
int len;
int ret;
+ int i;
if (tracing_disabled)
return -EINVAL;
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
goto out;
}
- page1 = kmap_atomic(pages[0]);
- if (nr_pages == 2)
- page2 = kmap_atomic(pages[1]);
+ for (i = 0; i < nr_pages; i++)
+ map_page[i] = kmap_atomic(pages[i]);
local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* possible \n added */
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (nr_pages == 2) {
len = PAGE_SIZE - offset;
- memcpy(&entry->buf, page1 + offset, len);
- memcpy(&entry->buf[len], page2, cnt - len);
+ memcpy(&entry->buf, map_page[0] + offset, len);
+ memcpy(&entry->buf[len], map_page[1], cnt - len);
} else
- memcpy(&entry->buf, page1 + offset, cnt);
+ memcpy(&entry->buf, map_page[0] + offset, cnt);
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
*fpos += written;
out_unlock:
- if (nr_pages == 2)
- kunmap_atomic(page2);
- kunmap_atomic(page1);
- while (nr_pages > 0)
- put_page(pages[--nr_pages]);
+ for (i = 0; i < nr_pages; i++){
+ kunmap_atomic(map_page[i]);
+ put_page(pages[i]);
+ }
out:
return written;
}
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = {
};
static const struct file_operations tracing_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_entries_open,
.read = tracing_entries_read,
.write = tracing_entries_write,
+ .release = tracing_entries_release,
.llseek = generic_file_llseek,
};
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)
struct dentry *d_cpu;
char cpu_dir[30]; /* 30 characters should be more than enough */
+ if (!d_percpu)
+ return;
+
snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu)
trace_create_file("stats", 0444, d_cpu,
(void *) cpu, &tracing_stats_fops);
+
+ trace_create_file("buffer_size_kb", 0444, d_cpu,
+ (void *) cpu, &tracing_entries_fops);
}
#ifdef CONFIG_FTRACE_SELFTEST
@@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void)
(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
trace_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
+ (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", 0444, d_tracer,
&global_trace, &tracing_total_entries_fops);
@@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void)
if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
+ /* Only allocate trace_printk buffers if a trace_printk exists */
+ if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+ trace_printk_init_buffers();
+
/* To save memory, keep the ring buffer size to its minimum */
if (ring_buffer_expanded)
ring_buf_size = trace_buf_size;
@@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void)
WARN_ON(1);
goto out_free_cpumask;
}
- global_trace.entries = ring_buffer_size(global_trace.buffer);
if (global_trace.buffer_disabled)
tracing_off();
@@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void)
ring_buffer_free(global_trace.buffer);
goto out_free_cpumask;
}
- max_tr.entries = 1;
#endif
/* Allocate the first page for all buffers */
@@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void)
max_tr.data[i] = &per_cpu(max_tr_data, i);
}
+ set_buffer_entries(&global_trace,
+ ring_buffer_size(global_trace.buffer, 0));
+#ifdef CONFIG_TRACER_MAX_TRACE
+ set_buffer_entries(&max_tr, 1);
+#endif
+
trace_init_cmdlines();
register_tracer(&nop_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f95d65da6db8..5aec220d2de0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
+struct uprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long ip;
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -131,6 +136,7 @@ struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
+ unsigned long entries;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -152,7 +158,6 @@ struct trace_array_cpu {
*/
struct trace_array {
struct ring_buffer *buffer;
- unsigned long entries;
int cpu;
int buffer_disabled;
cycle_t time_start;
@@ -826,6 +831,8 @@ extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+void trace_printk_init_buffers(void);
+
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
extern struct ftrace_event_call \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9d..29111da1d100 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
if (!call->name || !call->class || !call->class->reg)
continue;
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
if (match &&
strcmp(match, call->name) != 0 &&
strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->class->reg)
+ if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
trace_create_file("enable", 0644, call->dir, call,
enable);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc856..e039906b037d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
.event.type = etype, \
.class = &event_class_ftrace_##call, \
.print_fmt = print, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 580a05ec926b..b31d3d5699fe 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,547 +19,15 @@
#include <linux/module.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-#include <linux/debugfs.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/ptrace.h>
-#include <linux/perf_event.h>
-#include <linux/stringify.h>
-#include <linux/limits.h>
-#include <asm/bitsperlong.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-#define MAX_TRACE_ARGS 128
-#define MAX_ARGSTR_LEN 63
-#define MAX_EVENT_NAME_LEN 64
-#define MAX_STRING_SIZE PATH_MAX
-#define KPROBE_EVENT_SYSTEM "kprobes"
-
-/* Reserved field names */
-#define FIELD_STRING_IP "__probe_ip"
-#define FIELD_STRING_RETIP "__probe_ret_ip"
-#define FIELD_STRING_FUNC "__probe_func"
-
-const char *reserved_field_names[] = {
- "common_type",
- "common_flags",
- "common_preempt_count",
- "common_pid",
- "common_tgid",
- FIELD_STRING_IP,
- FIELD_STRING_RETIP,
- FIELD_STRING_FUNC,
-};
-
-/* Printing function type */
-typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
- void *);
-#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
-#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
-
-/* Printing in basic type function template */
-#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
-static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent)\
-{ \
- return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
-} \
-static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
-
-DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
-DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
-
-/* data_rloc: data relative location, compatible with u32 */
-#define make_data_rloc(len, roffs) \
- (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
-#define get_rloc_len(dl) ((u32)(dl) >> 16)
-#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
-
-static inline void *get_rloc_data(u32 *dl)
-{
- return (u8 *)dl + get_rloc_offs(*dl);
-}
-
-/* For data_loc conversion */
-static inline void *get_loc_data(u32 *dl, void *ent)
-{
- return (u8 *)ent + get_rloc_offs(*dl);
-}
-
-/*
- * Convert data_rloc to data_loc:
- * data_rloc stores the offset from data_rloc itself, but data_loc
- * stores the offset from event entry.
- */
-#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
-
-/* For defining macros, define string/string_size types */
-typedef u32 string;
-typedef u32 string_size;
-
-/* Print type function for string type */
-static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
- const char *name,
- void *data, void *ent)
-{
- int len = *(u32 *)data >> 16;
-
- if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
- else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
-}
-static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
-
-/* Data fetch function type */
-typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
-
-struct fetch_param {
- fetch_func_t fn;
- void *data;
-};
-
-static __kprobes void call_fetch(struct fetch_param *fprm,
- struct pt_regs *regs, void *dest)
-{
- return fprm->fn(regs, fprm->data, dest);
-}
-
-#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
-/*
- * Define macro for basic types - we don't need to define s* types, because
- * we have to care only about bitwidth at recording time.
- */
-#define DEFINE_BASIC_FETCH_FUNCS(method) \
-DEFINE_FETCH_##method(u8) \
-DEFINE_FETCH_##method(u16) \
-DEFINE_FETCH_##method(u32) \
-DEFINE_FETCH_##method(u64)
-
-#define CHECK_FETCH_FUNCS(method, fn) \
- (((FETCH_FUNC_NAME(method, u8) == fn) || \
- (FETCH_FUNC_NAME(method, u16) == fn) || \
- (FETCH_FUNC_NAME(method, u32) == fn) || \
- (FETCH_FUNC_NAME(method, u64) == fn) || \
- (FETCH_FUNC_NAME(method, string) == fn) || \
- (FETCH_FUNC_NAME(method, string_size) == fn)) \
- && (fn != NULL))
-
-/* Data fetch function templates */
-#define DEFINE_FETCH_reg(type) \
-static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
-{ \
- *(type *)dest = (type)regs_get_register(regs, \
- (unsigned int)((unsigned long)offset)); \
-}
-DEFINE_BASIC_FETCH_FUNCS(reg)
-/* No string on the register */
-#define fetch_reg_string NULL
-#define fetch_reg_string_size NULL
-
-#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
- void *offset, void *dest) \
-{ \
- *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
- (unsigned int)((unsigned long)offset)); \
-}
-DEFINE_BASIC_FETCH_FUNCS(stack)
-/* No string on the stack entry */
-#define fetch_stack_string NULL
-#define fetch_stack_string_size NULL
-
-#define DEFINE_FETCH_retval(type) \
-static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
- void *dummy, void *dest) \
-{ \
- *(type *)dest = (type)regs_return_value(regs); \
-}
-DEFINE_BASIC_FETCH_FUNCS(retval)
-/* No string on the retval */
-#define fetch_retval_string NULL
-#define fetch_retval_string_size NULL
-
-#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
- void *addr, void *dest) \
-{ \
- type retval; \
- if (probe_kernel_address(addr, retval)) \
- *(type *)dest = 0; \
- else \
- *(type *)dest = retval; \
-}
-DEFINE_BASIC_FETCH_FUNCS(memory)
-/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
- * length and relative data location.
- */
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
-{
- long ret;
- int maxlen = get_rloc_len(*(u32 *)dest);
- u8 *dst = get_rloc_data(dest);
- u8 *src = addr;
- mm_segment_t old_fs = get_fs();
- if (!maxlen)
- return;
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- set_fs(KERNEL_DS);
- pagefault_disable();
- do
- ret = __copy_from_user_inatomic(dst++, src++, 1);
- while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
- *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
- } else
- *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
- get_rloc_offs(*(u32 *)dest));
-}
-/* Return the length of string -- including null terminal byte */
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
-{
- int ret, len = 0;
- u8 c;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- do {
- ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
- len++;
- } while (c && ret == 0 && len < MAX_STRING_SIZE);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret < 0) /* Failed to check the length */
- *(u32 *)dest = 0;
- else
- *(u32 *)dest = len;
-}
-
-/* Memory fetching by symbol */
-struct symbol_cache {
- char *symbol;
- long offset;
- unsigned long addr;
-};
-
-static unsigned long update_symbol_cache(struct symbol_cache *sc)
-{
- sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
- if (sc->addr)
- sc->addr += sc->offset;
- return sc->addr;
-}
-
-static void free_symbol_cache(struct symbol_cache *sc)
-{
- kfree(sc->symbol);
- kfree(sc);
-}
-
-static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
-{
- struct symbol_cache *sc;
-
- if (!sym || strlen(sym) == 0)
- return NULL;
- sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
- if (!sc)
- return NULL;
-
- sc->symbol = kstrdup(sym, GFP_KERNEL);
- if (!sc->symbol) {
- kfree(sc);
- return NULL;
- }
- sc->offset = offset;
- update_symbol_cache(sc);
- return sc;
-}
-
-#define DEFINE_FETCH_symbol(type) \
-static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
- void *data, void *dest) \
-{ \
- struct symbol_cache *sc = data; \
- if (sc->addr) \
- fetch_memory_##type(regs, (void *)sc->addr, dest); \
- else \
- *(type *)dest = 0; \
-}
-DEFINE_BASIC_FETCH_FUNCS(symbol)
-DEFINE_FETCH_symbol(string)
-DEFINE_FETCH_symbol(string_size)
-
-/* Dereference memory access function */
-struct deref_fetch_param {
- struct fetch_param orig;
- long offset;
-};
-
-#define DEFINE_FETCH_deref(type) \
-static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
- void *data, void *dest) \
-{ \
- struct deref_fetch_param *dprm = data; \
- unsigned long addr; \
- call_fetch(&dprm->orig, regs, &addr); \
- if (addr) { \
- addr += dprm->offset; \
- fetch_memory_##type(regs, (void *)addr, dest); \
- } else \
- *(type *)dest = 0; \
-}
-DEFINE_BASIC_FETCH_FUNCS(deref)
-DEFINE_FETCH_deref(string)
-DEFINE_FETCH_deref(string_size)
-
-static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-
-static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
-{
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
- kfree(data);
-}
-
-/* Bitfield fetch function */
-struct bitfield_fetch_param {
- struct fetch_param orig;
- unsigned char hi_shift;
- unsigned char low_shift;
-};
+#include "trace_probe.h"
-#define DEFINE_FETCH_bitfield(type) \
-static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
- void *data, void *dest) \
-{ \
- struct bitfield_fetch_param *bprm = data; \
- type buf = 0; \
- call_fetch(&bprm->orig, regs, &buf); \
- if (buf) { \
- buf <<= bprm->hi_shift; \
- buf >>= bprm->low_shift; \
- } \
- *(type *)dest = buf; \
-}
-DEFINE_BASIC_FETCH_FUNCS(bitfield)
-#define fetch_bitfield_string NULL
-#define fetch_bitfield_string_size NULL
-
-static __kprobes void
-update_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- update_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- update_symbol_cache(data->orig.data);
-}
-
-static __kprobes void
-free_bitfield_fetch_param(struct bitfield_fetch_param *data)
-{
- /*
- * Don't check the bitfield itself, because this must be the
- * last fetch function.
- */
- if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
- free_deref_fetch_param(data->orig.data);
- else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
- free_symbol_cache(data->orig.data);
- kfree(data);
-}
-
-/* Default (unsigned long) fetch type */
-#define __DEFAULT_FETCH_TYPE(t) u##t
-#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
-#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
-#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
-
-/* Fetch types */
-enum {
- FETCH_MTD_reg = 0,
- FETCH_MTD_stack,
- FETCH_MTD_retval,
- FETCH_MTD_memory,
- FETCH_MTD_symbol,
- FETCH_MTD_deref,
- FETCH_MTD_bitfield,
- FETCH_MTD_END,
-};
-
-#define ASSIGN_FETCH_FUNC(method, type) \
- [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
-
-#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- {.name = _name, \
- .size = _size, \
- .is_signed = sign, \
- .print = PRINT_TYPE_FUNC_NAME(ptype), \
- .fmt = PRINT_TYPE_FMT_NAME(ptype), \
- .fmttype = _fmttype, \
- .fetch = { \
-ASSIGN_FETCH_FUNC(reg, ftype), \
-ASSIGN_FETCH_FUNC(stack, ftype), \
-ASSIGN_FETCH_FUNC(retval, ftype), \
-ASSIGN_FETCH_FUNC(memory, ftype), \
-ASSIGN_FETCH_FUNC(symbol, ftype), \
-ASSIGN_FETCH_FUNC(deref, ftype), \
-ASSIGN_FETCH_FUNC(bitfield, ftype), \
- } \
- }
-
-#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
- __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
-
-#define FETCH_TYPE_STRING 0
-#define FETCH_TYPE_STRSIZE 1
-
-/* Fetch type information table */
-static const struct fetch_type {
- const char *name; /* Name of type */
- size_t size; /* Byte size of type */
- int is_signed; /* Signed flag */
- print_type_func_t print; /* Print functions */
- const char *fmt; /* Fromat string */
- const char *fmttype; /* Name in format file */
- /* Fetch functions */
- fetch_func_t fetch[FETCH_MTD_END];
-} fetch_type_table[] = {
- /* Special types */
- [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
- sizeof(u32), 1, "__data_loc char[]"),
- [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
- string_size, sizeof(u32), 0, "u32"),
- /* Basic types */
- ASSIGN_FETCH_TYPE(u8, u8, 0),
- ASSIGN_FETCH_TYPE(u16, u16, 0),
- ASSIGN_FETCH_TYPE(u32, u32, 0),
- ASSIGN_FETCH_TYPE(u64, u64, 0),
- ASSIGN_FETCH_TYPE(s8, u8, 1),
- ASSIGN_FETCH_TYPE(s16, u16, 1),
- ASSIGN_FETCH_TYPE(s32, u32, 1),
- ASSIGN_FETCH_TYPE(s64, u64, 1),
-};
-
-static const struct fetch_type *find_fetch_type(const char *type)
-{
- int i;
-
- if (!type)
- type = DEFAULT_FETCH_TYPE_STR;
-
- /* Special case: bitfield */
- if (*type == 'b') {
- unsigned long bs;
- type = strchr(type, '/');
- if (!type)
- goto fail;
- type++;
- if (strict_strtoul(type, 0, &bs))
- goto fail;
- switch (bs) {
- case 8:
- return find_fetch_type("u8");
- case 16:
- return find_fetch_type("u16");
- case 32:
- return find_fetch_type("u32");
- case 64:
- return find_fetch_type("u64");
- default:
- goto fail;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
- if (strcmp(type, fetch_type_table[i].name) == 0)
- return &fetch_type_table[i];
-fail:
- return NULL;
-}
-
-/* Special function : only accept unsigned long */
-static __kprobes void fetch_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
-{
- *(unsigned long *)dest = kernel_stack_pointer(regs);
-}
-
-static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
- fetch_func_t orig_fn)
-{
- int i;
-
- if (type != &fetch_type_table[FETCH_TYPE_STRING])
- return NULL; /* Only string type needs size function */
- for (i = 0; i < FETCH_MTD_END; i++)
- if (type->fetch[i] == orig_fn)
- return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
-
- WARN_ON(1); /* This should not happen */
- return NULL;
-}
+#define KPROBE_EVENT_SYSTEM "kprobes"
/**
* Kprobe event core functions
*/
-struct probe_arg {
- struct fetch_param fetch;
- struct fetch_param fetch_size;
- unsigned int offset; /* Offset from argument entry */
- const char *name; /* Name of this argument */
- const char *comm; /* Command of this argument */
- const struct fetch_type *type; /* Type of this argument */
-};
-
-/* Flags for trace_probe */
-#define TP_FLAG_TRACE 1
-#define TP_FLAG_PROFILE 2
-#define TP_FLAG_REGISTERED 4
-
struct trace_probe {
struct list_head list;
struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
static int kretprobe_dispatcher(struct kretprobe_instance *ri,
struct pt_regs *regs);
-/* Check the name is good for event/group/fields */
-static int is_good_name(const char *name)
-{
- if (!isalpha(*name) && *name != '_')
- return 0;
- while (*++name != '\0') {
- if (!isalpha(*name) && !isdigit(*name) && *name != '_')
- return 0;
- }
- return 1;
-}
-
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
void *addr,
const char *symbol,
unsigned long offs,
- int nargs, int is_return)
+ int nargs, bool is_return)
{
struct trace_probe *tp;
int ret = -ENOMEM;
@@ -702,34 +158,12 @@ error:
return ERR_PTR(ret);
}
-static void update_probe_arg(struct probe_arg *arg)
-{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- update_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- update_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- update_symbol_cache(arg->fetch.data);
-}
-
-static void free_probe_arg(struct probe_arg *arg)
-{
- if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
- free_bitfield_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
- free_deref_fetch_param(arg->fetch.data);
- else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
- free_symbol_cache(arg->fetch.data);
- kfree(arg->name);
- kfree(arg->comm);
-}
-
static void free_trace_probe(struct trace_probe *tp)
{
int i;
for (i = 0; i < tp->nr_args; i++)
- free_probe_arg(&tp->args[i]);
+ traceprobe_free_probe_arg(&tp->args[i]);
kfree(tp->call.class->system);
kfree(tp->call.name);
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp)
return -EINVAL;
for (i = 0; i < tp->nr_args; i++)
- update_probe_arg(&tp->args[i]);
+ traceprobe_update_arg(&tp->args[i]);
/* Set/clear disabled flag according to tp->flag */
if (trace_probe_is_enabled(tp))
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
-/* Split symbol and offset. */
-static int split_symbol_offset(char *symbol, unsigned long *offset)
-{
- char *tmp;
- int ret;
-
- if (!offset)
- return -EINVAL;
-
- tmp = strchr(symbol, '+');
- if (tmp) {
- /* skip sign because strict_strtol doesn't accept '+' */
- ret = strict_strtoul(tmp + 1, 0, offset);
- if (ret)
- return ret;
- *tmp = '\0';
- } else
- *offset = 0;
- return 0;
-}
-
-#define PARAM_MAX_ARGS 16
-#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
-
-static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_param *f, int is_return)
-{
- int ret = 0;
- unsigned long param;
-
- if (strcmp(arg, "retval") == 0) {
- if (is_return)
- f->fn = t->fetch[FETCH_MTD_retval];
- else
- ret = -EINVAL;
- } else if (strncmp(arg, "stack", 5) == 0) {
- if (arg[5] == '\0') {
- if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
- f->fn = fetch_stack_address;
- else
- ret = -EINVAL;
- } else if (isdigit(arg[5])) {
- ret = strict_strtoul(arg + 5, 10, &param);
- if (ret || param > PARAM_MAX_STACK)
- ret = -EINVAL;
- else {
- f->fn = t->fetch[FETCH_MTD_stack];
- f->data = (void *)param;
- }
- } else
- ret = -EINVAL;
- } else
- ret = -EINVAL;
- return ret;
-}
-
-/* Recursive argument parser */
-static int __parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, int is_return)
-{
- int ret = 0;
- unsigned long param;
- long offset;
- char *tmp;
-
- switch (arg[0]) {
- case '$':
- ret = parse_probe_vars(arg + 1, t, f, is_return);
- break;
- case '%': /* named register */
- ret = regs_query_register_offset(arg + 1);
- if (ret >= 0) {
- f->fn = t->fetch[FETCH_MTD_reg];
- f->data = (void *)(unsigned long)ret;
- ret = 0;
- }
- break;
- case '@': /* memory or symbol */
- if (isdigit(arg[1])) {
- ret = strict_strtoul(arg + 1, 0, &param);
- if (ret)
- break;
- f->fn = t->fetch[FETCH_MTD_memory];
- f->data = (void *)param;
- } else {
- ret = split_symbol_offset(arg + 1, &offset);
- if (ret)
- break;
- f->data = alloc_symbol_cache(arg + 1, offset);
- if (f->data)
- f->fn = t->fetch[FETCH_MTD_symbol];
- }
- break;
- case '+': /* deref memory */
- arg++; /* Skip '+', because strict_strtol() rejects it. */
- case '-':
- tmp = strchr(arg, '(');
- if (!tmp)
- break;
- *tmp = '\0';
- ret = strict_strtol(arg, 0, &offset);
- if (ret)
- break;
- arg = tmp + 1;
- tmp = strrchr(arg, ')');
- if (tmp) {
- struct deref_fetch_param *dprm;
- const struct fetch_type *t2 = find_fetch_type(NULL);
- *tmp = '\0';
- dprm = kzalloc(sizeof(struct deref_fetch_param),
- GFP_KERNEL);
- if (!dprm)
- return -ENOMEM;
- dprm->offset = offset;
- ret = __parse_probe_arg(arg, t2, &dprm->orig,
- is_return);
- if (ret)
- kfree(dprm);
- else {
- f->fn = t->fetch[FETCH_MTD_deref];
- f->data = (void *)dprm;
- }
- }
- break;
- }
- if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
- pr_info("%s type has no corresponding fetch method.\n",
- t->name);
- ret = -EINVAL;
- }
- return ret;
-}
-
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
-/* Bitfield type needs to be parsed into a fetch function */
-static int __parse_bitfield_probe_arg(const char *bf,
- const struct fetch_type *t,
- struct fetch_param *f)
-{
- struct bitfield_fetch_param *bprm;
- unsigned long bw, bo;
- char *tail;
-
- if (*bf != 'b')
- return 0;
-
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- return -ENOMEM;
- bprm->orig = *f;
- f->fn = t->fetch[FETCH_MTD_bitfield];
- f->data = (void *)bprm;
-
- bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
- if (bw == 0 || *tail != '@')
- return -EINVAL;
-
- bf = tail + 1;
- bo = simple_strtoul(bf, &tail, 0);
- if (tail == bf || *tail != '/')
- return -EINVAL;
-
- bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
- bprm->low_shift = bprm->hi_shift + bo;
- return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
-}
-
-/* String length checking wrapper */
-static int parse_probe_arg(char *arg, struct trace_probe *tp,
- struct probe_arg *parg, int is_return)
-{
- const char *t;
- int ret;
-
- if (strlen(arg) > MAX_ARGSTR_LEN) {
- pr_info("Argument is too long.: %s\n", arg);
- return -ENOSPC;
- }
- parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- pr_info("Failed to allocate memory for command '%s'.\n", arg);
- return -ENOMEM;
- }
- t = strchr(parg->comm, ':');
- if (t) {
- arg[t - parg->comm] = '\0';
- t++;
- }
- parg->type = find_fetch_type(t);
- if (!parg->type) {
- pr_info("Unsupported type: %s\n", t);
- return -EINVAL;
- }
- parg->offset = tp->size;
- tp->size += parg->type->size;
- ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
- if (ret >= 0 && t != NULL)
- ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
- if (ret >= 0) {
- parg->fetch_size.fn = get_fetch_size_function(parg->type,
- parg->fetch.fn);
- parg->fetch_size.data = parg->fetch.data;
- }
- return ret;
-}
-
-/* Return 1 if name is reserved or already used by another argument */
-static int conflict_field_name(const char *name,
- struct probe_arg *args, int narg)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
- if (strcmp(reserved_field_names[i], name) == 0)
- return 1;
- for (i = 0; i < narg; i++)
- if (strcmp(args[i].name, name) == 0)
- return 1;
- return 0;
-}
-
static int create_trace_probe(int argc, char **argv)
{
/*
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv)
*/
struct trace_probe *tp;
int i, ret = 0;
- int is_return = 0, is_delete = 0;
+ bool is_return = false, is_delete = false;
char *symbol = NULL, *event = NULL, *group = NULL;
char *arg;
unsigned long offset = 0;
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv)
/* argc must be >= 1 */
if (argv[0][0] == 'p')
- is_return = 0;
+ is_return = false;
else if (argv[0][0] == 'r')
- is_return = 1;
+ is_return = true;
else if (argv[0][0] == '-')
- is_delete = 1;
+ is_delete = true;
else {
pr_info("Probe definition must be started with 'p', 'r' or"
" '-'.\n");
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv)
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
- ret = split_symbol_offset(symbol, &offset);
+ ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
pr_info("Failed to parse symbol.\n");
return ret;
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv)
goto error;
}
- if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+ if (traceprobe_conflict_field_name(tp->args[i].name,
+ tp->args, i)) {
pr_info("Argument[%d] name '%s' conflicts with "
"another field.\n", i, argv[i]);
ret = -EINVAL;
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv)
}
/* Parse fetch argument */
- ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
+ ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i],
+ is_return, true);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file)
return seq_open(file, &probes_seq_op);
}
-static int command_trace_probe(const char *buf)
-{
- char **argv;
- int argc = 0, ret = 0;
-
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = create_trace_probe(argc, argv);
-
- argv_free(argv);
- return ret;
-}
-
-#define WRITE_BUFSIZE 4096
-
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char *kbuf, *tmp;
- int ret;
- size_t done;
- size_t size;
-
- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- ret = done = 0;
- while (done < count) {
- size = count - done;
- if (size >= WRITE_BUFSIZE)
- size = WRITE_BUFSIZE - 1;
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
- kbuf[size] = '\0';
- tmp = strchr(kbuf, '\n');
- if (tmp) {
- *tmp = '\0';
- size = tmp - kbuf + 1;
- } else if (done + size < count) {
- pr_warning("Line length is too long: "
- "Should be less than %d.", WRITE_BUFSIZE);
- ret = -EINVAL;
- goto out;
- }
- done += size;
- /* Remove comments */
- tmp = strchr(kbuf, '#');
- if (tmp)
- *tmp = '\0';
-
- ret = command_trace_probe(kbuf);
- if (ret)
- goto out;
- }
- ret = done;
-out:
- kfree(kbuf);
- return ret;
+ return traceprobe_probes_write(file, buffer, count, ppos,
+ create_trace_probe);
}
static const struct file_operations kprobe_events_ops = {
@@ -1711,16 +867,6 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-#undef DEFINE_FIELD
-#define DEFINE_FIELD(type, item, name, is_signed) \
- do { \
- ret = trace_define_field(event_call, #type, name, \
- offsetof(typeof(field), item), \
- sizeof(field.item), is_signed, \
- FILTER_OTHER); \
- if (ret) \
- return ret; \
- } while (0)
static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
{
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)");
+ ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_probe);
if (WARN_ON_ONCE(ret)) {
pr_warning("error on probing function entry.\n");
warn++;
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void)
enable_trace_probe(tp, TP_FLAG_TRACE);
}
- ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
- "$retval");
+ ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
pr_warning("error on probing function return.\n");
warn++;
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void)
} else
disable_trace_probe(tp, TP_FLAG_TRACE);
- ret = command_trace_probe("-:testprobe");
+ ret = traceprobe_command("-:testprobe", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
pr_warning("error on deleting a probe.\n");
warn++;
}
- ret = command_trace_probe("-:testprobe2");
+ ret = traceprobe_command("-:testprobe2", create_trace_probe);
if (WARN_ON_ONCE(ret)) {
pr_warning("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6fd4ffd042f9..a9077c1b4ad3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
const char **iter;
char *fmt;
+ /* allocate the trace_printk per cpu buffers */
+ if (start != end)
+ trace_printk_init_buffers();
+
mutex_lock(&btrace_mutex);
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 000000000000..daa9980153af
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,839 @@
+/*
+ * Common code for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include "trace_probe.h"
+
+const char *reserved_field_names[] = {
+ "common_type",
+ "common_flags",
+ "common_preempt_count",
+ "common_pid",
+ "common_tgid",
+ FIELD_STRING_IP,
+ FIELD_STRING_RETIP,
+ FIELD_STRING_FUNC,
+};
+
+/* Printing function type */
+#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
+#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
+static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
+ const char *name, \
+ void *data, void *ent)\
+{ \
+ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
+} \
+static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+
+static inline void *get_rloc_data(u32 *dl)
+{
+ return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static inline void *get_loc_data(u32 *dl, void *ent)
+{
+ return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+/* Print type function for string type */
+static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
+ const char *name,
+ void *data, void *ent)
+{
+ int len = *(u32 *)data >> 16;
+
+ if (!len)
+ return trace_seq_printf(s, " %s=(fault)", name);
+ else
+ return trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+}
+
+static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
+#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8) \
+DEFINE_FETCH_##method(u16) \
+DEFINE_FETCH_##method(u32) \
+DEFINE_FETCH_##method(u64)
+
+#define CHECK_FETCH_FUNCS(method, fn) \
+ (((FETCH_FUNC_NAME(method, u8) == fn) || \
+ (FETCH_FUNC_NAME(method, u16) == fn) || \
+ (FETCH_FUNC_NAME(method, u32) == fn) || \
+ (FETCH_FUNC_NAME(method, u64) == fn) || \
+ (FETCH_FUNC_NAME(method, string) == fn) || \
+ (FETCH_FUNC_NAME(method, string_size) == fn)) \
+ && (fn != NULL))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type) \
+static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_register(regs, \
+ (unsigned int)((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+#define DEFINE_FETCH_stack(type) \
+static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
+ (unsigned int)((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+
+#define DEFINE_FETCH_retval(type) \
+static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
+ void *dummy, void *dest) \
+{ \
+ *(type *)dest = (type)regs_return_value(regs); \
+}
+DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+#define DEFINE_FETCH_memory(type) \
+static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ if (probe_kernel_address(addr, retval)) \
+ *(type *)dest = 0; \
+ else \
+ *(type *)dest = retval; \
+}
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ u8 *src = addr;
+ mm_segment_t old_fs = get_fs();
+
+ if (!maxlen)
+ return;
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+
+ do
+ ret = __copy_from_user_inatomic(dst++, src++, 1);
+ while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+
+ dst[-1] = '\0';
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+ } else {
+ *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+ get_rloc_offs(*(u32 *)dest));
+ }
+}
+
+/* Return the length of string -- including null terminal byte */
+static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ mm_segment_t old_fs;
+ int ret, len = 0;
+ u8 c;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+
+ do {
+ ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) /* Failed to check the length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+
+ if (sc->addr)
+ sc->addr += sc->offset;
+
+ return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+ struct symbol_cache *sc;
+
+ if (!sym || strlen(sym) == 0)
+ return NULL;
+
+ sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+ if (!sc)
+ return NULL;
+
+ sc->symbol = kstrdup(sym, GFP_KERNEL);
+ if (!sc->symbol) {
+ kfree(sc);
+ return NULL;
+ }
+ sc->offset = offset;
+ update_symbol_cache(sc);
+
+ return sc;
+}
+
+#define DEFINE_FETCH_symbol(type) \
+static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct symbol_cache *sc = data; \
+ if (sc->addr) \
+ fetch_memory_##type(regs, (void *)sc->addr, dest); \
+ else \
+ *(type *)dest = 0; \
+}
+DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
+
+/* Dereference memory access function */
+struct deref_fetch_param {
+ struct fetch_param orig;
+ long offset;
+};
+
+#define DEFINE_FETCH_deref(type) \
+static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct deref_fetch_param *dprm = data; \
+ unsigned long addr; \
+ call_fetch(&dprm->orig, regs, &addr); \
+ if (addr) { \
+ addr += dprm->offset; \
+ fetch_memory_##type(regs, (void *)addr, dest); \
+ } else \
+ *(type *)dest = 0; \
+}
+DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+DEFINE_FETCH_deref(string_size)
+
+static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+
+static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+ struct fetch_param orig;
+ unsigned char hi_shift;
+ unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type) \
+static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct bitfield_fetch_param *bprm = data; \
+ type buf = 0; \
+ call_fetch(&bprm->orig, regs, &buf); \
+ if (buf) { \
+ buf <<= bprm->hi_shift; \
+ buf >>= bprm->low_shift; \
+ } \
+ *(type *)dest = buf; \
+}
+
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static __kprobes void
+update_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+
+static __kprobes void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+
+ kfree(data);
+}
+
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+#define ASSIGN_FETCH_FUNC(method, type) \
+ [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ {.name = _name, \
+ .size = _size, \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ .fmttype = _fmttype, \
+ .fetch = { \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ASSIGN_FETCH_FUNC(bitfield, ftype), \
+ } \
+ }
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
+/* Fetch type information table */
+static const struct fetch_type fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+};
+
+static const struct fetch_type *find_fetch_type(const char *type)
+{
+ int i;
+
+ if (!type)
+ type = DEFAULT_FETCH_TYPE_STR;
+
+ /* Special case: bitfield */
+ if (*type == 'b') {
+ unsigned long bs;
+
+ type = strchr(type, '/');
+ if (!type)
+ goto fail;
+
+ type++;
+ if (strict_strtoul(type, 0, &bs))
+ goto fail;
+
+ switch (bs) {
+ case 8:
+ return find_fetch_type("u8");
+ case 16:
+ return find_fetch_type("u16");
+ case 32:
+ return find_fetch_type("u32");
+ case 64:
+ return find_fetch_type("u64");
+ default:
+ goto fail;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
+ if (strcmp(type, fetch_type_table[i].name) == 0)
+ return &fetch_type_table[i];
+
+fail:
+ return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static __kprobes void fetch_stack_address(struct pt_regs *regs,
+ void *dummy, void *dest)
+{
+ *(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+ fetch_func_t orig_fn)
+{
+ int i;
+
+ if (type != &fetch_type_table[FETCH_TYPE_STRING])
+ return NULL; /* Only string type needs size function */
+
+ for (i = 0; i < FETCH_MTD_END; i++)
+ if (type->fetch[i] == orig_fn)
+ return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
+
+ WARN_ON(1); /* This should not happen */
+
+ return NULL;
+}
+
+/* Split symbol and offset. */
+int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because strict_strtol doesn't accept '+' */
+ ret = strict_strtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+
+ *tmp = '\0';
+ } else
+ *offset = 0;
+
+ return 0;
+}
+
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return)
+ f->fn = t->fetch[FETCH_MTD_retval];
+ else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
+ f->fn = fetch_stack_address;
+ else
+ ret = -EINVAL;
+ } else if (isdigit(arg[5])) {
+ ret = strict_strtoul(arg + 5, 10, &param);
+ if (ret || param > PARAM_MAX_STACK)
+ ret = -EINVAL;
+ else {
+ f->fn = t->fetch[FETCH_MTD_stack];
+ f->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/* Recursive argument parser */
+static int parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return, bool is_kprobe)
+{
+ unsigned long param;
+ long offset;
+ char *tmp;
+ int ret;
+
+ ret = 0;
+
+ /* Until uprobe_events supports only reg arguments */
+ if (!is_kprobe && arg[0] != '%')
+ return -EINVAL;
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, t, f, is_return);
+ break;
+
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ f->fn = t->fetch[FETCH_MTD_reg];
+ f->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+
+ case '@': /* memory or symbol */
+ if (isdigit(arg[1])) {
+ ret = strict_strtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_memory];
+ f->data = (void *)param;
+ } else {
+ ret = traceprobe_split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->fetch[FETCH_MTD_symbol];
+ }
+ break;
+
+ case '+': /* deref memory */
+ arg++; /* Skip '+', because strict_strtol() rejects it. */
+ case '-':
+ tmp = strchr(arg, '(');
+ if (!tmp)
+ break;
+
+ *tmp = '\0';
+ ret = strict_strtol(arg, 0, &offset);
+
+ if (ret)
+ break;
+
+ arg = tmp + 1;
+ tmp = strrchr(arg, ')');
+
+ if (tmp) {
+ struct deref_fetch_param *dprm;
+ const struct fetch_type *t2;
+
+ t2 = find_fetch_type(NULL);
+ *tmp = '\0';
+ dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
+
+ if (!dprm)
+ return -ENOMEM;
+
+ dprm->offset = offset;
+ ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
+ is_kprobe);
+ if (ret)
+ kfree(dprm);
+ else {
+ f->fn = t->fetch[FETCH_MTD_deref];
+ f->data = (void *)dprm;
+ }
+ }
+ break;
+ }
+ if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
+ pr_info("%s type has no corresponding fetch method.\n", t->name);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+ const struct fetch_type *t,
+ struct fetch_param *f)
+{
+ struct bitfield_fetch_param *bprm;
+ unsigned long bw, bo;
+ char *tail;
+
+ if (*bf != 'b')
+ return 0;
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm)
+ return -ENOMEM;
+
+ bprm->orig = *f;
+ f->fn = t->fetch[FETCH_MTD_bitfield];
+ f->data = (void *)bprm;
+ bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
+
+ if (bw == 0 || *tail != '@')
+ return -EINVAL;
+
+ bf = tail + 1;
+ bo = simple_strtoul(bf, &tail, 0);
+
+ if (tail == bf || *tail != '/')
+ return -EINVAL;
+
+ bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+ bprm->low_shift = bprm->hi_shift + bo;
+
+ return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
+/* String length checking wrapper */
+int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe)
+{
+ const char *t;
+ int ret;
+
+ if (strlen(arg) > MAX_ARGSTR_LEN) {
+ pr_info("Argument is too long.: %s\n", arg);
+ return -ENOSPC;
+ }
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ return -ENOMEM;
+ }
+ t = strchr(parg->comm, ':');
+ if (t) {
+ arg[t - parg->comm] = '\0';
+ t++;
+ }
+ parg->type = find_fetch_type(t);
+ if (!parg->type) {
+ pr_info("Unsupported type: %s\n", t);
+ return -EINVAL;
+ }
+ parg->offset = *size;
+ *size += parg->type->size;
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
+
+ if (ret >= 0 && t != NULL)
+ ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
+
+ if (ret >= 0) {
+ parg->fetch_size.fn = get_fetch_size_function(parg->type,
+ parg->fetch.fn);
+ parg->fetch_size.data = parg->fetch.data;
+ }
+
+ return ret;
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+ if (strcmp(reserved_field_names[i], name) == 0)
+ return 1;
+
+ for (i = 0; i < narg; i++)
+ if (strcmp(args[i].name, name) == 0)
+ return 1;
+
+ return 0;
+}
+
+void traceprobe_update_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ update_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ update_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ update_symbol_cache(arg->fetch.data);
+}
+
+void traceprobe_free_probe_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ free_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ free_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ free_symbol_cache(arg->fetch.data);
+
+ kfree(arg->name);
+ kfree(arg->comm);
+}
+
+int traceprobe_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ tmp = strchr(kbuf, '\n');
+
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - kbuf + 1;
+ } else if (done + size < count) {
+ pr_warning("Line length is too long: "
+ "Should be less than %d.", WRITE_BUFSIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+ done += size;
+ /* Remove comments */
+ tmp = strchr(kbuf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = traceprobe_command(kbuf, createfn);
+ if (ret)
+ goto out;
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 000000000000..933708677814
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,161 @@
+/*
+ * Common header file for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.h written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/stringify.h>
+#include <linux/limits.h>
+#include <linux/uaccess.h>
+#include <asm/bitsperlong.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE 1
+#define TP_FLAG_PROFILE 2
+#define TP_FLAG_REGISTERED 4
+#define TP_FLAG_UPROBE 8
+
+
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs) \
+ (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl) ((u32)(dl) >> 16)
+#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
+
+/*
+ * Convert data_rloc to data_loc:
+ * data_rloc stores the offset from data_rloc itself, but data_loc
+ * stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
+
+/* Fetch types */
+enum {
+ FETCH_MTD_reg = 0,
+ FETCH_MTD_stack,
+ FETCH_MTD_retval,
+ FETCH_MTD_memory,
+ FETCH_MTD_symbol,
+ FETCH_MTD_deref,
+ FETCH_MTD_bitfield,
+ FETCH_MTD_END,
+};
+
+/* Fetch type information table */
+struct fetch_type {
+ const char *name; /* Name of type */
+ size_t size; /* Byte size of type */
+ int is_signed; /* Signed flag */
+ print_type_func_t print; /* Print functions */
+ const char *fmt; /* Fromat string */
+ const char *fmttype; /* Name in format file */
+ /* Fetch functions */
+ fetch_func_t fetch[FETCH_MTD_END];
+};
+
+struct fetch_param {
+ fetch_func_t fn;
+ void *data;
+};
+
+struct probe_arg {
+ struct fetch_param fetch;
+ struct fetch_param fetch_size;
+ unsigned int offset; /* Offset from argument entry */
+ const char *name; /* Name of this argument */
+ const char *comm; /* Command of this argument */
+ const struct fetch_type *type; /* Type of this argument */
+};
+
+static inline __kprobes void call_fetch(struct fetch_param *fprm,
+ struct pt_regs *regs, void *dest)
+{
+ return fprm->fn(regs, fprm->data, dest);
+}
+
+/* Check the name is good for event/group/fields */
+static inline int is_good_name(const char *name)
+{
+ if (!isalpha(*name) && *name != '_')
+ return 0;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return 0;
+ }
+ return 1;
+}
+
+extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe);
+
+extern int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg);
+
+extern void traceprobe_update_arg(struct probe_arg *arg);
+extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+
+extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+
+extern ssize_t traceprobe_probes_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
+extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 000000000000..2b36ac68549e
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,788 @@
+/*
+ * uprobes-based tracing events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corporation, 2010-2012
+ * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include <linux/namei.h>
+
+#include "trace_probe.h"
+
+#define UPROBE_EVENT_SYSTEM "uprobes"
+
+/*
+ * uprobe event core functions
+ */
+struct trace_uprobe;
+struct uprobe_trace_consumer {
+ struct uprobe_consumer cons;
+ struct trace_uprobe *tu;
+};
+
+struct trace_uprobe {
+ struct list_head list;
+ struct ftrace_event_class class;
+ struct ftrace_event_call call;
+ struct uprobe_trace_consumer *consumer;
+ struct inode *inode;
+ char *filename;
+ unsigned long offset;
+ unsigned long nhit;
+ unsigned int flags; /* For TP_FLAG_* */
+ ssize_t size; /* trace entry size */
+ unsigned int nr_args;
+ struct probe_arg args[];
+};
+
+#define SIZEOF_TRACE_UPROBE(n) \
+ (offsetof(struct trace_uprobe, args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+static int register_uprobe_event(struct trace_uprobe *tu);
+static void unregister_uprobe_event(struct trace_uprobe *tu);
+
+static DEFINE_MUTEX(uprobe_lock);
+static LIST_HEAD(uprobe_list);
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+
+/*
+ * Allocate new trace_uprobe and initialize it (including uprobes).
+ */
+static struct trace_uprobe *
+alloc_trace_uprobe(const char *group, const char *event, int nargs)
+{
+ struct trace_uprobe *tu;
+
+ if (!event || !is_good_name(event))
+ return ERR_PTR(-EINVAL);
+
+ if (!group || !is_good_name(group))
+ return ERR_PTR(-EINVAL);
+
+ tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ if (!tu)
+ return ERR_PTR(-ENOMEM);
+
+ tu->call.class = &tu->class;
+ tu->call.name = kstrdup(event, GFP_KERNEL);
+ if (!tu->call.name)
+ goto error;
+
+ tu->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tu->class.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tu->list);
+ return tu;
+
+error:
+ kfree(tu->call.name);
+ kfree(tu);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_trace_uprobe(struct trace_uprobe *tu)
+{
+ int i;
+
+ for (i = 0; i < tu->nr_args; i++)
+ traceprobe_free_probe_arg(&tu->args[i]);
+
+ iput(tu->inode);
+ kfree(tu->call.class->system);
+ kfree(tu->call.name);
+ kfree(tu->filename);
+ kfree(tu);
+}
+
+static struct trace_uprobe *find_probe_event(const char *event, const char *group)
+{
+ struct trace_uprobe *tu;
+
+ list_for_each_entry(tu, &uprobe_list, list)
+ if (strcmp(tu->call.name, event) == 0 &&
+ strcmp(tu->call.class->system, group) == 0)
+ return tu;
+
+ return NULL;
+}
+
+/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
+static void unregister_trace_uprobe(struct trace_uprobe *tu)
+{
+ list_del(&tu->list);
+ unregister_uprobe_event(tu);
+ free_trace_uprobe(tu);
+}
+
+/* Register a trace_uprobe and probe_event */
+static int register_trace_uprobe(struct trace_uprobe *tu)
+{
+ struct trace_uprobe *old_tp;
+ int ret;
+
+ mutex_lock(&uprobe_lock);
+
+ /* register as an event */
+ old_tp = find_probe_event(tu->call.name, tu->call.class->system);
+ if (old_tp)
+ /* delete old event */
+ unregister_trace_uprobe(old_tp);
+
+ ret = register_uprobe_event(tu);
+ if (ret) {
+ pr_warning("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ list_add_tail(&tu->list, &uprobe_list);
+
+end:
+ mutex_unlock(&uprobe_lock);
+
+ return ret;
+}
+
+/*
+ * Argument syntax:
+ * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
+ *
+ * - Remove uprobe: -:[GRP/]EVENT
+ */
+static int create_trace_uprobe(int argc, char **argv)
+{
+ struct trace_uprobe *tu;
+ struct inode *inode;
+ char *arg, *event, *group, *filename;
+ char buf[MAX_EVENT_NAME_LEN];
+ struct path path;
+ unsigned long offset;
+ bool is_delete;
+ int i, ret;
+
+ inode = NULL;
+ ret = 0;
+ is_delete = false;
+ event = NULL;
+ group = NULL;
+
+ /* argc must be >= 1 */
+ if (argv[0][0] == '-')
+ is_delete = true;
+ else if (argv[0][0] != 'p') {
+ pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ arg = strchr(event, '/');
+
+ if (arg) {
+ group = event;
+ event = arg + 1;
+ event[-1] = '\0';
+
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (!group)
+ group = UPROBE_EVENT_SYSTEM;
+
+ if (is_delete) {
+ if (!event) {
+ pr_info("Delete command needs an event name.\n");
+ return -EINVAL;
+ }
+ mutex_lock(&uprobe_lock);
+ tu = find_probe_event(event, group);
+
+ if (!tu) {
+ mutex_unlock(&uprobe_lock);
+ pr_info("Event %s/%s doesn't exist.\n", group, event);
+ return -ENOENT;
+ }
+ /* delete an event */
+ unregister_trace_uprobe(tu);
+ mutex_unlock(&uprobe_lock);
+ return 0;
+ }
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+ if (isdigit(argv[1][0])) {
+ pr_info("probe point must be have a filename.\n");
+ return -EINVAL;
+ }
+ arg = strchr(argv[1], ':');
+ if (!arg)
+ goto fail_address_parse;
+
+ *arg++ = '\0';
+ filename = argv[1];
+ ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto fail_address_parse;
+
+ ret = strict_strtoul(arg, 0, &offset);
+ if (ret)
+ goto fail_address_parse;
+
+ inode = igrab(path.dentry->d_inode);
+
+ argc -= 2;
+ argv += 2;
+
+ /* setup a probe */
+ if (!event) {
+ char *tail = strrchr(filename, '/');
+ char *ptr;
+
+ ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL);
+ if (!ptr) {
+ ret = -ENOMEM;
+ goto fail_address_parse;
+ }
+
+ tail = ptr;
+ ptr = strpbrk(tail, ".-_");
+ if (ptr)
+ *ptr = '\0';
+
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
+ event = buf;
+ kfree(tail);
+ }
+
+ tu = alloc_trace_uprobe(group, event, argc);
+ if (IS_ERR(tu)) {
+ pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
+ ret = PTR_ERR(tu);
+ goto fail_address_parse;
+ }
+ tu->offset = offset;
+ tu->inode = inode;
+ tu->filename = kstrdup(filename, GFP_KERNEL);
+
+ if (!tu->filename) {
+ pr_info("Failed to allocate filename.\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ /* Increment count for freeing args in error case */
+ tu->nr_args++;
+
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg) {
+ *arg++ = '\0';
+ tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+ } else {
+ arg = argv[i];
+ /* If argument name is omitted, set "argN" */
+ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+ tu->args[i].name = kstrdup(buf, GFP_KERNEL);
+ }
+
+ if (!tu->args[i].name) {
+ pr_info("Failed to allocate argument[%d] name.\n", i);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (!is_good_name(tu->args[i].name)) {
+ pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
+ pr_info("Argument[%d] name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
+ if (ret) {
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ goto error;
+ }
+ }
+
+ ret = register_trace_uprobe(tu);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_uprobe(tu);
+ return ret;
+
+fail_address_parse:
+ if (inode)
+ iput(inode);
+
+ pr_info("Failed to parse address.\n");
+
+ return ret;
+}
+
+static void cleanup_all_probes(void)
+{
+ struct trace_uprobe *tu;
+
+ mutex_lock(&uprobe_lock);
+ while (!list_empty(&uprobe_list)) {
+ tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
+ unregister_trace_uprobe(tu);
+ }
+ mutex_unlock(&uprobe_lock);
+}
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&uprobe_lock);
+ return seq_list_start(&uprobe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &uprobe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&uprobe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+ int i;
+
+ seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
+ seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+
+ for (i = 0; i < tu->nr_args; i++)
+ seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
+
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+ cleanup_all_probes();
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+}
+
+static const struct file_operations uprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+
+ seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations uprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* uprobe handler */
+static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+ struct uprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ u8 *data;
+ int size, i, pc;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tu->call;
+
+ tu->nhit++;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ size = sizeof(*entry) + tu->size;
+
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ data = (u8 *)&entry[1];
+ for (i = 0; i < tu->nr_args; i++)
+ call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
+
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+
+/* Event entry printers */
+static enum print_line_t
+print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
+{
+ struct uprobe_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_uprobe *tu;
+ u8 *data;
+ int i;
+
+ field = (struct uprobe_trace_entry_head *)iter->ent;
+ tu = container_of(event, struct trace_uprobe, call.event);
+
+ if (!trace_seq_printf(s, "%s: (", tu->call.name))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ")"))
+ goto partial;
+
+ data = (u8 *)&field[1];
+ for (i = 0; i < tu->nr_args; i++) {
+ if (!tu->args[i].type->print(s, tu->args[i].name,
+ data + tu->args[i].offset, field))
+ goto partial;
+ }
+
+ if (trace_seq_puts(s, "\n"))
+ return TRACE_TYPE_HANDLED;
+
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int probe_event_enable(struct trace_uprobe *tu, int flag)
+{
+ struct uprobe_trace_consumer *utc;
+ int ret = 0;
+
+ if (!tu->inode || tu->consumer)
+ return -EINTR;
+
+ utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
+ if (!utc)
+ return -EINTR;
+
+ utc->cons.handler = uprobe_dispatcher;
+ utc->cons.filter = NULL;
+ ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
+ if (ret) {
+ kfree(utc);
+ return ret;
+ }
+
+ tu->flags |= flag;
+ utc->tu = tu;
+ tu->consumer = utc;
+
+ return 0;
+}
+
+static void probe_event_disable(struct trace_uprobe *tu, int flag)
+{
+ if (!tu->inode || !tu->consumer)
+ return;
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+ tu->flags &= ~flag;
+ kfree(tu->consumer);
+ tu->consumer = NULL;
+}
+
+static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct uprobe_trace_entry_head field;
+ struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+ /* Set argument names as fields */
+ for (i = 0; i < tu->nr_args; i++) {
+ ret = trace_define_field(event_call, tu->args[i].type->fmttype,
+ tu->args[i].name,
+ sizeof(field) + tu->args[i].offset,
+ tu->args[i].type->size,
+ tu->args[i].type->is_signed,
+ FILTER_OTHER);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
+{
+ const char *fmt, *arg;
+ int i;
+ int pos = 0;
+
+ fmt = "(%lx)";
+ arg = "REC->" FIELD_STRING_IP;
+
+ /* When len=0, we just calculate the needed length */
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+
+ for (i = 0; i < tu->nr_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+ tu->args[i].name, tu->args[i].type->fmt);
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+
+ for (i = 0; i < tu->nr_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+ tu->args[i].name);
+ }
+
+ return pos; /* return the length of print_fmt */
+}
+#undef LEN_OR_ZERO
+
+static int set_print_fmt(struct trace_uprobe *tu)
+{
+ char *print_fmt;
+ int len;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_print_fmt(tu, NULL, 0);
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_print_fmt(tu, print_fmt, len + 1);
+ tu->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+/* uprobe profile handler */
+static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+ struct ftrace_event_call *call = &tu->call;
+ struct uprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ u8 *data;
+ int size, __size, i;
+ int rctx;
+
+ __size = sizeof(*entry) + tu->size;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
+ return;
+
+ preempt_disable();
+
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ if (!entry)
+ goto out;
+
+ entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+ data = (u8 *)&entry[1];
+ for (i = 0; i < tu->nr_args; i++)
+ call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
+
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+
+ out:
+ preempt_enable();
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static
+int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
+{
+ struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(tu, TP_FLAG_TRACE);
+
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(tu, TP_FLAG_TRACE);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_event_enable(tu, TP_FLAG_PROFILE);
+
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_event_disable(tu, TP_FLAG_PROFILE);
+ return 0;
+#endif
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct uprobe_trace_consumer *utc;
+ struct trace_uprobe *tu;
+
+ utc = container_of(con, struct uprobe_trace_consumer, cons);
+ tu = utc->tu;
+ if (!tu || tu->consumer != utc)
+ return 0;
+
+ if (tu->flags & TP_FLAG_TRACE)
+ uprobe_trace_func(tu, regs);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->flags & TP_FLAG_PROFILE)
+ uprobe_perf_func(tu, regs);
+#endif
+ return 0;
+}
+
+static struct trace_event_functions uprobe_funcs = {
+ .trace = print_uprobe_event
+};
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+ struct ftrace_event_call *call = &tu->call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &uprobe_funcs;
+ call->class->define_fields = uprobe_event_define_fields;
+
+ if (set_print_fmt(tu) < 0)
+ return -ENOMEM;
+
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
+ kfree(call->print_fmt);
+ return -ENODEV;
+ }
+ call->flags = 0;
+ call->class->reg = trace_uprobe_register;
+ call->data = tu;
+ ret = trace_add_event_call(call);
+
+ if (ret) {
+ pr_info("Failed to register uprobe event: %s\n", call->name);
+ kfree(call->print_fmt);
+ unregister_ftrace_event(&call->event);
+ }
+
+ return ret;
+}
+
+static void unregister_uprobe_event(struct trace_uprobe *tu)
+{
+ /* tu->event is unregistered in trace_remove_event_call() */
+ trace_remove_event_call(&tu->call);
+ kfree(tu->call.print_fmt);
+ tu->call.print_fmt = NULL;
+}
+
+/* Make a trace interface for controling probe points */
+static __init int init_uprobe_trace(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("uprobe_events", 0644, d_tracer,
+ NULL, &uprobe_events_ops);
+ /* Profile interface */
+ trace_create_file("uprobe_profile", 0444, d_tracer,
+ NULL, &uprobe_profile_ops);
+ return 0;
+}
+
+fs_initcall(init_uprobe_trace);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a4721..000000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Workqueue statistical tracer.
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-
-#include <trace/events/workqueue.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/kref.h>
-#include "trace_stat.h"
-#include "trace.h"
-
-
-/* A cpu workqueue thread */
-struct cpu_workqueue_stats {
- struct list_head list;
- struct kref kref;
- int cpu;
- pid_t pid;
-/* Can be inserted from interrupt or user context, need to be atomic */
- atomic_t inserted;
-/*
- * Don't need to be atomic, works are serialized in a single workqueue thread
- * on a single CPU.
- */
- unsigned int executed;
-};
-
-/* List of workqueue threads on one cpu */
-struct workqueue_global_stats {
- struct list_head list;
- spinlock_t lock;
-};
-
-/* Don't need a global lock because allocated before the workqueues, and
- * never freed.
- */
-static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
-#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
-
-static void cpu_workqueue_stat_free(struct kref *kref)
-{
- kfree(container_of(kref, struct cpu_workqueue_stats, kref));
-}
-
-/* Insertion of a work */
-static void
-probe_workqueue_insertion(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- atomic_inc(&node->inserted);
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Execution of a work */
-static void
-probe_workqueue_execution(void *ignore,
- struct task_struct *wq_thread,
- struct work_struct *work)
-{
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
- if (node->pid == wq_thread->pid) {
- node->executed++;
- goto found;
- }
- }
- pr_debug("trace_workqueue: entry not found\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(void *ignore,
- struct task_struct *wq_thread, int cpu)
-{
- struct cpu_workqueue_stats *cws;
- unsigned long flags;
-
- WARN_ON(cpu < 0);
-
- /* Workqueues are sometimes created in atomic context */
- cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
- if (!cws) {
- pr_warning("trace_workqueue: not enough memory\n");
- return;
- }
- INIT_LIST_HEAD(&cws->list);
- kref_init(&cws->kref);
- cws->cpu = cpu;
- cws->pid = wq_thread->pid;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Destruction of a cpu workqueue thread */
-static void
-probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
-{
- /* Workqueue only execute on one cpu */
- int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node, *next;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
- list) {
- if (node->pid == wq_thread->pid) {
- list_del(&node->list);
- kref_put(&node->kref, cpu_workqueue_stat_free);
- goto found;
- }
- }
-
- pr_debug("trace_workqueue: don't find workqueue to destroy\n");
-found:
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-}
-
-static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
-{
- unsigned long flags;
- struct cpu_workqueue_stats *ret = NULL;
-
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-
- if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
- ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
-
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static void *workqueue_stat_start(struct tracer_stat *trace)
-{
- int cpu;
- void *ret = NULL;
-
- for_each_possible_cpu(cpu) {
- ret = workqueue_stat_start_cpu(cpu);
- if (ret)
- return ret;
- }
- return NULL;
-}
-
-static void *workqueue_stat_next(void *prev, int idx)
-{
- struct cpu_workqueue_stats *prev_cws = prev;
- struct cpu_workqueue_stats *ret;
- int cpu = prev_cws->cpu;
- unsigned long flags;
-
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
- do {
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- return NULL;
- } while (!(ret = workqueue_stat_start_cpu(cpu)));
- return ret;
- } else {
- ret = list_entry(prev_cws->list.next,
- struct cpu_workqueue_stats, list);
- kref_get(&ret->kref);
- }
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
- return ret;
-}
-
-static int workqueue_stat_show(struct seq_file *s, void *p)
-{
- struct cpu_workqueue_stats *cws = p;
- struct pid *pid;
- struct task_struct *tsk;
-
- pid = find_get_pid(cws->pid);
- if (pid) {
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (tsk) {
- seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
- atomic_read(&cws->inserted), cws->executed,
- tsk->comm);
- put_task_struct(tsk);
- }
- put_pid(pid);
- }
-
- return 0;
-}
-
-static void workqueue_stat_release(void *stat)
-{
- struct cpu_workqueue_stats *node = stat;
-
- kref_put(&node->kref, cpu_workqueue_stat_free);
-}
-
-static int workqueue_stat_headers(struct seq_file *s)
-{
- seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
- seq_printf(s, "# | | | |\n");
- return 0;
-}
-
-struct tracer_stat workqueue_stats __read_mostly = {
- .name = "workqueues",
- .stat_start = workqueue_stat_start,
- .stat_next = workqueue_stat_next,
- .stat_show = workqueue_stat_show,
- .stat_release = workqueue_stat_release,
- .stat_headers = workqueue_stat_headers
-};
-
-
-int __init stat_workqueue_init(void)
-{
- if (register_stat_tracer(&workqueue_stats)) {
- pr_warning("Unable to register workqueue stat tracer\n");
- return 1;
- }
-
- return 0;
-}
-fs_initcall(stat_workqueue_init);
-
-/*
- * Workqueues are created very early, just after pre-smp initcalls.
- * So we must register our tracepoints at this stage.
- */
-int __init trace_workqueue_early_init(void)
-{
- int ret, cpu;
-
- for_each_possible_cpu(cpu) {
- spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
- INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
- }
-
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
- if (ret)
- goto out;
-
- ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
- if (ret)
- goto no_insertion;
-
- ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
- if (ret)
- goto no_execution;
-
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
- if (ret)
- goto no_creation;
-
- return 0;
-
-no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
-no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
-no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
-out:
- pr_warning("trace_workqueue: unable to trace workqueues\n");
-
- return 1;
-}
-early_initcall(trace_workqueue_early_init);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 51c6e89e8619..d7948eb10225 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
return ret;
}
-SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
{
const struct cred *cred = current_cred();
int retval;
+ old_uid_t ruid, euid, suid;
- if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
- !(retval = put_user(high2lowuid(cred->euid), euid)))
- retval = put_user(high2lowuid(cred->suid), suid);
+ ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
+ euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
+ suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
+
+ if (!(retval = put_user(ruid, ruidp)) &&
+ !(retval = put_user(euid, euidp)))
+ retval = put_user(suid, suidp);
return retval;
}
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
}
-SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
int retval;
+ old_gid_t rgid, egid, sgid;
+
+ rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
+ egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
+ sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
- if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
- !(retval = put_user(high2lowgid(cred->egid), egid)))
- retval = put_user(high2lowgid(cred->sgid), sgid);
+ if (!(retval = put_user(rgid, rgidp)) &&
+ !(retval = put_user(egid, egidp)))
+ retval = put_user(sgid, sgidp);
return retval;
}
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
static int groups16_to_user(old_gid_t __user *grouplist,
struct group_info *group_info)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
old_gid_t group;
+ kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
- group = high2lowgid(GROUP_AT(group_info, i));
+ kgid = GROUP_AT(group_info, i);
+ group = high2lowgid(from_kgid_munged(user_ns, kgid));
if (put_user(group, grouplist+i))
return -EFAULT;
}
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist,
static int groups16_from_user(struct group_info *group_info,
old_gid_t __user *grouplist)
{
+ struct user_namespace *user_ns = current_user_ns();
int i;
old_gid_t group;
+ kgid_t kgid;
for (i = 0; i < group_info->ngroups; i++) {
if (get_user(group, grouplist+i))
return -EFAULT;
- GROUP_AT(group_info, i) = low2highgid(group);
+
+ kgid = make_kgid(user_ns, low2highgid(group));
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
}
return 0;
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
SYSCALL_DEFINE0(getuid16)
{
- return high2lowuid(current_uid());
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
}
SYSCALL_DEFINE0(geteuid16)
{
- return high2lowuid(current_euid());
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
}
SYSCALL_DEFINE0(getgid16)
{
- return high2lowgid(current_gid());
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
}
SYSCALL_DEFINE0(getegid16)
{
- return high2lowgid(current_egid());
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
}
diff --git a/kernel/user.c b/kernel/user.c
index 71dd2363ab0f..b815fefbe76f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -22,10 +22,27 @@
* and 1 for... ?
*/
struct user_namespace init_user_ns = {
+ .uid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .gid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
.kref = {
.refcount = ATOMIC_INIT(3),
},
- .creator = &root_user,
+ .owner = GLOBAL_ROOT_UID,
+ .group = GLOBAL_ROOT_GID,
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns);
* when changing user ID's (ie setuid() and friends).
*/
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
static struct kmem_cache *uid_cachep;
+struct hlist_head uidhash_table[UIDHASH_SZ];
/*
* The uidhash_lock is mostly taken from process context, but it is
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep;
*/
static DEFINE_SPINLOCK(uidhash_lock);
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
+/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
- .__count = ATOMIC_INIT(2),
+ .__count = ATOMIC_INIT(1),
.processes = ATOMIC_INIT(1),
.files = ATOMIC_INIT(0),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
- .user_ns = &init_user_ns,
+ .uid = GLOBAL_ROOT_UID,
};
/*
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
static void uid_hash_remove(struct user_struct *up)
{
hlist_del_init(&up->uidhash_node);
- put_user_ns(up->user_ns);
}
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
struct user_struct *user;
struct hlist_node *h;
hlist_for_each_entry(user, h, hashent, uidhash_node) {
- if (user->uid == uid) {
+ if (uid_eq(user->uid, uid)) {
atomic_inc(&user->__count);
return user;
}
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags)
*
* If the user_struct could not be found, return NULL.
*/
-struct user_struct *find_user(uid_t uid)
+struct user_struct *find_user(kuid_t uid)
{
struct user_struct *ret;
unsigned long flags;
- struct user_namespace *ns = current_user_ns();
spin_lock_irqsave(&uidhash_lock, flags);
- ret = uid_hash_find(uid, uidhashentry(ns, uid));
+ ret = uid_hash_find(uid, uidhashentry(uid));
spin_unlock_irqrestore(&uidhash_lock, flags);
return ret;
}
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up)
local_irq_restore(flags);
}
-struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
+struct user_struct *alloc_uid(kuid_t uid)
{
- struct hlist_head *hashent = uidhashentry(ns, uid);
+ struct hlist_head *hashent = uidhashentry(uid);
struct user_struct *up, *new;
spin_lock_irq(&uidhash_lock);
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
new->uid = uid;
atomic_set(&new->__count, 1);
- new->user_ns = get_user_ns(ns);
-
/*
* Before adding this, check whether we raced
* on adding the same user already..
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void)
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
for(n = 0; n < UIDHASH_SZ; ++n)
- INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
+ INIT_HLIST_HEAD(uidhash_table + n);
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
- uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
+ uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3b906e98b1db..86602316422d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,9 +11,20 @@
#include <linux/user_namespace.h>
#include <linux/highuid.h>
#include <linux/cred.h>
+#include <linux/securebits.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *map);
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
*/
int create_user_ns(struct cred *new)
{
- struct user_namespace *ns;
- struct user_struct *root_user;
- int n;
+ struct user_namespace *ns, *parent_ns = new->user_ns;
+ kuid_t owner = new->euid;
+ kgid_t group = new->egid;
+
+ /* The creator needs a mapping in the parent user namespace
+ * or else we won't be able to reasonably tell userspace who
+ * created a user_namespace.
+ */
+ if (!kuid_has_mapping(parent_ns, owner) ||
+ !kgid_has_mapping(parent_ns, group))
+ return -EPERM;
- ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
+ ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
return -ENOMEM;
kref_init(&ns->kref);
+ ns->parent = parent_ns;
+ ns->owner = owner;
+ ns->group = group;
- for (n = 0; n < UIDHASH_SZ; ++n)
- INIT_HLIST_HEAD(ns->uidhash_table + n);
-
- /* Alloc new root user. */
- root_user = alloc_uid(ns, 0);
- if (!root_user) {
- kmem_cache_free(user_ns_cachep, ns);
- return -ENOMEM;
- }
-
- /* set the new root user in the credentials under preparation */
- ns->creator = new->user;
- new->user = root_user;
- new->uid = new->euid = new->suid = new->fsuid = 0;
- new->gid = new->egid = new->sgid = new->fsgid = 0;
- put_group_info(new->group_info);
- new->group_info = get_group_info(&init_groups);
+ /* Start with the same capabilities as init but useless for doing
+ * anything as the capabilities are bound to the new user namespace.
+ */
+ new->securebits = SECUREBITS_DEFAULT;
+ new->cap_inheritable = CAP_EMPTY_SET;
+ new->cap_permitted = CAP_FULL_SET;
+ new->cap_effective = CAP_FULL_SET;
+ new->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(new->request_key_auth);
new->request_key_auth = NULL;
#endif
/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
- /* root_user holds a reference to ns, our reference can be dropped */
- put_user_ns(ns);
+ /* Leave the new->user_ns reference with the new user namespace. */
+ /* Leave the reference to our user_ns with the new cred. */
+ new->user_ns = ns;
return 0;
}
-/*
- * Deferred destructor for a user namespace. This is required because
- * free_user_ns() may be called with uidhash_lock held, but we need to call
- * back to free_uid() which will want to take the lock again.
- */
-static void free_user_ns_work(struct work_struct *work)
+void free_user_ns(struct kref *kref)
{
- struct user_namespace *ns =
- container_of(work, struct user_namespace, destroyer);
- free_uid(ns->creator);
+ struct user_namespace *parent, *ns =
+ container_of(kref, struct user_namespace, kref);
+
+ parent = ns->parent;
kmem_cache_free(user_ns_cachep, ns);
+ put_user_ns(parent);
}
+EXPORT_SYMBOL(free_user_ns);
-void free_user_ns(struct kref *kref)
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
- struct user_namespace *ns =
- container_of(kref, struct user_namespace, kref);
+ unsigned idx, extents;
+ u32 first, last, id2;
+
+ id2 = id + count - 1;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_read_barrier_depends();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last &&
+ (id2 >= first && id2 <= last))
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
- INIT_WORK(&ns->destroyer, free_user_ns_work);
- schedule_work(&ns->destroyer);
+ return id;
}
-EXPORT_SYMBOL(free_user_ns);
-uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
- struct user_namespace *tmp;
+ unsigned idx, extents;
+ u32 first, last;
- if (likely(to == cred->user->user_ns))
- return uid;
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_read_barrier_depends();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
+ return id;
+}
- /* Is cred->user the creator of the target user_ns
- * or the creator of one of it's parents?
- */
- for ( tmp = to; tmp != &init_user_ns;
- tmp = tmp->creator->user_ns ) {
- if (cred->user == tmp->creator) {
- return (uid_t)0;
- }
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ unsigned idx, extents;
+ u32 first, last;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_read_barrier_depends();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].lower_first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
}
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+/**
+ * make_kuid - Map a user-namespace uid pair into a kuid.
+ * @ns: User namespace that the uid is in
+ * @uid: User identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace uid
+ * pair INVALID_UID is returned. Callers are expected to test
+ * for and handle handle INVALID_UID being returned. INVALID_UID
+ * may be tested for using uid_valid().
+ */
+kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
+{
+ /* Map the uid to a global kernel uid */
+ return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
+}
+EXPORT_SYMBOL(make_kuid);
+
+/**
+ * from_kuid - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kuid has no mapping in @targ (uid_t)-1 is returned.
+ */
+uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->uid_map, __kuid_val(kuid));
+}
+EXPORT_SYMBOL(from_kuid);
- /* No useful relationship so no mapping */
- return overflowuid;
+/**
+ * from_kuid_munged - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kuid from_kuid_munged never fails and always
+ * returns a valid uid. This makes from_kuid_munged appropriate
+ * for use in syscalls like stat and getuid where failing the
+ * system call and failing to provide a valid uid are not an
+ * options.
+ *
+ * If @kuid has no mapping in @targ overflowuid is returned.
+ */
+uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
+{
+ uid_t uid;
+ uid = from_kuid(targ, kuid);
+
+ if (uid == (uid_t) -1)
+ uid = overflowuid;
+ return uid;
}
+EXPORT_SYMBOL(from_kuid_munged);
-gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+/**
+ * make_kgid - Map a user-namespace gid pair into a kgid.
+ * @ns: User namespace that the gid is in
+ * @uid: group identifier
+ *
+ * Maps a user-namespace gid pair into a kernel internal kgid,
+ * and returns that kgid.
+ *
+ * When there is no mapping defined for the user-namespace gid
+ * pair INVALID_GID is returned. Callers are expected to test
+ * for and handle INVALID_GID being returned. INVALID_GID may be
+ * tested for using gid_valid().
+ */
+kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
- struct user_namespace *tmp;
+ /* Map the gid to a global kernel gid */
+ return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
+}
+EXPORT_SYMBOL(make_kgid);
- if (likely(to == cred->user->user_ns))
- return gid;
+/**
+ * from_kgid - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kgid has no mapping in @targ (gid_t)-1 is returned.
+ */
+gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
+{
+ /* Map the gid from a global kernel gid */
+ return map_id_up(&targ->gid_map, __kgid_val(kgid));
+}
+EXPORT_SYMBOL(from_kgid);
+
+/**
+ * from_kgid_munged - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kgid from_kgid_munged never fails and always
+ * returns a valid gid. This makes from_kgid_munged appropriate
+ * for use in syscalls like stat and getgid where failing the
+ * system call and failing to provide a valid gid are not options.
+ *
+ * If @kgid has no mapping in @targ overflowgid is returned.
+ */
+gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
+{
+ gid_t gid;
+ gid = from_kgid(targ, kgid);
- /* Is cred->user the creator of the target user_ns
- * or the creator of one of it's parents?
+ if (gid == (gid_t) -1)
+ gid = overflowgid;
+ return gid;
+}
+EXPORT_SYMBOL(from_kgid_munged);
+
+static int uid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ uid_t lower;
+
+ lower_ns = current_user_ns();
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static int gid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ gid_t lower;
+
+ lower_ns = current_user_ns();
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
+{
+ struct uid_gid_extent *extent = NULL;
+ loff_t pos = *ppos;
+
+ if (pos < map->nr_extents)
+ extent = &map->extent[pos];
+
+ return extent;
+}
+
+static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->uid_map);
+}
+
+static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->gid_map);
+}
+
+static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return seq->op->start(seq, pos);
+}
+
+static void m_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+struct seq_operations proc_uid_seq_operations = {
+ .start = uid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = uid_m_show,
+};
+
+struct seq_operations proc_gid_seq_operations = {
+ .start = gid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = gid_m_show,
+};
+
+static DEFINE_MUTEX(id_map_mutex);
+
+static ssize_t map_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos,
+ int cap_setid,
+ struct uid_gid_map *map,
+ struct uid_gid_map *parent_map)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_map new_map;
+ unsigned idx;
+ struct uid_gid_extent *extent, *last = NULL;
+ unsigned long page = 0;
+ char *kbuf, *pos, *next_line;
+ ssize_t ret = -EINVAL;
+
+ /*
+ * The id_map_mutex serializes all writes to any given map.
+ *
+ * Any map is only ever written once.
+ *
+ * An id map fits within 1 cache line on most architectures.
+ *
+ * On read nothing needs to be done unless you are on an
+ * architecture with a crazy cache coherency model like alpha.
+ *
+ * There is a one time data dependency between reading the
+ * count of the extents and the values of the extents. The
+ * desired behavior is to see the values of the extents that
+ * were written before the count of the extents.
+ *
+ * To achieve this smp_wmb() is used on guarantee the write
+ * order and smp_read_barrier_depends() is guaranteed that we
+ * don't have crazy architectures returning stale data.
+ *
*/
- for ( tmp = to; tmp != &init_user_ns;
- tmp = tmp->creator->user_ns ) {
- if (cred->user == tmp->creator) {
- return (gid_t)0;
+ mutex_lock(&id_map_mutex);
+
+ ret = -EPERM;
+ /* Only allow one successful write to the map */
+ if (map->nr_extents != 0)
+ goto out;
+
+ /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
+ * over the user namespace in order to set the id mapping.
+ */
+ if (!ns_capable(ns, cap_setid))
+ goto out;
+
+ /* Get a buffer */
+ ret = -ENOMEM;
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!page)
+ goto out;
+
+ /* Only allow <= page size writes at the beginning of the file */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ goto out;
+
+ /* Slurp in the user data */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ pos = kbuf;
+ new_map.nr_extents = 0;
+ for (;pos; pos = next_line) {
+ extent = &new_map.extent[new_map.nr_extents];
+
+ /* Find the end of line and ensure I don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
}
+
+ pos = skip_spaces(pos);
+ extent->first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->lower_first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->count = simple_strtoul(pos, &pos, 10);
+ if (*pos && !isspace(*pos))
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ /* Verify we have been given valid starting values */
+ if ((extent->first == (u32) -1) ||
+ (extent->lower_first == (u32) -1 ))
+ goto out;
+
+ /* Verify count is not zero and does not cause the extent to wrap */
+ if ((extent->first + extent->count) <= extent->first)
+ goto out;
+ if ((extent->lower_first + extent->count) <= extent->lower_first)
+ goto out;
+
+ /* For now only accept extents that are strictly in order */
+ if (last &&
+ (((last->first + last->count) > extent->first) ||
+ ((last->lower_first + last->count) > extent->lower_first)))
+ goto out;
+
+ new_map.nr_extents++;
+ last = extent;
+
+ /* Fail if the file contains too many extents */
+ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+ (next_line != NULL))
+ goto out;
+ }
+ /* Be very certaint the new map actually exists */
+ if (new_map.nr_extents == 0)
+ goto out;
+
+ ret = -EPERM;
+ /* Validate the user is allowed to use user id's mapped to. */
+ if (!new_idmap_permitted(ns, cap_setid, &new_map))
+ goto out;
+
+ /* Map the lower ids from the parent user namespace to the
+ * kernel global id space.
+ */
+ for (idx = 0; idx < new_map.nr_extents; idx++) {
+ u32 lower_first;
+ extent = &new_map.extent[idx];
+
+ lower_first = map_id_range_down(parent_map,
+ extent->lower_first,
+ extent->count);
+
+ /* Fail if we can not map the specified extent to
+ * the kernel global id space.
+ */
+ if (lower_first == (u32) -1)
+ goto out;
+
+ extent->lower_first = lower_first;
}
- /* No useful relationship so no mapping */
- return overflowgid;
+ /* Install the map */
+ memcpy(map->extent, new_map.extent,
+ new_map.nr_extents*sizeof(new_map.extent[0]));
+ smp_wmb();
+ map->nr_extents = new_map.nr_extents;
+
+ *ppos = count;
+ ret = count;
+out:
+ mutex_unlock(&id_map_mutex);
+ if (page)
+ free_page(page);
+ return ret;
+}
+
+ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+
+ if (!ns->parent)
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETUID,
+ &ns->uid_map, &ns->parent->uid_map);
+}
+
+ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+
+ if (!ns->parent)
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETGID,
+ &ns->gid_map, &ns->parent->gid_map);
+}
+
+static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *new_map)
+{
+ /* Allow the specified ids if we have the appropriate capability
+ * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ */
+ if (ns_capable(ns->parent, cap_setid))
+ return true;
+
+ return false;
}
static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 405caf91aad5..679d97a5d3fd 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
+ ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
up_read(&uts_sem);
return ns;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c08..9a3128dc67df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
cwq = get_cwq(gcwq->cpu, wq);
trace_workqueue_queue_work(cpu, cwq, work);
- BUG_ON(!list_empty(&work->entry));
+ if (WARN_ON(!list_empty(&work->entry))) {
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+ return;
+ }
cwq->nr_in_flight[cwq->work_color]++;
work_flags = work_color_to_flags(cwq->work_color);
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
} else
wake_up_all(&gcwq->trustee_wait);
- /* sanity check nr_running */
- WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+ /*
+ * Sanity check nr_running. Because trustee releases gcwq->lock
+ * between setting %WORKER_ROGUE and zapping nr_running, the
+ * warning may trigger spuriously. Check iff trustee is idle.
+ */
+ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+ gcwq->nr_workers == gcwq->nr_idle &&
atomic_read(get_gcwq_nr_running(gcwq->cpu)));
}
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
* lock freed" warnings as well as problems when looking into
* work->lockdep_map, make a copy and use that here.
*/
- struct lockdep_map lockdep_map = work->lockdep_map;
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/*
* A single work shouldn't be executed concurrently by
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
{
struct wq_barrier barr;
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
if (start_flush_work(work, &barr, true)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);