Diffstat (limited to 'kernel'): 79 files changed, 9247 insertions, 4199 deletions
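The first hunks below (kernel/async.c) rename the callback type from async_func_ptr to async_func_t throughout the scheduling API. As a rough sketch of the renamed interface -- the module and callback names here are hypothetical and not part of this patch; only the async_schedule()/async_synchronize_full() calls and the (data, cookie) callback shape are taken from the hunks and <linux/async.h>:

#include <linux/async.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

/* Matches async_func_t: called with the data pointer and the assigned cookie. */
static void demo_probe(void *data, async_cookie_t cookie)
{
	pr_info("async probe of %s, cookie %llu\n",
		(const char *)data, (unsigned long long)cookie);
}

static int __init demo_init(void)
{
	/* async_schedule() now takes an async_func_t instead of async_func_ptr. */
	async_schedule(demo_probe, "demo-device");

	/* Wait for every outstanding async callback before returning. */
	async_synchronize_full();
	return 0;
}
module_init(demo_init);

MODULE_LICENSE("GPL");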
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f437..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@ config_data.h config_data.gz timeconst.h +hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index bbde5f1a4486..d1574d47cf27 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ endif obj-y += sched/ obj-y += power/ +obj-y += cpu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/async.c b/kernel/async.c index 8ddee2c3e5b0..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -73,7 +73,7 @@ struct async_entry { struct list_head global_list; struct work_struct work; async_cookie_t cookie; - async_func_ptr *func; + async_func_t func; void *data; struct async_domain *domain; }; @@ -84,24 +84,20 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct async_entry *first = NULL; + struct list_head *pending; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) { - if (!list_empty(&domain->pending)) - first = list_first_entry(&domain->pending, - struct async_entry, domain_list); - } else { - if (!list_empty(&async_global_pending)) - first = list_first_entry(&async_global_pending, - struct async_entry, global_list); - } + if (domain) + pending = &domain->pending; + else + pending = &async_global_pending; - if (first) - ret = first->cookie; + if (!list_empty(pending)) + ret = list_first_entry(pending, struct async_entry, + domain_list)->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; @@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ - ptr(data, newcookie); + func(data, newcookie); return newcookie; } INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; + entry->func = func; entry->data = data; entry->domain = domain; @@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a /** * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. 
*/ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data) { - return __async_schedule(ptr, data, &async_dfl_domain); + return __async_schedule(func, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * @@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule); * synchronization domain is specified via @domain. Note: This function * may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { - return __async_schedule(ptr, data, domain); + return __async_schedule(func, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f15..9816a1b96cfc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1b..11468d99dad0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,10 +59,7 @@ struct audit_entry { struct audit_krule rule; }; -#ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; -#endif extern int audit_pid; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d6..a291aa23fb3f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06f..267436826c3b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a9..c68229411a7c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..d3abce2d6455 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> @@ -59,7 +58,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> #include <linux/atomic.h> @@ -83,7 +82,13 @@ * B happens only through cgroup_show_options() and using cgroup_root_mutex * breaks it. */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +#else static DEFINE_MUTEX(cgroup_mutex); +#endif + static DEFINE_MUTEX(cgroup_root_mutex); /* @@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -162,6 +117,9 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + + /* file xattrs */ + struct simple_xattrs xattrs; }; /* @@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. 
This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - static int css_unbias_refcnt(int refcnt) { return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; @@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) return test_bit(CGRP_REMOVED, &cgrp->flags); } -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ - ROOT_XATTR, /* supports extended attributes */ -}; +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ + while (cgrp) { + if (cgrp == ancestor) + return true; + cgrp = cgrp->parent; + } + return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return __d_cfe(dentry)->type; } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the mutex should be later unlocked. On + * failure returns false with no lock held. + */ +static bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. 
- */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); - /* * A couple of forward declarations required, due to cyclic reference loop: * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +{ + struct cgroup_name *name; + + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, free_work); @@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); /* + * We get a ref to the parent's dentry, and put the ref when + * this cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + dput(cgrp->parent->dentry); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + + /* * Drop the active superblock reference that we took when we - * created the cgroup + * created the cgroup. This will free cgrp->root, if we are + * holding the last reference to @sb. */ deactivate_super(cgrp->root->sb); @@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work) simple_xattrs_free(&cgrp->xattrs); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; - struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); + simple_xattrs_free(&cfe->xattrs); kfree(cfe); - simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) + seq_puts(seq, ",sane_behavior"); + if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); - if (test_bit(ROOT_XATTR, &root->flags)) + if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } + if (!strcmp(token, "__DEVEL__sane_behavior")) { + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; + continue; + } if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); + opts->flags |= CGRP_ROOT_NOPREFIX; continue; } if (!strcmp(token, "clone_children")) { @@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "xattr")) { - set_bit(ROOT_XATTR, &opts->flags); + opts->flags |= CGRP_ROOT_XATTR; continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 
+ + if (opts->flags & CGRP_ROOT_NOPREFIX) { + pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + return -EINVAL; + } + + if (opts->cpuset_clone_children) { + pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + return -EINVAL; + } + } + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_mask & mask)) + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; @@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: remount is not allowed\n"); + return -EINVAL; + } + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->top_cgroup = cgrp; + cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); } @@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && + root->flags != opts.flags) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } + /* no subsys rebinding, so refcounts don't change */ drop_parsed_module_refcounts(opts.subsys_mask); } @@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. 
*/ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); + if (!cgrp->parent) { + if (strlcpy(buf, "/", buflen) >= buflen) + return -ENAMETOOLONG; return 0; } start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + do { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; - if (!cgrp->parent) - continue; if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; - } + + cgrp = cgrp->parent; + } while (cgrp->parent); + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; @@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } /** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. 
- */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * group - group_rwsem prevents new threads from appearing, and if * threads exit, this will just be an over-estimate. */ - group_size = get_nr_threads(leader); + if (threadgroup) + group_size = get_nr_threads(tsk); + else + group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); if (retval) goto out_free_group_list; - tsk = leader; i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + + if (!threadgroup) + break; } while_each_thread(leader, tsk); rcu_read_unlock(); /* remember the number of threads in the array for later. */ @@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. 
*/ @@ -2224,11 +2137,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; @@ -2251,17 +2164,42 @@ retry_find_task: put_task_struct(tsk); goto retry_find_task; } - ret = cgroup_attach_proc(cgrp, tsk); - } else - ret = cgroup_attach_task(cgrp, tsk); + } + + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return ret; } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk, false); + if (retval) + break; + } + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { return attach_task_by_pid(cgrp, pid, false); @@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) return attach_task_by_pid(cgrp, tgid, true); } -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. - */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); - static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { @@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } @@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } @@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int ret; + struct cgroup_name *name, *old_name; + struct cgroup *cgrp; + + /* + * It's convinient to use parent dir's i_mutex to protected + * cgrp->name. 
+ */ + lockdep_assert_held(&old_dir->i_mutex); + if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + + cgrp = __d_cgrp(old_dentry); + + name = cgroup_alloc_name(new_dentry); + if (!name) + return -ENOMEM; + + ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + kfree(name); + return ret; + } + + old_name = cgrp->name; + rcu_assign_pointer(cgrp->name, name); + + kfree_rcu(old_name, rcu_head); + return 0; } static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) if (S_ISDIR(dentry->d_inode->i_mode)) return &__d_cgrp(dentry)->xattrs; else - return &__d_cft(dentry)->xattrs; + return &__d_cfe(dentry)->xattrs; } static inline int xattr_enabled(struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return test_bit(ROOT_XATTR, &root->flags); + return root->flags & CGRP_ROOT_XATTR; } static bool is_valid_xattr(const char *name) @@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - simple_xattrs_init(&cft->xattrs); - - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, subsys->name); strcat(name, "."); } @@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, cfe->type = (void *)cft; cfe->dentry = dentry; dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } @@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) @@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } +static void cgroup_transfer_one_task(struct task_struct *task, + struct cgroup_scanner *scan) +{ + struct cgroup *new_cgroup = scan->data; + + mutex_lock(&cgroup_mutex); + cgroup_attach_task(new_cgroup, task, false); + mutex_unlock(&cgroup_mutex); +} + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + struct cgroup_scanner scan; + + scan.cg = from; + scan.test_task = NULL; /* select all tasks in cgroup */ + scan.process_task = cgroup_transfer_one_task; + scan.heap = NULL; + scan.data = to; + + return cgroup_scan_tasks(&scan); +} + /* * Stuff for reading the 'tasks'/'procs' files. 
* @@ -3362,35 +3345,14 @@ static void pidlist_free(void *p) else kfree(p); } -static void *pidlist_resize(void *p, int newcount) -{ - void *newlist; - /* note: if new alloc fails, old p will still be valid either way */ - if (is_vmalloc_addr(p)) { - newlist = vmalloc(newcount * sizeof(pid_t)); - if (!newlist) - return NULL; - memcpy(newlist, p, newcount * sizeof(pid_t)); - vfree(p); - } else { - newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); - } - return newlist; -} /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. + * Returns the number of unique elements. */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) +static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; - pid_t *list = *p; - pid_t *newlist; /* * we presume the 0th element is unique, so i starts at 1. trivial @@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length) dest++; } after: - /* - * if the length difference is large enough, we want to allocate a - * smaller buffer to save memory. if this fails due to out of memory, - * we'll just stay with what we've got. - */ - if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { - newlist = pidlist_resize(list, dest); - if (newlist) - *p = newlist; - } return dest; } @@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(&array, length); + length = pidlist_uniq(array, length); l = cgroup_pidlist_find(cgrp, type); if (!l) { pidlist_free(array); @@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, if (ret) goto fail; - if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { - event->cft->unregister_event(cgrp, event->cft, event->eventfd); - ret = 0; - goto fail; - } + efile->f_op->poll(efile, &event->pt); /* * Events should be removed after rmdir of cgroup directory, but before @@ -4016,10 +3964,16 @@ static struct cftype files[] = { }, { .name = "cgroup.clone_children", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cgroup_sane_behavior_show, + }, + { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, @@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!(css->flags & CSS_ONLINE)) return; - /* - * css_offline() should be called with cgroup_mutex unlocked. See - * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for - * details. This temporary unlocking should go away once - * cgroup_mutex is unexported from controllers. 
- */ - if (ss->css_offline) { - mutex_unlock(&cgroup_mutex); + if (ss->css_offline) ss->css_offline(cgrp); - mutex_lock(&cgroup_mutex); - } cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; } @@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { struct cgroup *cgrp; + struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int err = 0; struct cgroup_subsys *ss; @@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + name = cgroup_alloc_name(dentry); + if (!name) + goto err_free_cgrp; + rcu_assign_pointer(cgrp->name, name); + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); if (cgrp->id < 0) - goto err_free_cgrp; + goto err_free_name; /* * Only live parents can have children. Note that the liveliness @@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->parent = parent; cgrp->root = parent->root; - cgrp->top_cgroup = parent->top_cgroup; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) dget(dentry); + /* hold a ref to the parent's dentry */ + dget(parent->dentry); + /* creation succeeded, notify subsystems */ for_each_subsys(root, ss) { err = online_css(ss, cgrp); @@ -4276,6 +4229,8 @@ err_free_all: deactivate_super(sb); err_free_id: ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); return err; @@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -/* - * Check the reference count on each subsystem. Since we already - * established that there are no tasks in the cgroup, if the css refcount - * is also 1, then there should be no outstanding references, so the - * subsystem is safe to destroy. We scan across all subsystems rather than - * using the per-hierarchy linked list of mounted subsystems since we can - * be called via check_for_release() with no synchronization other than - * RCU, and the subsystem linked list isn't RCU-safe. - */ -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ - int i; - - /* - * We won't need to lock the subsys array, because the subsystems - * we're concerned about aren't going anywhere since our cgroup root - * has a reference on them. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - struct cgroup_subsys_state *css; - - /* Skip subsystems not present or not in this hierarchy */ - if (ss == NULL || ss->root != cgrp->root) - continue; - - css = cgrp->subsys[ss->subsys_id]; - /* - * When called from check_for_release() it's possible - * that by this point the cgroup has been removed - * and the css deleted. But a false-positive doesn't - * matter, since it can only happen if the cgroup - * has been deleted and hence no longer needs the - * release agent to be called anyway. 
- */ - if (css && css_refcnt(css) > 1) - return 1; - } - return 0; -} - static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; struct cgroup *parent = cgrp->parent; - DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - LIST_HEAD(tmp_list); lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4468,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - ss->active = 1; BUG_ON(online_css(ss, dummytop)); mutex_unlock(&cgroup_mutex); @@ -4573,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ss->active = 1; ret = online_css(ss, dummytop); if (ret) goto err_unload; @@ -4614,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); offline_css(ss, dummytop); - ss->active = 0; if (ss->use_id) idr_destroy(&ss->idr); @@ -4935,17 +4844,17 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, and the builtin section of the subsys + * array is immutable, so we don't need to lock the + * subsys array here. On the other hand, modular section + * of the array can be freed at module unload, so we + * can't touch that. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* - * fork/exit callbacks are supported only for - * builtin subsystems and we don't need further - * synchronization as they never go away. - */ - if (!ss || ss->module) - continue; - if (ss->fork) ss->fork(child); } @@ -5010,13 +4919,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, see cgroup_post_fork() for details. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* modular subsystems can't use callbacks */ - if (!ss || ss->module) - continue; - if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5030,44 +4939,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. 
- */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ - int ret; - struct cgroup *target; - - if (cgrp == dummytop) - return 1; - - target = task_cgroup_from_root(task, cgrp->root); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; -} - static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup * structure alive */ - if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { - /* Control Group is currently removeable. If it's not + if (cgroup_is_releasable(cgrp) && + !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + /* + * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue - * it now */ + * it now + */ int need_schedule_work = 0; + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { @@ -5100,24 +4984,11 @@ EXPORT_SYMBOL_GPL(__css_tryget); /* Caller must verify that the css is not for root cgroup */ void __css_put(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = css->cgroup; int v; - rcu_read_lock(); v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - - switch (v) { - case 1: - if (notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - break; - case 0: + if (v == 0) schedule_work(&css->dput_work); - break; - } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(__css_put); @@ -5416,55 +5287,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). 
- */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile new file mode 100644 index 000000000000..59ab052ef7a0 --- /dev/null +++ b/kernel/cpu/Makefile @@ -0,0 +1 @@ +obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c new file mode 100644 index 000000000000..8b86c0c68edf --- /dev/null +++ b/kernel/cpu/idle.c @@ -0,0 +1,116 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/tick.h> +#include <linux/mm.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + cpu_idle_poll(); + } else { + current_clr_polling(); + if (!need_resched()) { + stop_critical_timings(); + rcu_idle_enter(); + arch_cpu_idle(); + WARN_ON_ONCE(irqs_disabled()); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + current_set_polling(); + } + arch_cpu_idle_exit(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..12331120767c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* - * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist - * buffers. 
They are statically allocated to prevent using excess stack - * when calling cpuset_print_task_mems_allowed(). - */ -#define CPUSET_NAME_LEN (128) -#define CPUSET_NODELIST_LEN (256) -static char cpuset_name[CPUSET_NAME_LEN]; -static char cpuset_nodelist[CPUSET_NODELIST_LEN]; -static DEFINE_SPINLOCK(cpuset_buffer_lock); - -/* * CPU / memory hotplug is handled asynchronously. */ static struct workqueue_struct *cpuset_propagate_hotplug_wq; @@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void) lockdep_assert_held(&cpuset_mutex); get_online_cpus(); + /* + * We have raced with CPU hotplug. Don't do anything to avoid + * passing doms with offlined cpu to partition_sched_domains(). + * Anyways, hotplug work item will rebuild sched domains. + */ + if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + goto out; + /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - +out: put_online_cpus(); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) { } - -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) -{ - *domains = NULL; - return 1; -} #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) @@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) @@ -2005,50 +1995,6 @@ int __init cpuset_init(void) return 0; } -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. - */ -static void cpuset_do_move_task(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - struct cgroup *new_cgroup = scan->data; - - cgroup_lock(); - cgroup_attach_task(new_cgroup, tsk); - cgroup_unlock(); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cpuset_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. 
- */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ - struct cgroup_scanner scan; - - scan.cg = from->css.cgroup; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cpuset_do_move_task; - scan.heap = NULL; - scan.data = to->css.cgroup; - - if (cgroup_scan_tasks(&scan)) - printk(KERN_ERR "move_member_tasks_to_cpuset: " - "cgroup_scan_tasks failed\n"); -} - /* * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent); - move_member_tasks_to_cpuset(cs, parent); + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + rcu_read_lock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", + cgroup_name(cs->css.cgroup)); + rcu_read_unlock(); + } } /** @@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) flush_workqueue(cpuset_propagate_hotplug_wq); /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - mutex_lock(&cpuset_mutex); - ndoms = generate_sched_domains(&doms, &attr); - mutex_unlock(&cpuset_mutex); - - partition_sched_domains(ndoms, doms, attr); - } + if (cpus_updated) + rebuild_sched_domains(); } void cpuset_update_active_cpus(bool cpu_online) @@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); cpuset_propagate_hotplug_wq = alloc_ordered_workqueue("cpuset_hotplug", 0); @@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +#define CPUSET_NODELIST_LEN (256) + /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed * @task: pointer to task_struct of some task. @@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { - struct dentry *dentry; + /* Statically allocated to prevent using excess stack. 
*/ + static char cpuset_nodelist[CPUSET_NODELIST_LEN]; + static DEFINE_SPINLOCK(cpuset_buffer_lock); - dentry = task_cs(tsk)->css.cgroup->dentry; - spin_lock(&cpuset_buffer_lock); + struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - if (!dentry) { - strcpy(cpuset_name, "/"); - } else { - spin_lock(&dentry->d_lock); - strlcpy(cpuset_name, (const char *)dentry->d_name.name, - CPUSET_NAME_LEN); - spin_unlock(&dentry->d_lock); - } + rcu_read_lock(); + spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cpuset_name, cpuset_nodelist); + tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); + rcu_read_unlock(); } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d3124b39277..3820e3cefbae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -37,6 +37,7 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> +#include <linux/cgroup.h> #include "internal.h" @@ -234,6 +235,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF /* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. @@ -251,7 +266,22 @@ perf_cgroup_match(struct perf_event *event) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - return !event->cgrp || event->cgrp == cpuctx->cgrp; + /* @event doesn't care about cgroup */ + if (!event->cgrp) + return true; + + /* wants specific cgroup scope but @cpuctx isn't associated with any */ + if (!cpuctx->cgrp) + return false; + + /* + * Cgroup scoping is recursive. An event enabled for a cgroup is + * also enabled for all its descendant cgroups. If @cpuctx's + * cgroup is a descendant of @event's (the test covers identity + * case), it's a match. 
+ */ + return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, + event->cgrp->css.cgroup); } static inline bool perf_tryget_cgroup(struct perf_event *event) @@ -961,9 +991,15 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -4178,6 +4214,12 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, @@ -4596,6 +4638,7 @@ void perf_event_comm(struct task_struct *task) struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (!ctx) @@ -4603,6 +4646,7 @@ void perf_event_comm(struct task_struct *task) perf_event_enable_on_exec(ctx); } + rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; @@ -4765,6 +4809,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); @@ -7515,12 +7562,5 @@ struct cgroup_subsys perf_subsys = { .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, - - /* - * perf_event cgroup doesn't handle nesting correctly. - * ctx->nr_cgroups adjustments should be propagated through the - * cgroup hierarchy. Fix it and remove the following. - */ - .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c7ef31..f3569747d629 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). 
+ */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); - memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); + kunmap_atomic(kaddr); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } @@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; - copy_opcode(page, vaddr, &old_opcode); + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; int ret; @@ -246,15 +284,8 @@ retry: __SetPageUptodate(new_page); - /* copy the page now that we've got it stable */ - vaddr_old = kmap_atomic(old_page); - vaddr_new = kmap_atomic(new_page); - - memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - - kunmap_atomic(vaddr_new); - kunmap_atomic(vaddr_old); + copy_highpage(new_page, old_page); + copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = anon_vma_prepare(vma); if (ret) @@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, loff_t offset) { struct page *page; - void *vaddr; - unsigned long off; - pgoff_t idx; - - if (!filp) - return -EINVAL; if (!mapping->a_ops->readpage) return -EIO; - - idx = offset >> PAGE_CACHE_SHIFT; - off = offset & ~PAGE_MASK; - /* * Ensure that the page that has the original instruction is * populated and in page-cache. 
*/ - page = read_mapping_page(mapping, idx, filp); + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); if (IS_ERR(page)) return PTR_ERR(page); - vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off, nbytes); - kunmap_atomic(vaddr); + copy_from_page(page, offset, insn, nbytes); page_cache_release(page); return 0; @@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || @@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) + return -EINVAL; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) loff_t offset; if (!valid_vma(vma, false) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (no_uprobe_events() || !valid_vma(vma, true)) return 0; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); if (!inode) return 0; @@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e struct inode *inode; struct rb_node *n; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; @@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; area = mm->uprobes_state.xol_area; if (area) @@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; + /* allocate first slot of task's xol_area for the return probes */ + set_bit(0, area->bitmap); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) return area; @@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; - unsigned long offset; unsigned long xol_vaddr; - void *vaddr; area = get_xol_area(); if (!area) @@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - offset = xol_vaddr & ~PAGE_MASK; - vaddr = kmap_atomic(area->page); - memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); - kunmap_atomic(vaddr); + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. 
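[Editorial aside; not part of the commit text above or below.] The get_xol_area() and xol_take_insn_slot() hunks just shown manage a single per-mm page of "execute out of line" slots through a bitmap, and the uretprobe change permanently claims slot 0 of that page for the return-probe trampoline. What follows is a minimal, single-threaded user-space sketch of that bookkeeping only; the names (xol_model, model_take_slot, ...), the 16-byte slot size and the 0xcc trampoline byte are illustrative assumptions, and the real code additionally uses atomic bitops, a slot count and a waitqueue.

/*
 * Minimal user-space model of the XOL slot bookkeeping above.  Names,
 * the 16-byte slot size and the 0xcc "trampoline" byte are illustrative
 * assumptions, not the kernel's.
 */
#include <limits.h>
#include <stdio.h>
#include <string.h>

#define AREA_SIZE	4096			/* one "page" of slots */
#define SLOT_SIZE	16			/* bytes reserved per probed insn */
#define NR_SLOTS	(AREA_SIZE / SLOT_SIZE)
#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))

struct xol_model {
	unsigned long bitmap[(NR_SLOTS + BITS_PER_LONG - 1) / BITS_PER_LONG];
	unsigned char page[AREA_SIZE];
};

/* Non-atomic stand-in for test_and_set_bit(); returns the old bit value. */
static int test_and_set_slot(struct xol_model *a, int slot)
{
	unsigned long mask = 1UL << (slot % BITS_PER_LONG);
	unsigned long *word = &a->bitmap[slot / BITS_PER_LONG];
	int was_set = !!(*word & mask);

	*word |= mask;
	return was_set;
}

/* Loosely models get_xol_area(): slot 0 is claimed up front for the trampoline. */
static void model_init(struct xol_model *a, const void *tramp, size_t len)
{
	memset(a, 0, sizeof(*a));
	test_and_set_slot(a, 0);
	memcpy(a->page, tramp, len);
}

/* Loosely models xol_take_insn_slot(): claim the first free slot, return its offset. */
static long model_take_slot(struct xol_model *a)
{
	int slot;

	for (slot = 0; slot < NR_SLOTS; slot++)
		if (!test_and_set_slot(a, slot))
			return (long)slot * SLOT_SIZE;
	return -1;		/* full; the kernel would sleep on area->wq instead */
}

int main(void)
{
	static const unsigned char swbp = 0xcc;	/* breakpoint byte, e.g. x86 int3 */
	struct xol_model area;

	model_init(&area, &swbp, sizeof(swbp));
	printf("first probe slot at offset %ld\n", model_take_slot(&area));	/* 16 */
	printf("second probe slot at offset %ld\n", model_take_slot(&area));	/* 32 */
	return 0;
}

Running the sketch prints offsets 16 and 32, showing how slot 0 stays reserved for the trampoline while ordinary probed instructions get the remaining slots.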
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void) return current->utask; } +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ + struct xol_area *area; + unsigned long trampoline_vaddr = -1; + + area = current->mm->uprobes_state.xol_area; + smp_read_barrier_depends(); + if (area) + trampoline_vaddr = area->vaddr; + + return trampoline_vaddr; +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; + struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; + + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; + + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + utask->depth++; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); +} + /* Prepare to single-step probed instruction out of line. 
*/ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_opcode(page, vaddr, &opcode); + copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe(inode, offset); } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } @@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - int rc = uc->handler(uc, regs); + int rc = 0; + + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } + + if (uc->ret_handler) + need_prep = true; - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); remove &= rc; } + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (remove && uprobe->consumers) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); @@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ + struct uprobe *uprobe = ri->uprobe; + struct uprobe_consumer *uc; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (uc->ret_handler) + uc->ret_handler(uc, ri->func, regs); + } + up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *tmp; + bool chained; + + utask = current->utask; + if (!utask) + return false; + + ri = utask->return_instances; + if (!ri) + return false; + + /* + * TODO: we should throw out return_instance's invalidated by + * longjmp(), currently we assume that the probed function always + * returns. + */ + instruction_pointer_set(regs, ri->orig_ret_vaddr); + + for (;;) { + handle_uretprobe_chain(ri, regs); + + chained = ri->chained; + put_uprobe(ri->uprobe); + + tmp = ri; + ri = ri->next; + kfree(tmp); + + if (!chained) + break; + + utask->depth--; + + BUG_ON(!ri); + } + + utask->return_instances = ri; + + return true; +} + /* * Run handler and ask thread to singlestep. 
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + if (bp_vaddr == get_trampoline_vaddr()) { + if (handle_trampoline(regs)) + return; + + pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + } + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); diff --git a/kernel/extable.c b/kernel/extable.c index fe35a634bf76..67460b93b1a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) + if (main_extable_sort_needed) { + pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); - else - pr_notice("__ex_table already sorted, skipping sort\n"); + } } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index 1766d324d5e3..339f60dfd62b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 14be27feda49..609d8ff38b74 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -84,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -91,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -107,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -116,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -276,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, 
NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } @@ -652,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* @@ -1011,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1028,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1310,6 +1328,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < expires_next.tv64) expires_next = expires; break; diff --git a/kernel/kexec.c b/kernel/kexec.c index ffd4e111fd67..b574920cbd4b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -1581,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1619,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kthread.c b/kernel/kthread.c index 9eb7fed0bbaa..16d8ddd268b1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - 
should this kthread return now? @@ -265,7 +278,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** @@ -311,19 +324,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) -{ - struct kthread *kthread; - - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; -} - static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); @@ -350,11 +350,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); if (kthread) __kthread_unpark(k, kthread); - put_task_struct(k); } /** @@ -371,7 +370,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -384,7 +383,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -405,10 +403,13 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); __kthread_unpark(k, kthread); @@ -416,10 +417,9 @@ int kthread_stop(struct task_struct *k) wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8a0efac4f99d..6a3bccba7e7d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class) unsigned long nr_stack_trace_entries; static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ + printk(KERN_DEBUG "%s\n", bug_msg); + printk(KERN_DEBUG "turning off the locking correctness validator.\n"); + printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} + static int save_trace(struct stack_trace *trace) { trace->nr_entries = 0; @@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace) if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); return 0; @@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) } raw_local_irq_restore(flags); - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); return NULL; } @@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the 
locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); return NULL; } @@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; + struct held_lock *hlock_curr; int i, j; /* @@ -2048,8 +2052,7 @@ cache_hit: if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); return 0; } @@ -2057,12 +2060,10 @@ cache_hit: chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; /* Find the first held_lock of current chain */ - hlock_next = hlock; for (i = curr->lockdep_depth - 1; i >= 0; i--) { hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock_next->irq_context) + if (hlock_curr->irq_context != hlock->irq_context) break; - hlock_next = hlock; } i++; chain->depth = curr->lockdep_depth + 1 - i; @@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); + printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); - printk("turning off the locking correctness validator.\n"); lockdep_print_held_locks(current); debug_show_all_locks(); diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..ad53a664f113 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -37,6 +37,12 @@ # include <asm/mutex.h> #endif +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. + */ +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + lock->spin_mlock = NULL; +#endif debug_mutex_init(lock, name, key); } @@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. 
+ */ +struct mspin_node { + struct mspin_node *next ; + int locked; /* 1 if lock acquired */ +}; +#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + smp_wmb(); + /* Wait until the lock holder passes the lock down */ + while (!ACCESS_ONCE(node->locked)) + arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + ACCESS_ONCE(next->locked) = 1; + smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ + int retval = 1; + + rcu_read_lock(); + if (lock->owner) + retval = lock->owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** @@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * * We can't do this for DEBUG_MUTEXES because that relies on wait_lock * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. */ + if (!mutex_can_spin_on_owner(lock)) + goto slowpath; for (;;) { struct task_struct *owner; + struct mspin_node node; /* * If there's an owner, wait for it to either * release the lock or go to sleep. 
*/ + mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) + if (owner && !mutex_spin_on_owner(lock, owner)) { + mspin_unlock(MLOCK(lock), &node); break; + } - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + if ((atomic_read(&lock->count) == 1) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); + mspin_unlock(MLOCK(lock), &node); preempt_enable(); return 0; } + mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } +slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) goto done; lock_contended(&lock->dep_map, ip); @@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && + (atomic_xchg(&lock->count, -1) == 1)) break; /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c55c22..424c2d4265c9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/idr.h> +#include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> +#include <linux/hashtable.h> /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id <id> - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to <ptr> - * void idr_remove(struct idr *idp, int id); to release <id> - * void idr_init(struct idr *idp); to initialize <idp> - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. 
*/ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +309,16 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,11 +336,11 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + 
spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - - idr_preload(GFP_KERNEL); - spin_lock_irq(&idr_lock); - error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); - spin_unlock_irq(&idr_lock); - idr_preload_end(); - if (error < 0) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - if (error == -ENOSPC) - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } - new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { diff --git a/kernel/printk.c b/kernel/printk.c index abbdd9e2ac82..376914e2869d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -49,13 +49,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/printk.h> -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -608,7 +601,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; - ret = POLLIN|POLLRDNORM; + else + ret = POLLIN|POLLRDNORM; } raw_spin_unlock_irq(&logbuf_lock); @@ -1265,7 +1259,7 @@ static void call_console_drivers(int level, const char *text, size_t len) { struct console *con; - trace_console(text, 0, len, len); + trace_console(text, len); if (level >= console_loglevel && !ignore_loglevel) return; @@ -1723,6 +1717,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..d8534308fd05 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ + .abbr = sabbr, \ } struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rsp)) + return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) @@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp) { int i; + if (init_nocb_callback_list(rdp)) + return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - init_nocb_callback_list(rdp); } /* @@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, } /* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. + * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. 
+ */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. + */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the needed for the required grace period is already + * recorded, trace and leave. + */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + } else { + trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + return needmore; +} + +/* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any * callbacks that were previously assigned a ->completed number that has @@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtcompleted[i] = c; } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); /* Trace depending on how much we were able to accelerate. 
*/ if (!*rdp->nxttail[RCU_WAIT_TAIL]) @@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; WARN_ON_ONCE(rnp->completed != rsp->completed); - rnp->completed = rsp->completed; + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0) + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); @@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; + ACCESS_ONCE(rnp->completed) = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __rcu_process_gp_end(rsp, rnp, rdp); + nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); @@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg) /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled. * * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. */ static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period * task, this CPU does not need another grace period, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - - /* - * Because there is no grace period in progress right now, - * any callbacks we have up to this point will be satisfied - * by the next grace period. So this is a good place to - * assign a grace period number to recently posted callbacks. 
- */ - rcu_accelerate_cbs(rsp, rnp, rdp); - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ - - /* Ensure that CPU is aware of completion of last grace period. */ - rcu_process_gp_end(rsp, rdp); - local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } /* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) @@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ - rcu_start_gp(rsp, flags); /* releases above lock */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); } else { local_irq_restore(flags); } @@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void invoke_rcu_core(void) { - raise_softirq(RCU_SOFTIRQ); + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } /* @@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) 
*/ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? */ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. */ @@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - if (nocb_cpu_expendable(cpu)) - rcu_boost_kthread_setaffinity(rnp, cpu); - else - ret = NOTIFY_BAD; + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. - */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return ret; + return NOTIFY_OK; } /* @@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); } } @@ -3170,8 +3305,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - rcu_init_nocb(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..14ee40795d6f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,18 +88,13 @@ struct rcu_dynticks { int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ #ifdef CONFIG_RCU_FAST_NO_HZ - int dyntick_drain; /* Prepare-for-idle state variable. */ - unsigned long dyntick_holdoff; - /* No retries for the jiffy of failure. */ - struct timer_list idle_gp_timer; - /* Wake up CPU sleeping with callbacks. */ - unsigned long idle_gp_timer_expires; - /* When to wake up CPU (for repost). */ - bool idle_first_pass; /* First pass of attempt to go idle? */ + bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. 
*/ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -134,9 +129,6 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ - /* Since this has meaning only for leaf */ - /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -196,6 +188,12 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU + wait_queue_head_t nocb_gp_wq[2]; + /* Place for rcu_nocb_kthread() to wait GP. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int need_future_gp[2]; + /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; @@ -328,6 +326,11 @@ struct rcu_data { struct task_struct *nocb_kthread; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned int softirq_snap; /* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + int cpu; struct rcu_state *rsp; }; @@ -375,12 +378,6 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); -#ifdef CONFIG_RCU_NOCB_CPU - void (*call_remote)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - /* call_rcu() flavor, but for */ - /* placing on remote CPU. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -443,6 +440,7 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. 
*/ }; @@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); @@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp); static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp); -static bool nocb_cpu_expendable(int cpu); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void init_nocb_callback_list(struct rcu_data *rdp); -static void __init rcu_init_nocb(void); +static bool init_nocb_callback_list(struct rcu_data *rdp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..d084ae3f281c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void) if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE + if (!have_rcu_nocb_mask) { + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + } +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tExperimental no-CBs CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tExperimental no-CBs for all CPUs\n"); + cpumask_setall(rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { - if (cpumask_test_cpu(0, rcu_nocb_mask)) { - cpumask_clear_cpu(0, rcu_nocb_mask); - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); - } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. 
- */ -static void rcu_prepare_for_idle_init(int cpu) -{ + return rcu_cpu_has_callbacks(cpu, NULL); } /* @@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void) * * The following three proprocessor symbols control this state machine: * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This * is sized to be roughly one RCU grace period. Those energy-efficiency @@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void) * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -extern int tick_nohz_enabled; - -/* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU? Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ - return rdp->qlen != rdp->qlen_lazy; -} +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); -#ifdef CONFIG_TREE_PREEMPT_RCU +extern int tick_nohz_enabled; /* - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void) { - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ - return 0; -} + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 
- */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_cpu_has_nonlazy_callbacks(cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; } /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks. The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU. This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - *delta_jiffies = ULONG_MAX; + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; return 0; } - if (rdtp->dyntick_holdoff == jiffies) { - /* RCU recently tried and failed, so don't try again. */ - *delta_jiffies = 1; + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); return 1; } - /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, - RCU_IDLE_GP_DELAY) - jiffies; + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. */ + if (rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } return 0; } /* - * Handler for smp_call_function_single(). The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ - trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. 
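The reworked rcu_needs_cpu() above turns the chosen idle delay into a wakeup time that is a multiple of that delay, so idle CPUs holding callbacks tend to be woken in batches rather than one at a time. A minimal userspace sketch of that arithmetic; the round_up() below re-implements the kernel macro for power-of-two alignment, and the jiffies value is made up:

#include <stdio.h>

/* Power-of-two round-up, matching the effect of the kernel's round_up(). */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long jiffies = 10007;	/* hypothetical current time */
	unsigned long delay = 4;	/* e.g. rcu_idle_gp_delay */
	unsigned long dj = round_up(jiffies + delay, delay) - jiffies;

	/* The wakeup lands on a multiple of the delay: 10012 here. */
	printf("sleep %lu jiffies, wake at %lu\n", dj, jiffies + dj);
	return 0;
}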
The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on. In this case, we must wake up - * that CPU. We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ - int cpu = (int)cpu_in; - - trace_rcu_prep_idle("Timer"); - if (cpu != smp_processor_id()) - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); - else - WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dyntick_holdoff = jiffies - 1; - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); - rdtp->idle_gp_timer_expires = jiffies - 1; - rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. - */ -static void rcu_cleanup_after_idle(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - del_timer(&rdtp->idle_gp_timer); - trace_rcu_prep_idle("Cleanup after idle"); - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { - struct timer_list *tp; + struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; int tne; /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu)) + if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. 
*/ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; - /* Adaptive-tick mode, where usermode execution is idle to RCU. */ - if (!is_idle_task(current)) { - rdtp->dyntick_holdoff = jiffies - 1; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("User dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else if (rcu_cpu_has_callbacks(cpu)) { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("User dyntick with lazy callbacks"); - } else { - return; - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (is_nocb_cpu(cpu)) return; - } /* - * If this is an idle re-entry, for example, due to use of - * RCU_NONIDLE() or the new idle-loop tracing API within the idle - * loop, then don't take any state-machine actions, unless the - * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the ->idle_gp_timer if this CPU has callbacks - * pending. + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. */ - if (!rdtp->idle_first_pass && - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { - if (rcu_cpu_has_callbacks(cpu)) { - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - } + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + invoke_rcu_core(); return; } - rdtp->idle_first_pass = 0; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - rdtp->dyntick_holdoff = jiffies - 1; - rdtp->dyntick_drain = 0; - trace_rcu_prep_idle("No callbacks"); + if (rdtp->last_accelerate == jiffies) return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +} - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. - */ - if (rdtp->dyntick_holdoff == jiffies) { - trace_rcu_prep_idle("In holdoff"); - return; - } +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_state *rsp; - /* Check and update the ->dyntick_drain sequencing. */ - if (rdtp->dyntick_drain <= 0) { - /* First time through, initialize the counter. */ - rdtp->dyntick_drain = RCU_IDLE_FLUSHES; - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu) && - !local_softirq_pending()) { - /* Can we go dyntick-idle despite still having callbacks? 
*/ - rdtp->dyntick_drain = 0; - rdtp->dyntick_holdoff = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("Dyntick with lazy callbacks"); - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - return; /* Nothing more to do immediately. */ - } else if (--(rdtp->dyntick_drain) <= 0) { - /* We have hit the limit, so time to give up. */ - rdtp->dyntick_holdoff = jiffies; - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + if (is_nocb_cpu(cpu)) return; - } - - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - if (per_cpu(rcu_sched_data, cpu).nxtlist) { - rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); - } - if (per_cpu(rcu_bh_data, cpu).nxtlist) { - rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); - } - - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. - */ - if (rcu_cpu_has_callbacks(cpu)) { - trace_rcu_prep_idle("More callbacks"); - invoke_rcu_core(); - } else { - trace_rcu_prep_idle("Callbacks drained"); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); } } @@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier); static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct timer_list *tltp = &rdtp->idle_gp_timer; - char c; + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; - if (timer_pending(tltp)) - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, c, tltp->expires - jiffies); - else - sprintf(cp, "drain=%d %c timer not pending", - rdtp->dyntick_drain, c); + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), fast_no_hz); } @@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void) static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); } /* Increment ->ticks_this_gp for all flavors of RCU. 
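The new print_cpu_stall_fast_no_hz() above changes what a CONFIG_RCU_FAST_NO_HZ stall warning prints for each CPU. A small userspace decoder of that suffix, reusing the same format string with made-up values, may help when reading such warnings:

#include <stdio.h>

int main(void)
{
	/* Made-up values standing in for the rcu_dynticks fields. */
	unsigned long last_accelerate = 0x1a2b3c;
	unsigned long now = 0x1a2f00;	/* jiffies at warning time */
	long nonlazy_delta = 7;		/* non-lazy CBs posted since idle entry */
	int all_lazy = 0;
	int nohz_enabled_snap = 1;

	/*
	 * Same layout as the kernel sprintf(): low 16 bits of the last
	 * callback-acceleration time and of jiffies, the non-lazy delta,
	 * then 'L' if all queued callbacks were lazy and 'D' if nohz was
	 * seen disabled.
	 */
	printf("last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c\n",
	       last_accelerate & 0xffff, now & 0xffff, nonlazy_delta,
	       all_lazy ? 'L' : '.', nohz_enabled_snap ? '.' : 'D');
	return 0;
}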
*/ @@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); +} + +/* + * Set the root rcu_node structure's ->need_future_gp field + * based on the sum of those of all rcu_node structures. This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. + */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); +} + /* Is the specified CPU a no-CPUs CPU? */ static bool is_nocb_cpu(int cpu) { @@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, if (!is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + if (__is_kfree_rcu_offset((unsigned long)rhp->func)) + trace_rcu_kfree_callback(rdp->rsp->name, rhp, + (unsigned long)rhp->func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rdp->rsp->name, rhp, + rdp->qlen_lazy, rdp->qlen); return 1; } @@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, } /* - * There must be at least one non-no-CBs CPU in operation at any given - * time, because no-CBs CPUs are not capable of initiating grace periods - * independently. This function therefore complains if the specified - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to - * avoid offlining the last such CPU. (Recursion is a wonderful thing, - * but you have to have a base case!) + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete. */ -static bool nocb_cpu_expendable(int cpu) +static void rcu_nocb_wait_gp(struct rcu_data *rdp) { - cpumask_var_t non_nocb_cpus; - int ret; + unsigned long c; + bool d; + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + c = rcu_start_future_gp(rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, - * then offlining this CPU is harmless. Let it happen. + * Wait for the grace period. Do so interruptibly to avoid messing + * up the load average. */ - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) - return 1; - - /* If no memory, play it safe and keep the CPU around. 
*/ - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) - return 0; - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); - cpumask_clear_cpu(cpu, non_nocb_cpus); - ret = !cpumask_empty(non_nocb_cpus); - free_cpumask_var(non_nocb_cpus); - return ret; -} - -/* - * Helper structure for remote registry of RCU callbacks. - * This is needed for when a no-CBs CPU needs to start a grace period. - * If it just invokes call_rcu(), the resulting callback will be queued, - * which can result in deadlock. - */ -struct rcu_head_remote { - struct rcu_head *rhp; - call_rcu_func_t *crf; - void (*func)(struct rcu_head *rhp); -}; - -/* - * Register a callback as specified by the rcu_head_remote struct. - * This function is intended to be invoked via smp_call_function_single(). - */ -static void call_rcu_local(void *arg) -{ - struct rcu_head_remote *rhrp = - container_of(arg, struct rcu_head_remote, rhp); - - rhrp->crf(rhrp->rhp, rhrp->func); -} - -/* - * Set up an rcu_head_remote structure and the invoke call_rcu_local() - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via - * smp_call_function_single(). - */ -static void invoke_crf_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp), - call_rcu_func_t crf) -{ - struct rcu_head_remote rhr; - - rhr.rhp = rhp; - rhr.crf = crf; - rhr.func = func; - smp_call_function_single(0, call_rcu_local, &rhr, 1); -} - -/* - * Helper functions to be passed to wait_rcu_gp(), each of which - * invokes invoke_crf_remote() to register a callback appropriately. - */ -static void __maybe_unused -call_rcu_preempt_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu); -} -static void call_rcu_bh_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_bh); -} -static void call_rcu_sched_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_sched); + trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + for (;;) { + wait_event_interruptible( + rnp->nocb_gp_wq[c & 0x1], + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + if (likely(d)) + break; + flush_signals(current); + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + } + trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + smp_mb(); /* Ensure that CB invocation happens after GP end. */ } /* @@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg) cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); ACCESS_ONCE(rdp->nocb_p_count) += c; ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - wait_rcu_gp(rdp->rsp->call_remote); + rcu_nocb_wait_gp(rdp); /* Each pass through the following loop invokes a callback. 
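rcu_nocb_wait_gp() above parks the no-CBs kthread on one of the two per-rcu_node wait queues, chosen by the parity of the requested grace-period number, and tests completion with the wrap-safe ULONG_CMP_GE(). A small userspace check of that comparison; the macro is assumed to match the usual definition in include/linux/rcupdate.h, and the counter values are made up:

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a >= b" for free-running unsigned long counters. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long completed = ULONG_MAX - 1;	/* about to wrap */
	unsigned long c = completed + 3;		/* requested GP, past the wrap */

	/* Not done yet: c is logically ahead even though numerically smaller. */
	printf("done? %d\n", ULONG_CMP_GE(completed, c));

	completed += 3;					/* grace period c completes */
	printf("done? %d\n", ULONG_CMP_GE(completed, c));
	return 0;
}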
*/ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2436,32 +2270,41 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) return; for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + t = kthread_run(rcu_nocb_kthread, rdp, + "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); ACCESS_ONCE(rdp->nocb_kthread) = t; } } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp) { if (rcu_nocb_mask == NULL || !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) - return; + return false; rdp->nxttail[RCU_NEXT_TAIL] = NULL; + return true; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + return 0; } -/* Initialize the ->call_remote fields in the rcu_state structures. */ -static void __init rcu_init_nocb(void) +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { -#ifdef CONFIG_PREEMPT_RCU - rcu_preempt_state.call_remote = call_rcu_preempt_remote; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - rcu_bh_state.call_remote = call_rcu_bh_remote; - rcu_sched_state.call_remote = call_rcu_sched_remote; } -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} static bool is_nocb_cpu(int cpu) { @@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, return 0; } -static bool nocb_cpu_expendable(int cpu) -{ - return 1; -} - static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } @@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } -static void init_nocb_callback_list(struct rcu_data *rdp) -{ -} - -static void __init rcu_init_nocb(void) +static bool init_nocb_callback_list(struct rcu_data *rdp) { + return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..49099e81c87b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b9..d7386986e10e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> +#include <linux/mm.h> #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. 
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +783,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. 
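alloc_resource() and free_resource() above keep a free list of resource entries whose backing page fails the PageSlab() test, since such boot-time allocations cannot go back through kfree(); they are remembered and handed out again first. A rough userspace model of that reuse, with a plain flag standing in for the PageSlab() check and no locking:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct resource {
	struct resource *sibling;
	int from_bootmem;	/* stand-in for !PageSlab(virt_to_head_page(res)) */
};

static struct resource *bootmem_resource_free;	/* singly linked via ->sibling */

static void free_resource(struct resource *res)
{
	if (!res)
		return;
	if (res->from_bootmem) {		/* cannot be freed: remember it */
		res->sibling = bootmem_resource_free;
		bootmem_resource_free = res;
	} else {
		free(res);
	}
}

static struct resource *alloc_resource(void)
{
	struct resource *res = bootmem_resource_free;

	if (res) {				/* reuse a remembered entry first */
		bootmem_resource_free = res->sibling;
		memset(res, 0, sizeof(*res));
		res->from_bootmem = 1;
	} else {
		res = calloc(1, sizeof(*res));
	}
	return res;
}

int main(void)
{
	static struct resource boot_entry = { .from_bootmem = 1 };

	free_resource(&boot_entry);
	printf("reused boot entry: %d\n", alloc_resource() == &boot_entry);
	return 0;
}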
+ */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } @@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - kfree(res); + free_resource(res); res = NULL; break; } @@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. 
+ */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + free_resource(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + free_resource(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a7..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/stat.h> #include "rtmutex.h" @@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at return curr - buf; } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); static struct bus_type rttest_subsys = { .name = "rttest", diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67d04651f44b..c70a8814a767 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. 
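release_mem_region_adjustable() above distinguishes four cases for a busy entry that contains the released range: exact match, trim at the start, trim at the end, and a split into two entries. A minimal userspace sketch of that range arithmetic; the names and addresses here are illustrative only:

#include <stdio.h>

struct range { unsigned long start, end; };

static void release_from(struct range *busy, unsigned long start,
			 unsigned long end, struct range *second, int *nranges)
{
	*nranges = 1;
	if (busy->start == start && busy->end == end) {
		*nranges = 0;			/* free the whole entry */
	} else if (busy->start == start) {
		busy->start = end + 1;		/* adjust the start */
	} else if (busy->end == end) {
		busy->end = start - 1;		/* adjust the end */
	} else {				/* split into two entries */
		second->start = end + 1;
		second->end = busy->end;
		busy->end = start - 1;
		*nranges = 2;
	}
}

int main(void)
{
	struct range busy = { 0x100000, 0x4fffff }, second;
	int n;

	/* Hot-remove the middle chunk [0x200000, 0x2fffff]. */
	release_from(&busy, 0x200000, 0x2fffff, &second, &n);
	printf("%d ranges left: [%#lx-%#lx]", n, busy.start, busy.end);
	if (n == 2)
		printf(" [%#lx-%#lx]", second.start, second.end);
	printf("\n");
	return 0;
}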
*/ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; @@ -1288,8 +1283,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); + trace_sched_wakeup(p, true); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2999,51 +2994,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -3084,11 +3034,13 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - user_exit(); + prev_state = exception_enter(); + do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -3102,6 +3054,8 @@ asmlinkage void __sched preempt_schedule_irq(void) */ barrier(); } while (need_resched()); + + exception_exit(prev_state); } #endif /* CONFIG_PREEMPT */ @@ -4128,6 +4082,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4775,11 +4733,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -6250,7 +6203,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ @@ -6863,11 +6816,15 @@ int in_sched_functions(unsigned long addr) } #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6904,7 +6861,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6930,12 +6887,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -7457,7 +7408,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7469,7 +7420,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7481,7 +7432,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7494,7 +7445,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7529,7 +7480,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -8037,226 +7988,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). 
- */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 
= cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..dbb7e2cd95eb --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ 
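The cpuacct_cpuusage_read()/cpuacct_cpuusage_write() helpers that follow take rq->lock only on !CONFIG_64BIT, because a 64-bit per-CPU counter cannot be loaded or stored in a single instruction on most 32-bit machines, so an unlocked reader could observe a torn value. A loose userspace analogue of the pattern, using a pthread mutex in place of rq->lock and a pointer-width test in place of CONFIG_64BIT (sketch only):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cpuusage;	/* shared 64-bit counter */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void usage_add(uint64_t delta)
{
	pthread_mutex_lock(&lock);	/* writers serialize, as under rq->lock */
	cpuusage += delta;
	pthread_mutex_unlock(&lock);
}

static uint64_t usage_read(void)
{
	uint64_t val;

#if UINTPTR_MAX == 0xffffffffu
	/* "32-bit": guard the 64-bit load so it cannot be torn. */
	pthread_mutex_lock(&lock);
	val = cpuusage;
	pthread_mutex_unlock(&lock);
#else
	/* "64-bit": mirrors the kernel's reliance on a single aligned load. */
	val = cpuusage;
#endif
	return val;
}

int main(void)
{
	usage_add(123456789ULL);
	printf("%llu\n", (unsigned long long)usage_read());
	return 0;
}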
+static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. 
+ */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = __parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..ed605624a5e7 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e93cca92f38b..ea32f02bf2c3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* @@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -/* - * Account a single tick of cpu time. 
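cpuacct_charge() and cpuacct_account_field() above walk the same parent chain but stop at different points: the former credits every level up to and including the root group, while the latter stops short of root_cpuacct because its caller, task_group_account_field() in cputime.c, has already updated the global kernel_cpustat. A tiny userspace model of the two walks, with made-up group names:

#include <stdio.h>

struct group {
	const char *name;
	unsigned long long usage;
	struct group *parent;		/* NULL at the root */
};

static struct group root = { "root", 0, NULL };
static struct group mid  = { "mid",  0, &root };
static struct group leaf = { "leaf", 0, &mid };

static void charge(struct group *g, unsigned long long delta)
{
	for (; g; g = g->parent)		/* root included */
		g->usage += delta;
}

static void account_field(struct group *g, unsigned long long delta)
{
	for (; g != &root; g = g->parent)	/* root handled by the caller */
		g->usage += delta;
}

int main(void)
{
	charge(&leaf, 10);
	account_field(&leaf, 1);
	printf("leaf=%llu mid=%llu root=%llu\n",
	       leaf.usage, mid.usage, root.usage);	/* 11 11 10 */
	return 0;
}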
- * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (vtime_accounting_enabled()) - return; - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - /* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} #ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_task_switch(struct task_struct *prev) @@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; -static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Account a single tick of cpu time. 
+ * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) { - u64 temp = (__force u64) rtime; + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); - temp *= (__force u64) stime; + if (vtime_accounting_enabled()) + return; + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); else - temp = div64_u64(temp, (__force u64) total); + account_idle_time(cputime_one_jiffy); +} - return (__force cputime_t) temp; +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +/* + * Perform (stime * rtime) / total with reduced chances + * of multiplication overflows by using smaller factors + * like quotient and remainders of divisions between + * rtime and total. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ + u64 rem, res, scaled; + + if (rtime >= total) { + /* + * Scale up to rtime / total then add + * the remainder scaled to stime / total. + */ + res = div64_u64_rem(rtime, total, &rem); + scaled = stime * res; + scaled += div64_u64(stime * rem, total); + } else { + /* + * Same in reverse: scale down to total / rtime + * then substract that result scaled to + * to the remaining part. 
+ */ + res = div64_u64_rem(total, rtime, &rem); + scaled = div64_u64(stime, res); + scaled -= div64_u64(scaled * rem, total); + } + + return (__force cputime_t) scaled; } /* @@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr, { cputime_t rtime, stime, total; + if (vtime_accounting_enabled()) { + *ut = curr->utime; + *st = curr->stime; + return; + } + stime = curr->stime; total = stime + curr->utime; @@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr, */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); - if (total) - stime = scale_stime(stime, rtime, total); - else + if (!rtime) { + stime = 0; + } else if (!total) { stime = rtime; + } else { + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + } /* * If the tick based count grows faster than the scheduler one, @@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static unsigned long long vtime_delta(struct task_struct *tsk) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e5986fc5..8bf7081b1ec5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); + s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) - min_vruntime = vruntime; + max_vruntime = vruntime; - return min_vruntime; + return max_vruntime; } static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) @@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) vruntime = min_vruntime(vruntime, se->vruntime); } + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT smp_wmb(); @@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * We calculate the vruntime slice of a to be inserted task + * We calculate the vruntime slice of a to-be-inserted task. * * vs = s/w */ @@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. 
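The reworked scale_stime() above approximates stime * rtime / total without a 128-bit multiply by splitting the larger of the two ratios into quotient and remainder and scaling by each piece separately, which reduces (but does not eliminate) the chance of overflow. A standalone check of that arithmetic against a 128-bit reference (unsigned __int128 is a GCC/Clang extension used here purely for comparison, and the input values are made up):

#include <stdint.h>
#include <stdio.h>

/* Same shape as the patched scale_stime(): approximate
 * stime * rtime / total using only 64-bit multiplies and divides. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t res, rem, scaled;

	if (rtime >= total) {
		res = rtime / total;
		rem = rtime % total;
		scaled  = stime * res;		/* can still overflow for huge inputs */
		scaled += stime * rem / total;
	} else {
		res = total / rtime;
		rem = total % rtime;
		scaled  = stime / res;
		scaled -= scaled * rem / total;
	}
	return scaled;
}

int main(void)
{
	uint64_t stime = 123456789ULL;		/* tick-based system time */
	uint64_t rtime = 987654321000ULL;	/* precise sum_exec_runtime */
	uint64_t total = 222222222000ULL;	/* stime + utime */

	unsigned __int128 exact = (unsigned __int128)stime * rtime / total;

	printf("scaled=%llu exact=%llu\n",
	       (unsigned long long)scale_stime(stime, rtime, total),
	       (unsigned long long)exact);
	return 0;
}

cputime_adjust() in the hunk above handles the degenerate cases first (rtime == 0, total == 0) so scale_stime() is only called with meaningful divisors.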
+ */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* @@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; @@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } @@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) return default_scale_freq_power(sd, cpu); } -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return default_scale_smt_power(sd, cpu); } -unsigned long scale_rt_power(int cpu) +static unsigned 
long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, @@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -5034,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5061,17 +5085,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group @@ -5091,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. 
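In load_balance() above, the old lb_iterations/max_lb_iterations bound is gone: when every candidate task turns out to be pinned (LBF_SOME_PINNED), the tried dst_cpu is cleared out of env->cpus before retrying, so the retry loop terminates simply because the candidate set shrinks by one CPU per attempt. A toy model of that termination argument (a plain bitmask stands in for struct cpumask; everything here is invented for illustration):

#include <stdio.h>

/* Candidate destination CPUs as a bitmask; every failed attempt clears
 * the tried CPU, so there can be at most one retry per candidate. */
int main(void)
{
	unsigned int cpus = 0x0f;	/* CPUs 0-3 are candidates */
	int attempts = 0;

	while (cpus) {
		int dst = 0;

		/* pick the lowest set bit as the next dst_cpu */
		while (!(cpus & (1u << dst)))
			dst++;

		attempts++;
		printf("attempt %d: try dst_cpu %d\n", attempts, dst);

		/* pretend all movable tasks were pinned away from this CPU,
		 * drop it from the candidate set (cpumask_clear_cpu) and retry */
		cpus &= ~(1u << dst);
	}

	printf("gave up after %d attempts\n", attempts);
	return 0;
}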
*/ @@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5468,7 +5499,7 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { @@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. 
- */ -SCHED_FEAT(OWNER_SPIN, true) - -/* * Decrement CPU power based on time not spent running tasks */ SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..b8ce77328341 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfdf469f..4c225c4c7111 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,7 @@ #include <linux/stop_machine.h> #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running; */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -154,11 +180,6 @@ struct task_group { #define MAX_SHARES (1UL << 18) #endif -/* Default task group. - * Every task in system belong to this group at bootup. 
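The SCHED_LOAD_RESOLUTION block that moves into sched.h above is a plain fixed-point scale: with the (currently #if 0) 64-bit path enabled, task weights are shifted left by 10 bits so that the shares of very light groups do not round down to zero, and scale_load_down() undoes the shift wherever the raw weight is needed. A small standalone illustration of what the macros do (ordinary userspace C, values picked for the example):

#include <stdio.h>

/* Mirrors the sched.h macros with the extra resolution enabled, purely
 * to show the arithmetic; in the patch this path sits under #if 0. */
#define SCHED_LOAD_RESOLUTION	10
#define scale_load(w)		((unsigned long)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)	((unsigned long)(w) >> SCHED_LOAD_RESOLUTION)

int main(void)
{
	unsigned long nice0 = 1024;	/* weight of a nice-0 task */
	unsigned long nice19 = 15;	/* weight of a nice +19 task */

	printf("nice0:  scaled=%lu, back=%lu\n",
	       scale_load(nice0), scale_load_down(scale_load(nice0)));

	/* With the extra bits, spreading a tiny weight over many CPUs
	 * keeps some precision instead of collapsing to zero. */
	printf("nice19 share over 64 cpus: unscaled=%lu, scaled=%lu\n",
	       nice19 / 64, scale_load(nice19) / 64);
	return 0;
}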
- */ -extern struct task_group root_task_group; - typedef int (*tg_visitor)(struct task_group *, void *); extern int walk_tg_tree_from(struct task_group *from, @@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + extern int group_balance_cpu(struct sched_group *sg); #endif /* CONFIG_SMP */ @@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif - CPUACCT_STAT_NSTATS, -}; +#define DEQUEUE_SLEEP 1 + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +}; #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ @@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void update_group_power(struct sched_domain *sd, int cpu); + extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) @@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern void update_group_power(struct sched_domain *sd, int cpu); extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); @@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on 
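struct sched_class, now visible in sched.h above, is an ops table: each scheduling class fills in the hooks it implements and the core walks the classes in priority order through the ->next chain (the for_each_class() pattern). A stripped-down model of that dispatch in plain C (class names and return conventions here are invented; only the shape matches the kernel's):

#include <stddef.h>
#include <stdio.h>

/* Minimal "scheduling class" ops table, linked from highest to lowest
 * priority through ->next, like sched_class. */
struct toy_class {
	const char *name;
	const struct toy_class *next;
	int (*pick_next)(void);		/* a "task id", or -1 if none runnable */
};

static int rt_pick(void)   { return -1; }	/* no realtime work pending */
static int fair_pick(void) { return 42; }	/* a CFS task is runnable */
static int idle_pick(void) { return 0; }	/* the idle task, always there */

static const struct toy_class idle_class = { "idle", NULL,        idle_pick };
static const struct toy_class fair_class = { "fair", &idle_class, fair_pick };
static const struct toy_class rt_class   = { "rt",   &fair_class, rt_pick   };

int main(void)
{
	const struct toy_class *class;

	/* like pick_next_task(): the first class with work wins */
	for (class = &rt_class; class; class = class->next) {
		int task = class->pick_next();

		if (task >= 0) {
			printf("picked task %d from the %s class\n",
			       task, class->name);
			break;
		}
	}
	return 0;
}

Because the idle class sits last and always has something to return, the walk is guaranteed to pick a task.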
every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { @@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f8..9edcf456e0fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; #endif extern int pid_max; extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; @@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index f8b11a283171..12d6ebbfdd83 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -365,7 +365,7 @@ int init_test_probes(void) target2 = kprobe_target2; do { - rand1 = random32(); + rand1 = prandom_u32(); } while (rand1 <= div_factor); printk(KERN_INFO "Kprobe smoke test started\n"); diff --git a/kernel/time.c b/kernel/time.c index f8342a41efa6..d3617dbd3dca 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -138,13 +138,14 @@ int persistent_clock_is_local; */ static inline void warp_clock(void) { - struct timespec adjust; + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; - adjust = current_kernel_time(); - if (sys_tz.tz_minuteswest != 0) persistent_clock_is_local = 1; - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } } /* diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7d..12ff13a838c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,13 +18,14 @@ #include <linux/rtc.h> #include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. 
*/ -DEFINE_RAW_SPINLOCK(ntp_lock); - /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -53,9 +54,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -150,8 +146,6 @@ static inline void pps_clear(void) /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -346,10 +340,6 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); - time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -362,20 +352,12 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - raw_spin_unlock_irqrestore(&ntp_lock, flags); - } u64 ntp_tick_length(void) { - unsigned long flags; - s64 ret; - - raw_spin_lock_irqsave(&ntp_lock, flags); - ret = tick_length; - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return ret; + return tick_length; } @@ -393,9 +375,6 @@ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -415,7 +394,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +403,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -479,8 +456,6 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return leap; } @@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status |= txc->status & ~STA_RONLY; } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. 
+ + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc) { - struct timespec ts; - int result; - - /* Validate the data before disabling interrupts */ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! @@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc) return -EINVAL; } - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; - getnstimeofday(&ts); + return 0; +} - raw_spin_lock_irq(&ntp_lock); + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ + int result; if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; + txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); - raw_spin_unlock_irq(&ntp_lock); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; @@ -894,7 +860,7 @@ static void hardpps_update_phase(long error) } /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two @@ -905,15 +871,13 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. 
*/ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); - raw_spin_lock_irqsave(&ntp_lock, flags); - /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - raw_spin_unlock_irqrestore(&ntp_lock, flags); } -EXPORT_SYMBOL(hardpps); - #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 000000000000..1950cb4ca2a4 --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7f32fe0e52cd..61d00a8cdf2f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { + struct clock_event_device *cur = tick_broadcast_device.evtdev; + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || @@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev) return 0; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. 
We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); return 1; } @@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); tick_broadcast_clear_oneshot(cpu); } else { tick_device_setup_broadcast_func(dev); @@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void) { raw_spin_lock(&tick_broadcast_lock); - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); raw_spin_unlock(&tick_broadcast_lock); } @@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } @@ -377,13 +387,13 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - if 
(!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) broadcast = tick_resume_broadcast_oneshot(bc); break; } @@ -396,25 +406,58 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires, int force) +{ + int ret; if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). 
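The new tick_broadcast_set_affinity() above only acts on devices flagged CLOCK_EVT_FEAT_DYNIRQ: whenever the broadcast timer is (re)armed, its interrupt affinity is moved to the CPU whose local timer would have expired first, so the wakeup lands on the CPU that has to do the work anyway; the handler below tracks that CPU as next_cpu while scanning the oneshot mask. A toy version of picking that target (the fixed-size array and names are assumptions for the sketch):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS	4

int main(void)
{
	/* per-CPU local expiry times in ns; UINT64_MAX means nothing armed */
	uint64_t next_event[NR_CPUS] = { 5000, 1200, UINT64_MAX, 3000 };
	uint64_t earliest = UINT64_MAX;
	int cpu, next_cpu = 0;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (next_event[cpu] < earliest) {
			earliest = next_event[cpu];
			next_cpu = cpu;
		}
	}

	/* the kernel would now program the broadcast device for @earliest
	 * and, with CLOCK_EVT_FEAT_DYNIRQ, irq_set_affinity() it to @next_cpu */
	printf("arm broadcast for %llu ns, route its IRQ to cpu %d\n",
	       (unsigned long long)earliest, next_cpu);
	return 0;
}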
+ */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -480,7 +535,7 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; + ktime_t now; int cpu; /* @@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. 
+ * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); } } +out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -582,17 +701,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + if (was_periodic && !cpumask_empty(tmpmask)) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { @@ -640,7 +758,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! 
*/ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -664,3 +782,14 @@ bool tick_broadcast_oneshot_available(void) } #endif + +void __init tick_broadcast_init(void) +{ + alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f4..6176a3e45709 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup) */ dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } raw_spin_unlock_irqrestore(&tick_device_lock, flags); @@ -416,4 +417,5 @@ static struct notifier_block tick_notifier = { void __init tick_init(void) { clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc0..f0299eae4602 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 @@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } /* * Set the periodic handler in non broadcast mode diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..225f8bf19095 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -482,8 +482,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); ratelimit++; } return false; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98fbe1d..98cd470bbe49 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,8 +23,13 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include "tick-internal.h" +#include "ntp_internal.h" static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; 
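timekeeper_seq, introduced above, is what lets the time getters stay lockless: readers snapshot the sequence count, copy the fields they need and retry if a writer bumped the count in the meantime, while writers serialize on timekeeper_lock around the seqcount write section. A userspace sketch of that reader/writer protocol using C11 atomics (a model of the idea only; the kernel's seqcount_t additionally provides the required memory barriers and annotations):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy seqcount-protected time state: the writer makes the count odd
 * while updating and even again when done; readers retry on change.
 * A real seqcount also orders the data accesses with barriers, which
 * this sketch glosses over. */
static atomic_uint seq;
static uint64_t xtime_sec, xtime_nsec;

static void write_time(uint64_t sec, uint64_t nsec)
{
	atomic_fetch_add(&seq, 1);	/* odd: update in progress */
	xtime_sec = sec;
	xtime_nsec = nsec;
	atomic_fetch_add(&seq, 1);	/* even: update complete */
}

static void read_time(uint64_t *sec, uint64_t *nsec)
{
	unsigned int start;

	do {
		/* wait for an even (stable) sequence value */
		do {
			start = atomic_load(&seq);
		} while (start & 1);

		*sec = xtime_sec;
		*nsec = xtime_nsec;
		/* retry if a writer ran while we copied */
	} while (atomic_load(&seq) != start);
}

int main(void)
{
	uint64_t s, ns;

	write_time(1366000000, 500);
	read_time(&s, &ns);
	printf("%llu.%09llu\n", (unsigned long long)s, (unsigned long long)ns);
	return 0;
}

The shadow_timekeeper added above is only kept in sync here via timekeeping_update(..., mirror); presumably the mirror exists so that updates can later be assembled off to the side, but in this hunk it is simply a copy.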
set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; @@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -335,11 +338,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper 
ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -513,6 
+564,52 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + ret = tk->tai_offset; + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __timekeeping_set_tai_offset(tk, tai_offset); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -526,7 +623,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -535,9 +633,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -693,11 +792,10 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -716,7 +814,10 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -764,15 
+865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(tk, &ts); + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. 
In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - /* re-base the last cycle value */ - tk->clock->cycle_last = tk->clock->read(tk->clock); + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + timekeeping_update(tk, false, true); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -826,7 +975,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -849,7 +999,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } @@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<<shift) + if (offset < interval) return offset; /* Accumulate one shifted interval */ - offset -= tk->cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < real_tk->cycle_interval) goto out; /* @@ -1238,11 +1393,24 @@ static void update_wall_time(void) */ 
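
The timekeeping_resume() hunk above compensates suspend time read from a suspend-nonstop clocksource, and it has to do the (cycle_delta * mult) >> shift conversion in chunks because a long suspend can overflow the 64-bit multiply. The same arithmetic, pulled out as a stand-alone sketch (user-space C, hypothetical helper name, illustration only):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Overflow-safe (cycles * mult) >> shift, mirroring the chunked math in
 * the timekeeping_resume() hunk above: convert at most UINT64_MAX / mult
 * cycles at a time so each multiplication fits in 64 bits.
 */
static uint64_t cycles_to_ns_safe(uint64_t cycle_delta, uint32_t mult, uint32_t shift)
{
	uint64_t max = UINT64_MAX / mult;	/* largest chunk that cannot overflow */
	uint64_t nsec = 0;

	if (cycle_delta > max) {
		uint64_t num = cycle_delta / max;

		nsec = ((max * mult) >> shift) * num;
		cycle_delta -= num * max;
	}
	nsec += (cycle_delta * mult) >> shift;
	return nsec;
}

int main(void)
{
	/* mult = 1 << 24, shift = 24 models a 1 GHz clock, so ns == cycles. */
	uint64_t cycles = (uint64_t)1 << 62;	/* large enough to take the chunked path */

	printf("%" PRIu64 " cycles -> %" PRIu64 " ns\n",
	       cycles, cycles_to_ns_safe(cycles, 1 << 24, 24));
	return 0;
}

The in-kernel version uses do_div()/div64_u64() for the same split because plain 64-bit division is not available on all 32-bit architectures.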
accumulate_nsecs_to_secs(tk); - timekeeping_update(tk, false); - + write_seqcount_begin(&timekeeper_seq); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); + write_seqcount_end(&timekeeper_seq); out: - write_sequnlock_irqrestore(&tk->lock, flags); - + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /** @@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&tk->lock, seq)); + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while 
(read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); /** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + struct timespec ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + getnstimeofday(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + clock_was_set_delayed(); + } + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. * diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164b..3bdf28323012 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include <asm/uaccess.h> + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - 
for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + static int timer_list_show(struct seq_file *m, void *v) { + struct timer_list_iter *iter = v; + u64 now = ktime_to_ns(ktime_get()); + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + timer_list_header(NULL, now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(NULL, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} - return 0; +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) { + iter->cpu = -1; + iter->now = ktime_to_ns(ktime_get()); + } else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + return iter; } -void sysrq_timer_list_show(void) +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + ++*offset; + return timer_list_start(file, offset); +} + +static void timer_list_stop(struct seq_file *seq, void *v) { - timer_list_show(NULL, NULL); } +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fc382d6e2765..5e9efd4b83a4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -176,6 +176,8 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. 
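
The kernel/time/timer_list.c hunks above convert a single_open() dump that emitted everything in one pass into a seq_file iterator with per-open state (struct timer_list_iter), so each CPU, and in a second pass each tick device, becomes its own record. A minimal sketch of the same pattern, with hypothetical names and written module-style against a tree of this era:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>

/* Per-open iterator state, in the spirit of struct timer_list_iter above. */
struct demo_iter {
	int cpu;
};

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	struct demo_iter *iter = m->private;

	if (!*pos)
		iter->cpu = -1;			/* record 0 is a header line */
	else if (iter->cpu >= nr_cpu_ids)
		return NULL;			/* past the last CPU: done */
	return iter;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct demo_iter *iter = m->private;

	iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
	++*pos;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	struct demo_iter *iter = v;

	if (iter->cpu == -1)
		seq_printf(m, "demo header\n");
	else
		seq_printf(m, "cpu %d is online\n", iter->cpu);
	return 0;
}

static const struct seq_operations demo_sops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	/* seq_open_private() allocates and zeroes the iterator for us. */
	return seq_open_private(file, &demo_sops, sizeof(struct demo_iter));
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

static int __init demo_init(void)
{
	if (!proc_create("seqfile_demo", 0444, NULL, &demo_fops))
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

seq_open_private()/seq_release_private() handle allocating and freeing the iterator, which is presumably why the patch can drop single_open() and emit the listing one record at a time instead of building it all at once.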
@@ -198,6 +200,8 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -217,6 +221,7 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE + select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. @@ -248,6 +253,27 @@ config TRACER_SNAPSHOT echo 1 > /sys/kernel/debug/tracing/snapshot cat snapshot +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statics and more checks are done. + It will output the stats of each per cpu buffer. What + was written, the sizes, what was read, what was lost, and + other similar details. 
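
The TRACER_SNAPSHOT_PER_CPU_SWAP help text above drives the per-CPU snapshot file from the shell; the same trigger can be issued from a small user-space helper. This is illustrative only: the path assumes debugfs is mounted at /sys/kernel/debug, and CPU 2 is just an example.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Swap only CPU 2's tracing buffer into the snapshot, per the
 * TRACER_SNAPSHOT_PER_CPU_SWAP help text above.
 */
int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/per_cpu/cpu2/snapshot";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Reading the same file back should then show the swapped-out per-CPU data, mirroring the behaviour of the top-level snapshot file.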
+ + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5a0f781cd729..ed58a3216a6d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b3fde6d7b7fc..8a5c017bb50c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -486,7 +486,6 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly; static int ftrace_profile_enabled __read_mostly; /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) static void * function_stat_next(void *v, int idx) @@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); - for (i = 0; i < pages; i++) { + for (i = 1; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); if (!pg->next) goto out_free; @@ -724,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu) if (!stat->hash) return -ENOMEM; - if (!ftrace_profile_bits) { - size--; - - for (; size; size >>= 1) - ftrace_profile_bits++; - } - /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); @@ -763,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) struct hlist_head *hhd; unsigned long key; - key = hash_long(ip, ftrace_profile_bits); + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); hhd = &stat->hash[key]; if (hlist_empty(hhd)) @@ -782,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, { unsigned long key; - key = hash_long(rec->ip, ftrace_profile_bits); + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); hlist_add_head_rcu(&rec->node, &stat->hash[key]); } @@ -1079,7 +1072,7 @@ struct ftrace_func_probe { unsigned long flags; unsigned long ip; void *data; - struct rcu_head rcu; + struct list_head free_list; }; struct ftrace_func_entry { @@ -1329,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; - unsigned long key; int size = src->count; int bits = 0; int ret; @@ -1372,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, for (i = 0; i < size; i++) { hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { - if (bits > 0) - key = hash_long(entry->ip, bits); - else - key = 0; remove_hash_entry(src, entry); __add_hash_entry(new_hash, entry); } @@ -2973,28 +2961,27 @@ static void 
__disable_ftrace_function_probe(void) } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry) { - struct ftrace_func_probe *entry = - container_of(rhp, struct ftrace_func_probe, rcu); - if (entry->ops->free) - entry->ops->free(&entry->data); + entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); } - int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; unsigned long key; int count = 0; char *search; + int ret; type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); @@ -3005,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { + count = -ENOMEM; goto out_unlock; + } + + if (unlikely(ftrace_disabled)) { + count = -ENODEV; + goto out_unlock; + } do_for_each_ftrace_rec(pg, rec) { @@ -3030,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { /* caller does not like this func */ kfree(entry); continue; } } + ret = enter_record(hash, rec, 0); + if (ret < 0) { + kfree(entry); + count = ret; + goto out_unlock; + } + entry->ops = ops; entry->ip = rec->ip; @@ -3045,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); + + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + if (ret < 0) + count = ret; + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); return count; } @@ -3062,7 +3070,12 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; + struct ftrace_func_probe *p; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct list_head free_list; + struct ftrace_hash *hash; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -3083,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } mutex_lock(&ftrace_lock); + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + /* Hmm, should report this somehow */ + goto out_unlock; + + INIT_LIST_HEAD(&free_list); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3103,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } + rec_entry = ftrace_lookup_ip(hash, entry->ip); + /* It is possible more than one entry had this ip */ + if (rec_entry) + free_hash_entry(hash, rec_entry); + hlist_del_rcu(&entry->node); - call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); + list_add(&entry->free_list, &free_list); } } __disable_ftrace_function_probe(); + /* + * Remove after the disable is called. 
Otherwise, if the last + * probe is removed, a null hash means *all enabled*. + */ + ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + synchronize_sched(); + list_for_each_entry_safe(entry, p, &free_list, free_list) { + list_del(&entry->free_list); + ftrace_free_entry(entry); + } + + out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); } void @@ -3736,7 +3775,8 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = 1; + ftrace_graph_filter_enabled = !!(*idx); + return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba194..b59aea2c48c2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,13 +8,16 @@ #include <linux/trace_clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> +#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ #include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> @@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) return ret; } +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + bool waiters_pending; +}; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -478,6 +487,8 @@ struct ring_buffer_per_cpu { struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; + + struct rb_irq_work irq_work; }; struct ring_buffer { @@ -497,6 +508,8 @@ struct ring_buffer { struct notifier_block cpu_notify; #endif u64 (*clock)(void); + + struct rb_irq_work irq_work; }; struct ring_buffer_iter { @@ -508,6 +521,118 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. 
The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) + schedule(); + + finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + work->waiters_pending = true; + poll_wait(filp, &work->waiters, poll_table); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ @@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->clock = trace_clock_local; buffer->reader_lock_key = key; + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); + /* need at least two pages */ if (nr_pages < 2) nr_pages = 2; @@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - if (cpu_online(cpu)) + /* The update must run on the CPU that is being updated. */ + preempt_disable(); + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. 
+ */ + preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); } /* wait for all the updates to complete */ @@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - if (cpu_online(cpu_id)) { + preempt_disable(); + /* The update must run on the CPU that is being updated. */ + if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - } else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); @@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_end_commit(cpu_buffer); } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to @@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + trace_recursive_unlock(); preempt_enable_notrace(); @@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + ret = 0; out: preempt_enable_notrace(); @@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. 
+ */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { + struct ring_buffer *buffer; + unsigned long events; + unsigned long bytes_written; + unsigned long bytes_alloc; + unsigned long bytes_dropped; + unsigned long events_nested; + unsigned long bytes_written_nested; + unsigned long bytes_alloc_nested; + unsigned long bytes_dropped_nested; + int min_size_nested; + int max_size_nested; + int max_size; + int min_size; + int cpu; + int cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE 1048576 + +static char rb_string[] __initdata = + "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" + "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" + "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { + int size; + char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ + struct ring_buffer_event *event; + struct rb_item *item; + bool started; + int event_len; + int size; + int len; + int cnt; + + /* Have nested writes different that what is written */ + cnt = data->cnt + (nested ? 27 : 0); + + /* Multiply cnt by ~e, to make some unique increment */ + size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + + len = size + sizeof(struct rb_item); + + started = rb_test_started; + /* read rb_test_started before checking buffer enabled */ + smp_rmb(); + + event = ring_buffer_lock_reserve(data->buffer, len); + if (!event) { + /* Ignore dropped events before test starts. */ + if (started) { + if (nested) + data->bytes_dropped += len; + else + data->bytes_dropped_nested += len; + } + return len; + } + + event_len = ring_buffer_event_length(event); + + if (RB_WARN_ON(data->buffer, event_len < len)) + goto out; + + item = ring_buffer_event_data(event); + item->size = size; + memcpy(item->str, rb_string, size); + + if (nested) { + data->bytes_alloc_nested += event_len; + data->bytes_written_nested += len; + data->events_nested++; + if (!data->min_size_nested || len < data->min_size_nested) + data->min_size_nested = len; + if (len > data->max_size_nested) + data->max_size_nested = len; + } else { + data->bytes_alloc += event_len; + data->bytes_written += len; + data->events++; + if (!data->min_size || len < data->min_size) + data->max_size = len; + if (len > data->max_size) + data->max_size = len; + } + + out: + ring_buffer_unlock_commit(data->buffer, event); + + return 0; +} + +static __init int rb_test(void *arg) +{ + struct rb_test_data *data = arg; + + while (!kthread_should_stop()) { + rb_write_something(data, false); + data->cnt++; + + set_current_state(TASK_INTERRUPTIBLE); + /* Now sleep between a min of 100-300us and a max of 1ms */ + usleep_range(((data->cnt % 3) + 1) * 100, 1000); + } + + return 0; +} + +static __init void rb_ipi(void *ignore) +{ + struct rb_test_data *data; + int cpu = smp_processor_id(); + + data = &rb_data[cpu]; + rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ + while (!kthread_should_stop()) { + + /* Send an IPI to all cpus to write data! 
*/ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct ring_buffer *buffer; + int cpu; + int ret = 0; + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], + "rbtester/%d", cpu); + if (WARN_ON(!rb_threads[cpu])) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + kthread_bind(rb_threads[cpu], cpu); + wake_up_process(rb_threads[cpu]); + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(!rb_hammer)) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. + */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! 
*/ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + if (!total_lost) { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 66338c4f7f4b..ae6fa2d1cdf7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@ /* * ring buffer based function tracer * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: @@ -19,7 +19,6 @@ #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> -#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -48,7 +47,7 @@ * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -int ring_buffer_expanded; +bool ring_buffer_expanded; /* * We need to change this state when a selftest is running. 
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) static DEFINE_PER_CPU(bool, trace_cmdline_save); /* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - -/* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets @@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; +static bool allocate_snapshot; + static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -156,6 +149,15 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init boot_alloc_snapshot(char *str) +{ + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_options __initdata; @@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec) */ static struct trace_array global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays); int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, @@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu) u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) + if (!global_trace.trace_buffer.buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); return ts; } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - int tracing_is_enabled(void) { return tracing_is_on(); @@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly = &nop_trace; - /* * trace_types_lock is used to protect the trace_types list. */ @@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. 
*/ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ - /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ @@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu) static inline void trace_access_unlock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void) #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ - wake_up_all(&trace_wait); - -} + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; /** * tracing_on - enable tracing buffers @@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work) */ void tracing_on(void) { - if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_on(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -385,6 +351,196 @@ void tracing_on(void) EXPORT_SYMBOL_GPL(tracing_on); /** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. 
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (in_nmi()) { + internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + internal_trace_puts("*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); + internal_trace_puts("*** stopping trace here! 
***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); + internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + if (WARN_ON(ret < 0)) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. @@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on); */ void tracing_off(void) { - if (global_trace.buffer) - ring_buffer_record_off(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_off(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. 
We don't really care about the race @@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off); */ int tracing_is_on(void) { - if (global_trace.buffer) - return ring_buffer_record_is_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); return !global_trace.buffer_disabled; } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -479,6 +635,7 @@ static const char *trace_options[] = { "disable_on_free", "irq-info", "markers", + "function-trace", NULL }; @@ -490,6 +647,8 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 1 }, + { trace_clock, "perf", 1 }, ARCH_TRACE_CLOCKS }; @@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data; + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; - max_data = max_tr.data[cpu]; max_data->saved_latency = tracing_max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -706,22 +866,22 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct ring_buffer *buf; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(current_trace != &nop_trace); + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&ftrace_max_lock); - buf = tr->buffer; - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&ftrace_max_lock); @@ -740,19 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(current_trace != &nop_trace); + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&ftrace_max_lock); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -761,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * the max trace buffer (no one writes directly to it) * and flag that it failed. 
*/ - trace_array_printk(&max_tr, _THIS_IP_, + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit in progress\n"); } @@ -774,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static void default_wait_pipe(struct trace_iterator *iter) { - DEFINE_WAIT(wait); + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return; + + ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + if (!type->selftest || tracing_selftest_disabled) + return 0; /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. */ - trace_wakeup_needed = true; + tracing_reset_online_cpus(&tr->trace_buffer); - if (trace_empty(iter)) - schedule(); + tr->current_trace = type; - finish_wait(&trace_wait, &wait); +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; +} +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; } +#endif /* CONFIG_FTRACE_STARTUP_TEST */ /** * register_tracer - register a tracer with the ftrace system. 
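Aside (not part of the patch): the tracing_snapshot()/tracing_snapshot_alloc() helpers and the trace_printk()-style writers added in the hunks above are meant to be called from kernel code when some condition of interest fires. A minimal illustrative sketch follows; the module name and the point where the snapshot is taken are hypothetical, and the declarations are assumed to be the ones exported alongside this series with CONFIG_TRACER_SNAPSHOT enabled.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init snapshot_demo_init(void)
{
	/* May sleep: make sure the spare (snapshot) buffer is allocated. */
	tracing_snapshot_alloc();

	/* Hypothetical trigger point: record a marker, then freeze. */
	trace_printk("snapshot_demo: condition hit, freezing trace\n");

	/* Swap the live buffer with the snapshot; tracing keeps running. */
	tracing_snapshot();

	return 0;
}

static void __exit snapshot_demo_exit(void)
{
}

module_init(snapshot_demo_init);
module_exit(snapshot_demo_exit);
MODULE_LICENSE("GPL");

In real use the tracing_snapshot() call would sit in whatever error or latency path is being chased, while the allocation stays in a context that is allowed to sleep, exactly as the kernel-doc above recommends.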
@@ -851,57 +1052,9 @@ int register_tracer(struct tracer *type) if (!type->wait_pipe) type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest && !tracing_selftest_disabled) { - struct tracer *saved_tracer = current_trace; - struct trace_array *tr = &global_trace; - - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. - */ - tracing_reset_online_cpus(tr); - - current_trace = type; - - if (type->use_max_tr) { - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); - type->allocated_snapshot = true; - } - - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - current_trace = saved_tracer; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - /* Add the warning after printing 'FAILED' */ - WARN_ON(1); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); - - if (type->use_max_tr) { - type->allocated_snapshot = false; - - /* Shrink the max buffer again */ - if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); - } - - printk(KERN_CONT "PASSED\n"); - } -#endif + ret = run_tracer_selftest(type); + if (ret < 0) + goto out; type->next = trace_types; trace_types = type; @@ -921,7 +1074,7 @@ int register_tracer(struct tracer *type) tracing_set_tracer(type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. 
*/ - tracing_selftest_disabled = 1; + tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", type->name); @@ -931,9 +1084,9 @@ int register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; if (!buffer) return; @@ -947,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu) ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; int cpu; if (!buffer) @@ -960,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) /* Make sure all commits have finished */ synchronize_sched(); - tr->time_start = ftrace_now(tr->cpu); + buf->time_start = ftrace_now(buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -970,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr) void tracing_reset_current(int cpu) { - tracing_reset(&global_trace, cpu); + tracing_reset(&global_trace.trace_buffer, cpu); } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void) { - tracing_reset_online_cpus(&global_trace); + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif + } + mutex_unlock(&trace_types_lock); } #define SAVED_CMDLINES 128 @@ -998,7 +1160,7 @@ static void trace_init_cmdlines(void) int is_tracing_stopped(void) { - return trace_stop_count; + return global_trace.stop_count; } /** @@ -1030,12 +1192,12 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) { - if (trace_stop_count < 0) { + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (--global_trace.stop_count) { + if (global_trace.stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); - trace_stop_count = 0; + global_trace.stop_count = 0; } goto out; } @@ -1043,19 +1205,52 @@ void tracing_start(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); ftrace_start(); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + /* If global, we need to also start the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_start(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + + if (--tr->stop_count) { + if (tr->stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + tr->stop_count = 0; + } + goto out; + } + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + out: + 
raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -1070,25 +1265,48 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (trace_stop_count++) + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + /* If global, we need to also stop the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_stop(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1221,11 +1439,6 @@ void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); - if (trace_wakeup_needed) { - trace_wakeup_needed = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&trace_work_wakeup); - } ring_buffer_unlock_commit(buffer, event); } @@ -1249,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = ftrace_file->tr->trace_buffer.buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = global_trace.buffer; + *current_rb = global_trace.trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1292,7 +1517,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1433,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); } /** * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) */ -void trace_dump_stack(void) +void trace_dump_stack(int skip) { unsigned long flags; @@ -1448,8 +1674,13 @@ void trace_dump_stack(void) local_save_flags(flags); - /* skipping 3 traces, seems to get us at the caller of this function */ - 
__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); @@ -1619,7 +1850,7 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.buffer) + if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } @@ -1679,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -1702,27 +1933,12 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct ring_buffer *buffer; int len = 0, size, pc; struct print_entry *entry; unsigned long flags; @@ -1750,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr, local_save_flags(flags); size = sizeof(*entry) + len + 1; - buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) @@ -1771,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr, return len; } +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1796,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, lost_events); if (event) { @@ -1811,7 +2062,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->tr->buffer; + struct ring_buffer *buffer = iter->trace_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -1824,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ - if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1888,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -1921,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct trace_array *tr = iter->tr; struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - tr->data[cpu]->skipped_entries = 0; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -1941,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. */ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) + if (ts >= iter->trace_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - tr->data[cpu]->skipped_entries = entries; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; } /* @@ -1957,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; @@ -1969,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. 
*/ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); +#endif if (!iter->snapshot) atomic_inc(&trace_record_cmdline_disabled); @@ -1984,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else @@ -2016,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; +#endif if (!iter->snapshot) atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) { unsigned long count; int cpu; @@ -2035,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); + count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries += count; } } @@ -2064,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -2098,16 +2355,16 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; + 
struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); + struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = "preemption"; name = type->name; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2118,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) nsecs_to_usecs(data->saved_latency), entries, total, - tr->cpu, + buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2169,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (iter->tr->data[iter->cpu]->skipped_entries) + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2292,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter) int cpu; /* If we are looking at one CPU buffer, only check that one */ - if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } return 1; @@ -2311,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } } @@ -2335,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2389,9 +2651,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->tr, m); + print_func_help_header(iter->trace_buffer, m); } } } @@ -2405,14 +2667,8 @@ static void test_ftrace_alive(struct seq_file *m) } #ifdef CONFIG_TRACER_MAX_TRACE -static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +static void show_snapshot_main_help(struct seq_file *m) { - if (iter->trace->allocated_snapshot) - seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); - else - seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); - - seq_printf(m, "# Snapshot commands:\n"); seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); seq_printf(m, "# Takes a snapshot of the main buffer.\n"); @@ -2420,6 +2676,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); seq_printf(m, "# is not a '0' or '1')\n"); } + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + 
seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); + seq_printf(m, "# Must use main snapshot file to allocate.\n"); +#endif + seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} #else /* Should never be called */ static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } @@ -2479,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int cpu; @@ -2504,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace->print_max || snapshot) - iter->tr = &max_tr; + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) + iter->trace_buffer = &tr->max_buffer; else - iter->tr = &global_trace; +#endif + iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); - iter->cpu_file = cpu_file; + iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) + if (ring_buffer_overruns(iter->trace_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -2532,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) /* stop the trace while dumping if we are not opening "snapshot" */ if (!iter->snapshot) - tracing_stop(); + tracing_stop_tr(tr); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -2547,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } + tr->ref++; + mutex_unlock(&trace_types_lock); return iter; @@ -2579,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct trace_iterator *iter; + struct trace_array *tr; int cpu; if (!(file->f_mode & FMODE_READ)) return 0; iter = m->private; + tr = iter->tr; mutex_lock(&trace_types_lock); + + WARN_ON(!tr->ref); + tr->ref--; + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2597,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ - tracing_start(); + tracing_start_tr(tr); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2616,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - long cpu = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; - if (cpu == TRACE_PIPE_ALL_CPU) - tracing_reset_online_cpus(&global_trace); + if (tc->cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(&global_trace, cpu); + tracing_reset(&tr->trace_buffer, tc->cpu); } if (file->f_mode & FMODE_READ) { @@ -2768,8 +3068,9 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err, cpu; + struct trace_array *tr = filp->private_data; cpumask_var_t tracing_cpumask_new; + int err, cpu; if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2789,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); - ring_buffer_record_disable_cpu(global_trace.buffer, cpu); + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); - ring_buffer_record_enable_cpu(global_trace.buffer, cpu); + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2824,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = { static int 
tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; u32 tracer_flags; int i; mutex_lock(&trace_types_lock); - tracer_flags = current_trace->flags->val; - trace_opts = current_trace->flags->opts; + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2893,15 +3195,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) return 0; } -int set_tracer_flag(unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ - if (current_trace->flag_changed) - if (current_trace->flag_changed(current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) return -EINVAL; if (enabled) @@ -2913,9 +3215,9 @@ int set_tracer_flag(unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_OVERWRITE) { - ring_buffer_change_overwrite(global_trace.buffer, enabled); + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_change_overwrite(max_tr.buffer, enabled); + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } @@ -2925,7 +3227,7 @@ int set_tracer_flag(unsigned int mask, int enabled) return 0; } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; @@ -2943,14 +3245,14 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - ret = set_tracer_flag(1 << i, !neg); + ret = set_tracer_flag(tr, 1 << i, !neg); break; } } /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(current_trace, cmp, neg); + ret = set_tracer_option(tr->current_trace, cmp, neg); mutex_unlock(&trace_types_lock); @@ -2961,6 +3263,8 @@ static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; int ret; @@ -2972,7 +3276,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - ret = trace_set_options(buf); + ret = trace_set_options(tr, buf); if (ret < 0) return ret; @@ -2985,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_trace_options_show, NULL); + + return single_open(file, tracing_trace_options_show, inode->i_private); } static const struct file_operations tracing_iter_fops = { @@ -2998,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" - "# mount -t debugfs nodev /sys/kernel/debug\n\n" - "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "nop\n" - "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "wakeup\n" - "# cat /sys/kernel/debug/tracing/trace_options\n" - "noprint-parent 
nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" - "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t-change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these functions\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module\n" + " Format: :mod:<module-name>\n" + " example: echo :mod:ext3 > set_ftrace_filter\n" + " triggers: a command to perform when function is hit\n" + " Format: <function>:<trigger>[:count]\n" + " trigger: traceon, traceoff\n" + " enable_event:<system>:<event>\n" + " disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + " stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + " snapshot\n" +#endif + " example: echo do_fault:traceoff > set_ftrace_filter\n" + " echo do_trap:traceoff:3 > set_ftrace_filter\n" + " The first one will disable tracing every time do_fault is hit\n" + " The second will disable tracing at most 3 times when do_trap is hit\n" + " The first time do trap is hit and it disables tracing, the counter\n" + " will decrement to 2. If tracing is already disabled, the counter\n" + " will not decrement. 
It only decrements when the trigger did work\n" + " To remove trigger without count:\n" + " echo '!<function>:<trigger> > set_ftrace_filter\n" + " To remove trigger with a count:\n" + " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module command :mod:\n" + " Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" + "\t\t\t Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */ ; static ssize_t @@ -3083,11 +3452,12 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", current_trace->name); + r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3095,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) { int cpu; + for_each_tracing_cpu(cpu) - tr->data[cpu]->entries = val; + per_cpu_ptr(buf->data, cpu)->entries = val; } +#ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, - struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu]->entries, cpu); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; - tr->data[cpu]->entries = size_tr->data[cpu]->entries; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; } } else { - ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu_id]->entries, cpu_id); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) - tr->data[cpu_id]->entries = - size_tr->data[cpu_id]->entries; + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } +#endif /* CONFIG_TRACER_MAX_TRACE */ 
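Aside (not part of the patch): the bookkeeping conversions above, set_buffer_entries() and resize_buffer_duplicate_size(), move from the old per-array data[cpu] pointers to per-cpu allocations reached through per_cpu_ptr(). The following self-contained sketch shows that access pattern with a made-up structure and value rather than the patch's own types.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/percpu.h>

/* Stand-in for per-cpu buffer bookkeeping (hypothetical). */
struct demo_cpu_data {
	unsigned long entries;
};

static struct demo_cpu_data __percpu *demo_data;

static int __init demo_percpu_init(void)
{
	int cpu;

	demo_data = alloc_percpu(struct demo_cpu_data);
	if (!demo_data)
		return -ENOMEM;

	/* Same shape as set_buffer_entries(): one value per CPU. */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(demo_data, cpu)->entries = 1024;

	return 0;
}

static void __exit demo_percpu_exit(void)
{
	free_percpu(demo_data);
}

module_init(demo_percpu_init);
module_exit(demo_percpu_exit);
MODULE_LICENSE("GPL");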
-static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) { int ret; @@ -3140,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; /* May be called before buffers are initialized */ - if (!global_trace.buffer) + if (!tr->trace_buffer.buffer) return 0; - ret = ring_buffer_resize(global_trace.buffer, size, cpu); + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); if (ret < 0) return ret; - if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || + !tr->current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size, cpu); + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(&global_trace, - &global_trace, cpu); + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3179,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) } if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&max_tr, size); + set_buffer_entries(&tr->max_buffer, size); else - max_tr.data[cpu]->entries = size; + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&global_trace, size); + set_buffer_entries(&tr->trace_buffer, size); else - global_trace.data[cpu]->entries = size; + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { int ret = size; @@ -3206,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) } } - ret = __tracing_resize_ring_buffer(size, cpu_id); + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; @@ -3233,7 +3613,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -3243,7 +3623,7 @@ int tracing_update_buffers(void) struct trace_option_dentry; static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3253,13 +3633,15 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; +#endif int ret = 0; mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; @@ -3274,18 +3656,21 @@ static int tracing_set_tracer(const char *buf) ret = -EINVAL; goto out; } - if (t == current_trace) + if (t == tr->current_trace) goto out; trace_branch_disable(); - current_trace->enabled = false; + tr->current_trace->enabled = false; - if 
(current_trace->reset) - current_trace->reset(tr); + if (tr->current_trace->reset) + tr->current_trace->reset(tr); - had_max_tr = current_trace->allocated_snapshot; - current_trace = &nop_trace; + /* Current trace needs to be nop_trace before synchronize_sched */ + tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->allocated_snapshot; if (had_max_tr && !t->use_max_tr) { /* @@ -3296,27 +3681,20 @@ static int tracing_set_tracer(const char *buf) * so a synchronized_sched() is sufficient. */ synchronize_sched(); - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. - */ - ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + free_snapshot(tr); } +#endif destroy_trace_option_files(topts); - topts = create_trace_option_files(t); + topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - /* we need to make per cpu buffer sizes equivalent */ - ret = resize_buffer_duplicate_size(&max_tr, &global_trace, - RING_BUFFER_ALL_CPUS); + ret = alloc_snapshot(tr); if (ret < 0) goto out; - t->allocated_snapshot = true; } +#endif if (t->init) { ret = tracer_init(t, tr); @@ -3324,8 +3702,8 @@ static int tracing_set_tracer(const char *buf) goto out; } - current_trace = t; - current_trace->enabled = true; + tr->current_trace = t; + tr->current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3399,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int ret = 0; @@ -3424,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3441,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = cpu_file; - iter->tr = &global_trace; + iter->cpu_file = tc->cpu; + iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->trace_buffer; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3481,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) } static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { - struct trace_iterator *iter = filp->private_data; + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) { + if (trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, + filp, poll_table); +} - return 0; - } +static unsigned int 
+tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + return trace_poll(iter, filp, poll_table); } /* @@ -3564,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; + struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -3575,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); /* @@ -3732,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; + struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -3741,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); @@ -3804,43 +4190,19 @@ out_err: goto out; } -struct ftrace_entries_info { - struct trace_array *tr; - int cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info; - - if (tracing_disabled) - return -ENODEV; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - info->tr = &global_trace; - info->cpu = (unsigned long)inode->i_private; - - filp->private_data = info; - - return 0; -} - static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; - struct trace_array *tr = info->tr; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (info->cpu == RING_BUFFER_ALL_CPUS) { + if (tc->cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -3850,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = tr->data[cpu]->entries; - if (size != tr->data[cpu]->entries) { + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { buf_size_same = 0; break; } @@ -3867,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3879,7 +4241,7 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; + struct trace_cpu *tc = filp->private_data; unsigned long val; int ret; @@ -3894,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - ret = tracing_resize_ring_buffer(val, info->cpu); + ret 
= tracing_resize_ring_buffer(tc->tr, val, tc->cpu); if (ret < 0) return ret; @@ -3903,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info = filp->private_data; - - kfree(info); - - return 0; -} - static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3924,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += tr->data[cpu]->entries >> 10; + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -3954,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { + struct trace_array *tr = inode->i_private; + /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) tracing_off(); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); return 0; } @@ -4027,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -4069,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, static int tracing_clock_show(struct seq_file *m, void *v) { + struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", - i == trace_clock_id ? "[" : "", trace_clocks[i].name, - i == trace_clock_id ? "]" : ""); + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; @@ -4084,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v) static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int i; @@ -4105,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - trace_clock_id = i; - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + tr->clock_id = i; + + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. 
*/ - tracing_reset_online_cpus(&global_trace); - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&global_trace.max_buffer); +#endif mutex_unlock(&trace_types_lock); @@ -4131,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, NULL); + + return single_open(file, tracing_clock_show, inode->i_private); } +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int read; +}; + #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { + struct trace_cpu *tc = inode->i_private; struct trace_iterator *iter; + struct seq_file *m; int ret = 0; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return -ENOMEM; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + return -ENOMEM; + } + iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->max_buffer; + iter->cpu_file = tc->cpu; + m->private = iter; + file->private_data = m; } + return ret; } @@ -4152,6 +4537,9 @@ static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; unsigned long val; int ret; @@ -4165,40 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace->use_max_tr) { + if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto out; } switch (val) { case 0: - if (current_trace->allocated_snapshot) { - /* free spare buffer */ - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; } + if (tr->allocated_snapshot) + free_snapshot(tr); break; case 1: - if (!current_trace->allocated_snapshot) { - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&max_tr, - &global_trace, RING_BUFFER_ALL_CPUS); +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif + if (!tr->allocated_snapshot) { + ret = alloc_snapshot(tr); if (ret < 0) break; - current_trace->allocated_snapshot = true; } - local_irq_disable(); /* Now, we're going to swap */ - update_max_tr(&global_trace, current, smp_processor_id()); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + update_max_tr(tr, current, smp_processor_id()); + else + update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); break; default: - if (current_trace->allocated_snapshot) - tracing_reset_online_cpus(&max_tr); + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset(&tr->max_buffer, iter->cpu_file); + } break; } @@ -4210,6 +4606,51 @@ out: mutex_unlock(&trace_types_lock); return ret; } + 
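
For readers tracking the new tracing_snapshot_write() semantics above, here is a minimal user-space sketch, not part of the patch, that drives the tracefs "snapshot" file as this hunk implements it: writing "1" allocates the spare buffer if needed and swaps it with the live buffer, writing "0" frees the spare, and any other value clears the snapshot contents. The /sys/kernel/debug/tracing mount point is an assumption; adjust the path if tracefs is mounted elsewhere.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Sketch only: exercise the snapshot control file handled by
 * tracing_snapshot_write() above. The path is an assumption
 * (tracefs may also be mounted at /sys/kernel/tracing).
 */
static int snapshot_ctl(const char *val)
{
	const char *path = "/sys/kernel/debug/tracing/snapshot";
	FILE *f = fopen(path, "w");

	if (!f) {
		fprintf(stderr, "open %s: %s\n", path, strerror(errno));
		return -1;
	}
	if (fputs(val, f) == EOF) {
		fprintf(stderr, "write %s: %s\n", path, strerror(errno));
		fclose(f);
		return -1;
	}
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	if (snapshot_ctl("1"))	/* allocate the spare buffer and take a snapshot */
		return 1;
	/* ... the snapshot can now be read back from the same file ... */
	snapshot_ctl("2");	/* any other value: clear the snapshot contents */
	snapshot_ctl("0");	/* free the spare buffer */
	return 0;
}

The per-cpu snapshot files created later in this patch accept the same writes, but per the hunk above a per-cpu swap is only permitted when CONFIG_RING_BUFFER_ALLOW_SWAP is set, and freeing the spare buffer ("0") is only accepted from the all-CPUs file.
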
+static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + if (file->f_mode & FMODE_READ) + return tracing_release(inode, file); + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.trace_buffer = &info->iter.tr->max_buffer; + + return ret; +} + #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4237,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_entries_open, + .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, - .release = tracing_entries_release, .llseek = generic_file_llseek, }; @@ -4275,20 +4715,23 @@ static const struct file_operations snapshot_fops = { .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_seek, - .release = tracing_release, + .release = tracing_snapshot_release, }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { - struct trace_array *tr; - void *spare; - int cpu; - unsigned int read; +static const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, }; +#endif /* CONFIG_TRACER_SNAPSHOT */ + static int tracing_buffers_open(struct inode *inode, struct file *filp) { - int cpu = (int)(long)inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct ftrace_buffer_info *info; if (tracing_disabled) @@ -4298,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (!info) return -ENOMEM; - info->tr = &global_trace; - info->cpu = cpu; - info->spare = NULL; + mutex_lock(&trace_types_lock); + + tr->ref++; + + info->iter.tr = tr; + info->iter.cpu_file = tc->cpu; + info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; + info->spare = NULL; /* Force reading ring buffer for first read */ - info->read = (unsigned int)-1; + info->read = (unsigned int)-1; filp->private_data = info; + mutex_unlock(&trace_types_lock); + return nonseekable_open(inode, filp); } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); +} + static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; ssize_t ret; - size_t size; + ssize_t size; if (!count) return 0; + 
mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + size = -EBUSY; + goto out_unlock; + } +#endif + if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); + size = -ENOMEM; if (!info->spare) - return -ENOMEM; + goto out_unlock; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; - trace_access_lock(info->cpu); - ret = ring_buffer_read_page(info->tr->buffer, + again: + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->trace_buffer->buffer, &info->spare, count, - info->cpu, 0); - trace_access_unlock(info->cpu); - if (ret < 0) - return 0; + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); - info->read = 0; + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) { + size = -EAGAIN; + goto out_unlock; + } + mutex_unlock(&trace_types_lock); + iter->trace->wait_pipe(iter); + mutex_lock(&trace_types_lock); + if (signal_pending(current)) { + size = -EINTR; + goto out_unlock; + } + goto again; + } + size = 0; + goto out_unlock; + } -read: + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) - return -EFAULT; + if (ret == size) { + size = -EFAULT; + goto out_unlock; + } size -= ret; *ppos += size; info->read += size; + out_unlock: + mutex_unlock(&trace_types_lock); + return size; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + mutex_lock(&trace_types_lock); + + WARN_ON(!iter->tr->ref); + iter->tr->ref--; if (info->spare) - ring_buffer_free_read_page(info->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); + mutex_unlock(&trace_types_lock); + return 0; } @@ -4428,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { @@ -4440,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - size_t ret; + ssize_t ret; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } +#endif + + if (splice_grow_spd(pipe, &spd)) { + ret = -ENOMEM; + goto out; + } if (*ppos & (PAGE_SIZE - 1)) { ret = -EINVAL; @@ -4458,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - trace_access_lock(info->cpu); - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -4470,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); + ref->buffer = 
iter->trace_buffer->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { kfree(ref); break; } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 1); + len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); @@ -4502,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); } - trace_access_unlock(info->cpu); + trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { - if (flags & SPLICE_F_NONBLOCK) + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; - else - ret = 0; - /* TODO: block */ - goto out; + goto out; + } + mutex_unlock(&trace_types_lock); + iter->trace->wait_pipe(iter); + mutex_lock(&trace_types_lock); + if (signal_pending(current)) { + ret = -EINTR; + goto out; + } + goto again; } ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); out: + mutex_unlock(&trace_types_lock); + return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, + .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, @@ -4536,12 +5060,14 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - unsigned long cpu = (unsigned long)filp->private_data; - struct trace_array *tr = &global_trace; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; + struct trace_buffer *trace_buf = &tr->trace_buffer; struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; + int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4549,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_init(s); - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[trace_clock_id].in_ns) { /* local or global for trace_clock */ - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", - ring_buffer_oldest_event_ts(tr->buffer, cpu)); + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(tr->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer, 
cpu)); } - cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); - cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4635,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = { .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "snapshot"); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) { - static int once; + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; - if (d_tracer) - return d_tracer; + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + if (ret >= 0) + alloc_snapshot(&global_trace); + + return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ + if (tr->dir) + return tr->dir; if (!debugfs_initialized()) return NULL; - d_tracer = debugfs_create_dir("tracing", NULL); + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + tr->dir = debugfs_create_dir("tracing", NULL); - if (!d_tracer && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } + if (!tr->dir) + pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return d_tracer; + return tr->dir; } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ + return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { - static int once; struct dentry *d_tracer; - if (d_percpu) - return d_percpu; - - d_tracer = tracing_init_dentry(); + if (tr->percpu_dir) + return tr->percpu_dir; + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - d_percpu = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); - if (!d_percpu && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'per_cpu'\n"); - return NULL; - } + WARN_ONCE(!tr->percpu_dir, + "Could not create debugfs directory 'per_cpu/%d'\n", cpu); - return d_percpu; + return tr->percpu_dir; } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct dentry *d_percpu = tracing_dentry_percpu(); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4704,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu) /* per cpu trace_pipe */ trace_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); + (void *)&data->trace_cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); + (void *)&data->trace_cpu, &tracing_fops); trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); + (void *)&data->trace_cpu, &tracing_buffers_fops); trace_create_file("stats", 0444, d_cpu, - (void *) cpu, &tracing_stats_fops); + (void *)&data->trace_cpu, &tracing_stats_fops); trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *) cpu, &tracing_entries_fops); + (void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_cpu, + (void *)&data->trace_cpu, &snapshot_fops); + + trace_create_file("snapshot_raw", 0444, d_cpu, + (void *)&data->trace_cpu, &snapshot_raw_fops); +#endif } #ifdef CONFIG_FTRACE_SELFTEST @@ -4728,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu) struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; + struct trace_array *tr; struct dentry *entry; }; @@ -4763,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & 
topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(current_trace, topt->flags, + ret = __set_tracer_option(topt->tr->current_trace, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -4802,6 +5438,7 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = &global_trace; long index = (long)filp->private_data; unsigned long val; int ret; @@ -4814,7 +5451,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - ret = set_tracer_flag(1 << index, val); + ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); if (ret < 0) @@ -4848,40 +5485,41 @@ struct dentry *trace_create_file(const char *name, } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; - static struct dentry *t_options; - if (t_options) - return t_options; + if (tr->options) + return tr->options; - d_tracer = tracing_init_dentry(); + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - t_options = debugfs_create_dir("options", d_tracer); - if (!t_options) { + tr->options = debugfs_create_dir("options", d_tracer); + if (!tr->options) { pr_warning("Could not create debugfs directory 'options'\n"); return NULL; } - return t_options; + return tr->options; } static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; + topt->tr = tr; topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); @@ -4889,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt, } static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct tracer_flags *flags; @@ -4914,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer) return NULL; for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(&topts[cnt], flags, + create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); return topts; @@ -4937,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts) } static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, + const char *option, long index) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; @@ -4949,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index) &trace_options_core_fops); } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; int i; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(trace_options[i], i); + create_trace_option_core_file(tr, trace_options[i], i); } static ssize_t @@ -4967,7 +5606,7 @@ rb_simple_read(struct file *filp, char 
__user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; @@ -4986,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; unsigned long val; int ret; @@ -4998,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (val) { ring_buffer_record_on(buffer); - if (current_trace->start) - current_trace->start(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); } else { ring_buffer_record_off(buffer); - if (current_trace->stop) - current_trace->stop(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); } mutex_unlock(&trace_types_lock); } @@ -5020,23 +5659,310 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ + int cpu; + + for_each_tracing_cpu(cpu) { + memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; + per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; + } +} + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ + enum ring_buffer_flags rb_flags; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; + + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + return -ENOMEM; + } + + init_trace_buffers(tr, buf); + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->trace_buffer, + ring_buffer_size(tr->trace_buffer.buffer, 0)); + + return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; + + ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + if (ret) + return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (WARN_ON(ret)) { + ring_buffer_free(tr->trace_buffer.buffer); + free_percpu(tr->trace_buffer.data); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; + + /* + * Only the top level trace array gets its snapshot allocated + * from the kernel command line. 
+ */ + allocate_snapshot = false; +#endif + return 0; +} + +static int new_instance_create(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out_unlock; + + tr->name = kstrdup(name, GFP_KERNEL); + if (!tr->name) + goto out_free_tr; + + raw_spin_lock_init(&tr->start_lock); + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + + if (allocate_trace_buffers(tr, trace_buf_size) < 0) + goto out_free_tr; + + /* Holder for file callbacks */ + tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + tr->trace_cpu.tr = tr; + + tr->dir = debugfs_create_dir(name, trace_instance_dir); + if (!tr->dir) + goto out_free_tr; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + goto out_free_tr; + + init_tracer_debugfs(tr, tr->dir); + + list_add(&tr->list, &ftrace_trace_arrays); + + mutex_unlock(&trace_types_lock); + + return 0; + + out_free_tr: + if (tr->trace_buffer.buffer) + ring_buffer_free(tr->trace_buffer.buffer); + kfree(tr->name); + kfree(tr); + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; + +} + +static int instance_delete(const char *name) +{ + struct trace_array *tr; + int found = 0; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + found = 1; + break; + } + } + if (!found) + goto out_unlock; + + ret = -EBUSY; + if (tr->ref) + goto out_unlock; + + list_del(&tr->list); + + event_trace_del_tracer(tr); + debugfs_remove_recursive(tr->dir); + free_percpu(tr->trace_buffer.data); + ring_buffer_free(tr->trace_buffer.buffer); + + kfree(tr->name); + kfree(tr); + + ret = 0; + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the new_instance_create() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = new_instance_create(dentry->d_iname); + + mutex_lock(&inode->i_mutex); + + return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* The caller did a dget() on dentry */ + mutex_unlock(&dentry->d_inode->i_mutex); + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. 
If two users try to make the same dir at + * the same time, then the instance_delete() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = instance_delete(dentry->d_iname); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = instance_mkdir, + .rmdir = instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + trace_instance_dir = debugfs_create_dir("instances", d_tracer); + if (WARN_ON(!trace_instance_dir)) + return; + + /* Hijack the dir inode operations, to allow mkdir */ + trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ + int cpu; + + trace_create_file("trace_options", 0644, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, + (void *)&tr->trace_cpu, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_entries_fops); + + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + tr, &tracing_total_entries_fops); + + trace_create_file("free_buffer", 0644, d_tracer, + tr, &tracing_free_buffer_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + tr, &tracing_mark_fops); + + trace_create_file("trace_clock", 0644, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("tracing_on", 0644, d_tracer, + tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *)&tr->trace_cpu, &snapshot_fops); +#endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(tr, cpu); + +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - int cpu; trace_access_lock_init(); d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; - trace_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); + init_tracer_debugfs(&global_trace, d_tracer); trace_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - - trace_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + &global_trace, &tracing_cpumask_fops); trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -5055,44 +5981,17 @@ static __init int tracer_init_debugfs(void) trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); - trace_create_file("trace_pipe", 0444, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - - trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - - trace_create_file("buffer_total_size_kb", 0444, d_tracer, - &global_trace, &tracing_total_entries_fops); - - trace_create_file("free_buffer", 0644, d_tracer, - &global_trace, &tracing_free_buffer_fops); - - trace_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("trace_clock", 0644, d_tracer, NULL, - &trace_clock_fops); - - trace_create_file("tracing_on", 0644, d_tracer, - &global_trace, &rb_simple_fops); - #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 
#endif -#ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); -#endif + create_trace_instances(d_tracer); - create_trace_options_dir(); - - for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(cpu); + create_trace_options_dir(&global_trace); return 0; } @@ -5148,8 +6047,8 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= 1000) - s->len = 1000; + if (s->len >= TRACE_MAX_PRINT) + s->len = TRACE_MAX_PRINT; /* should be zero ended, but we are paranoid. */ s->buffer[s->len] = 0; @@ -5162,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s) void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; - iter->trace = current_trace; - iter->cpu_file = TRACE_PIPE_ALL_CPU; + iter->trace = iter->tr->current_trace; + iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static atomic_t dump_running; unsigned int old_userobj; - static int dump_ran; unsigned long flags; int cnt = 0, cpu; - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ tracing_off(); - /* Did function tracer already get disabled? */ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); + local_irq_save(flags); /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5211,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) switch (oops_dump_mode) { case DUMP_ALL: - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; break; case DUMP_ORIG: iter.cpu_file = raw_smp_processor_id(); @@ -5220,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) goto out_enable; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; } printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + /* * We need to stop all tracing on all CPUS to read the * the next buffer. 
This is a bit expensive, but is @@ -5264,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; + trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - tracing_on(); + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - - out: - arch_spin_unlock(&ftrace_dump_lock); + atomic_dec(&dump_running); local_irq_restore(flags); } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ - __ftrace_dump(true, oops_dump_mode); -} EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { int ring_buf_size; - enum ring_buffer_flags rb_flags; - int i; int ret = -ENOMEM; @@ -5311,49 +6199,27 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&global_trace.start_lock); + /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); goto out_free_cpumask; } + if (global_trace.buffer_disabled) tracing_off(); - -#ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - if (!max_tr.buffer) { - printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); - WARN_ON(1); - ring_buffer_free(global_trace.buffer); - goto out_free_cpumask; - } -#endif - - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_tr_data, i); - } - - set_buffer_entries(&global_trace, - ring_buffer_size(global_trace.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE - set_buffer_entries(&max_tr, 1); -#endif - trace_init_cmdlines(); - init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); + global_trace.current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -5362,16 +6228,32 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + /* Holder for file callbacks */ + global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + global_trace.trace_cpu.tr = &global_trace; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + while (trace_boot_options) { char *option; option = strsep(&trace_boot_options, ","); - trace_set_options(option); + trace_set_options(&global_trace, option); } + register_snapshot_cmd(); + return 0; out_free_cpumask: + free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE + free_percpu(global_trace.max_buffer.data); +#endif free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2081971367ea..711ca7d3e7f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,11 @@ #include <linux/trace_seq.h> #include 
<linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/syscall.h> /* some archs define it here */ +#endif + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +34,7 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, + TRACE_BPUTS, __TRACE_LAST_TYPE, }; @@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; -struct uprobe_trace_entry_head { - struct trace_entry ent; - unsigned long ip; -}; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -127,12 +128,21 @@ enum trace_flag_type { #define TRACE_BUF_SIZE 1024 +struct trace_array; + +struct trace_cpu { + struct trace_array *tr; + struct dentry *dir; + int cpu; +}; + /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { + struct trace_cpu trace_cpu; atomic_t disabled; void *buffer_page; /* ring buffer spare */ @@ -151,20 +161,83 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; +struct tracer; + +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { - struct ring_buffer *buffer; - int cpu; + struct list_head list; + char *name; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. + */ + struct trace_buffer max_buffer; + bool allocated_snapshot; +#endif int buffer_disabled; - cycle_t time_start; + struct trace_cpu trace_cpu; /* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); + DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif + int stop_count; + int clock_id; + struct tracer *current_trace; + unsigned int flags; + raw_spinlock_t start_lock; + struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; struct task_struct *waiter; - struct trace_array_cpu *data[NR_CPUS]; + int ref; +}; + +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) }; +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. 
+ */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) @@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -289,9 +363,10 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; bool print_max; - bool use_max_tr; - bool allocated_snapshot; bool enabled; +#ifdef CONFIG_TRACER_MAX_TRACE + bool use_max_tr; +#endif }; @@ -427,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } -#define TRACE_PIPE_ALL_CPU -1 - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { @@ -439,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, umode_t mode, @@ -450,6 +523,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; @@ -583,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -619,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -786,6 +862,7 @@ enum trace_iterator_flags { TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, TRACE_ITER_MARKERS = 0x1000000, + TRACE_ITER_FUNCTION = 0x2000000, }; /* @@ -832,8 +909,8 @@ enum { struct ftrace_event_field { struct list_head link; - char *name; - char *type; + const char *name; + const char *type; int filter_type; int offset; int size; @@ -851,12 +928,19 @@ struct event_filter { struct event_subsystem { struct list_head list; const char *name; - struct dentry *entry; struct event_filter *filter; - int nr_events; int ref_count; }; +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + 
int nr_events; +}; + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -906,22 +990,20 @@ struct filter_pred { unsigned short right; }; -extern struct list_head ftrace_common_fields; - extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -938,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, } extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -948,7 +1032,18 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(unsigned int mask, int enabled); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. 
+ */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed29..d594da0dc03c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; + struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) local_irq_save(flags); cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + if (atomic_inc_return(&data->disabled) != 1) goto out; pc = preempt_count(); - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&tr->data[cpu]->disabled); + atomic_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae6..26dc348332b7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void) return local_clock(); } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ + u64 jiffy = jiffies - INITIAL_JIFFIES; + + /* Return nsecs */ + return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +} /* * trace_clock_global(): special globally coherent trace clock diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca2..e2d027ac66a2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->fmt), FILTER_OTHER ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%08lx %s", - __entry->ip, __entry->buf), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%pf: %s", + (void *)__entry->ip, __entry->str), FILTER_OTHER ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250c..53582e982e51 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE]; EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + 
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + +static struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { if (!event_call->class->get_fields) @@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; - field = kzalloc(sizeof(*field), GFP_KERNEL); + field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) goto err; - field->name = kstrdup(name, GFP_KERNEL); - if (!field->name) - goto err; - - field->type = kstrdup(type, GFP_KERNEL); - if (!field->type) - goto err; + field->name = name; + field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); @@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type, return 0; err: - if (field) - kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); return -ENOMEM; } @@ -120,7 +158,7 @@ static int trace_define_common_fields(void) return ret; } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; @@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call) head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); - kfree(field->type); - kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); } } @@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->name, call->class->probe, - call); + file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->name, call->class->probe, - call); + file); return 0; #ifdef CONFIG_PERF_EVENTS @@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, 
&file->flags); } - } + } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { + struct ftrace_event_call *call = file->event_call; int ret = 0; + int disable; switch (enable) { case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + /* + * When soft_disable is set and enable is cleared, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, NULL); + call->class->reg(call, TRACE_REG_UNREGISTER, file); } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. */ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->flags |= TRACE_EVENT_FL_ENABLED; + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + + /* WAS_ENABLED gets set but never cleared. 
*/ + call->flags |= TRACE_EVENT_FL_WAS_ENABLED; } break; } @@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, return ret; } -static void ftrace_clear_events(void) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) { - struct ftrace_event_call *call; + return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ + struct ftrace_event_file *file; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } @@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system) if (--system->ref_count) return; + list_del(&system->list); + if (filter) { kfree(filter->filter_string); kfree(filter); } - kfree(system->name); kfree(system); } @@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system) system->ref_count++; } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) { mutex_lock(&event_mutex); - __put_system(system); + __put_system_dir(dir); mutex_unlock(&event_mutex); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { + struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (event && strcmp(event, call->name) != 0) continue; - ftrace_event_enable_disable(call, set); + ftrace_event_enable_disable(file, set); ret = 0; } @@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, return ret; } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; @@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(match, sub, event, set); + return __ftrace_set_clr_event(tr, match, sub, event, set); } /** @@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set) */ int trace_set_clr_event(const char *system, const char *event, int set) { - return __ftrace_set_clr_event(NULL, system, event, set); + struct trace_array *tr = top_trace_array(); + + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) @@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(parser.buffer + !set, set); + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } @@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. 
*/ if (call->class && call->class->reg) - return call; + return file; } return NULL; @@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) + file = t_next(m, file, &l); + if (!file) break; } - return call; + return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; } return NULL; @@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) + file = s_next(m, file, &l); + if (!file) break; } - return call; + return file; } static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); @@ -494,25 +617,31 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; char *buf; - if (call->flags & TRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) + buf = "0*\n"; + else + buf = "1\n"; + } else buf = "0\n"; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; unsigned long val; int ret; + if (!file) + return -EINVAL; + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; 
char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -584,7 +717,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; @@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; @@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; int ret; - if (!inode->i_private) - goto skip_search; - /* Make sure the system still exists */ mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; } - __get_system(system); - break; } } + exit_loop: mutex_unlock(&event_mutex); - if (system != inode->i_private) + if (!system) return -ENODEV; - skip_search: + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); + if (ret < 0) + put_system(dir); + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return -ENOMEM; + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + kfree(dir); + + filp->private_data = dir; return ret; } static int subsystem_release(struct inode *inode, struct file *file) { - struct event_subsystem *system = inode->i_private; + struct ftrace_subsystem_dir *dir = file->private_data; - if (system) - put_system(system); + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. 
+ */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); return 0; } @@ -890,7 +1056,8 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -915,7 +1082,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_subsystem_event_filter(system, buf); + err = apply_subsystem_event_filter(dir, buf); free_page((unsigned long) buf); if (err < 0) return err; @@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = { .release = subsystem_release, }; +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) { - static struct dentry *d_tracer; - static struct dentry *d_events; - - if (d_events) - return d_events; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; + struct seq_file *m; + int ret; - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; - return d_events; + return ret; } static int @@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_events(); + ftrace_clear_events(tr); - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + system->name = name; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + kfree(system); + return NULL; } static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct dentry *parent) { + struct ftrace_subsystem_dir *dir; struct event_subsystem 
*system; struct dentry *entry; /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; + dir->nr_events++; + file->system = dir; + return dir->entry; } } - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; + /* Now see if the system itself exists. */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = debugfs_create_dir(name, parent); + if (!dir->entry) { + pr_warning("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; } - list_add(&system->list, &event_subsystems); - - system->filter = NULL; - - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; - entry = debugfs_create_file("filter", 0644, system->entry, system, + entry = debugfs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); + pr_warning("Could not create debugfs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, system, + trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); - return system->entry; + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warning("No memory to create event subsystem %s\n", + name); + return NULL; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, + struct ftrace_event_file *file, const struct file_operations *id, const struct file_operations *enable, const struct file_operations *filter, const struct file_operations *format) { + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; struct list_head *head; + struct dentry *d_events; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". 
*/ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + file->dir = debugfs_create_dir(call->name, d_events); + if (!file->dir) { + pr_warning("Could not create debugfs '%s' directory\n", + call->name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, call->dir, call, + trace_create_file("enable", 0644, file->dir, file, enable); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, + trace_create_file("id", 0444, file->dir, call, id); #endif @@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); - return ret; + return -1; } } - trace_create_file("filter", 0644, call->dir, call, + trace_create_file("filter", 0644, file->dir, call, filter); - trace_create_file("format", 0444, call->dir, call, + trace_create_file("format", 0444, file->dir, call, format); return 0; } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + + if (file->event_call != call) + continue; + + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); + + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + static void event_remove(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
+ */ + break; + } while_for_each_event_file(); + if (call->event.funcs) __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); list_del(&call->list); } @@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call) } static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod) { - struct dentry *d_events; int ret; ret = event_init(call); if (ret < 0) return ret; - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; - - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); + list_add(&call->list, &ftrace_events); call->mod = mod; - return ret; + return 0; +} + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return event_create_dir(tr->event_dir, file, id, enable, filter, format); } +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return 0; +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops); + /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; -} -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call, NULL); - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } + mutex_unlock(&event_mutex); + return ret; } /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem. 
*/ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); destroy_preds(call); - debugfs_remove_recursive(call->dir); - remove_subsystem_dir(call->class->system); } /* Remove an event_call */ void trace_remove_event_call(struct ftrace_event_call *call) { mutex_lock(&event_mutex); - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_remove_event_call(call); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); mutex_unlock(&event_mutex); } @@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops { }; static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ + /* + * As event_calls are added in groups by module, + * when we find one file_ops, we don't need to search for + * each call in that module, as the rest should be the + * same. Only search for a new one if the last one did + * not match. + */ + if (file_ops && mod == file_ops->mod) + return file_ops; + + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + return file_ops; + } + return NULL; +} + +static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { struct ftrace_module_file_ops *file_ops; @@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __register_event(*call, mod); + __add_event_to_tracers(*call, file_ops); } } @@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; - bool found = false; + bool clear_trace = false; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { - found = true; + if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) + clear_trace = true; __trace_remove_event_call(call); } } @@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded - * registered any events. + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. 
*/ - if (found) - tracing_reset_current_online_cpus(); - up_write(&trace_event_mutex); + if (clear_trace) + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, @@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self, return 0; } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); +} + #else -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ + return NULL; +} +static inline int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { return 0; } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return -ENODEV; +} #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + if (call->mod) { + /* + * Directories for events by modules need to + * keep module ref counts when opened (as we don't + * want the module to disappear when reading one + * of these files). The file_ops keep account of + * the module ref count. + */ + file_ops = find_ftrace_file_ops(file_ops, call->mod); + if (!file_ops) + continue; /* Warn? */ + ret = __trace_add_new_mod_event(call, tr, file_ops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + continue; + } + ret = __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + } +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + + if (!call->name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (strcmp(event, call->name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = 
*pdata; + + if (!data) + return; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *_data) +{ + struct event_probe_data *data = _data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + data->file->event_call->class->system, + data->file->event_call->name); + + if (data->count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", data->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + data->ref++; + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(data->file, 0, 1); + module_put(data->file->event_call->mod); + kfree(data); + } + *pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled) + return -EINVAL; + + if (!param) + return -EINVAL; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ? 
&event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + ret = 0; + goto out; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(file->event_call->mod); + if (!ret) + goto out_free; + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); + if (!ret) + goto out_disable; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + module_put(file->event_call->mod); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + file->event_call->name); + } +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (WARN_ON_ONCE(call->mod)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warning("Could not create early event %s\n", + call->name); + } +} + +/* Remove the event directory structure for a trace directory. 
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) { + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); + } +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (file_ops) + __trace_add_new_mod_event(call, tr, file_ops); + else + __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } +} + static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, @@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = 1; - tracing_selftest_disabled = 1; + ring_buffer_expanded = true; + tracing_selftest_disabled = true; return 1; } __setup("trace_event=", setup_trace_event); +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + + entry = debugfs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warning("Could not create debugfs 'set_event' entry\n"); + return -ENOMEM; + } + + d_events = debugfs_create_dir("events", parent); + if (!d_events) { + pr_warning("Could not create debugfs 'events' directory\n"); + return -ENOMEM; + } + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + + tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ + /* Disable any running events */ + __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + + mutex_lock(&event_mutex); + + down_write(&trace_event_sem); + __trace_remove_event_dirs(tr); + debugfs_remove_recursive(tr->event_dir); + up_write(&trace_event_sem); + + tr->event_dir = NULL; + + mutex_unlock(&event_mutex); + + return 0; +} + +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + return 0; +} + static __init int event_trace_enable(void) { + struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; char *buf = bootup_event_buf; char *token; @@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + while (true) { token = strsep(&buf, ","); @@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void) if (!*token) continue; - ret = ftrace_set_clr_event(token, 1); + ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } trace_printk_start_comm(); + register_event_cmds(); + return 0; } static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct trace_array *tr; struct dentry *d_tracer; struct dentry *entry; - struct dentry *d_events; int ret; + tr = top_trace_array(); + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - NULL, &ftrace_avail_fops); + tr, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, - NULL, &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); - - d_events = event_trace_events_dir(); - if (!d_events) - return 0; - - /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); - if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - /* - * Early initialization already enabled ftrace event. - * Now it's only necessary to create the event directory. 
- */ - list_for_each_entry(call, &ftrace_events, list) { - - ret = event_create_dir(call, d_events, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - if (ret < 0) - event_remove(call); - } + ret = early_event_add_tracer(d_tracer, tr); + if (ret) + return ret; ret = register_module_notifier(&trace_module_nb); if (ret) @@ -1568,6 +2399,7 @@ static __init int event_trace_init(void) return 0; } +early_initcall(event_trace_memsetup); core_initcall(event_trace_enable); fs_initcall(event_trace_init); @@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; struct ftrace_event_call *call; struct event_subsystem *system; + struct trace_array *tr; int ret; + tr = top_trace_array(); + pr_info("Running tests on trace events:\n"); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) @@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->flags & TRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } - ftrace_event_enable_disable(call, 1); + ftrace_event_enable_disable(file, 1); event_test_stuff(); - ftrace_event_enable_disable(call, 0); + ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } @@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on trace event systems:\n"); - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); @@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling all events\n"); return; @@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e5b0ca8b8d4d..a6361178de5a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct 
event_subsystem *system, mutex_unlock(&event_mutex); } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ - struct ftrace_event_field *field; - - list_for_each_entry(field, head, link) { - if (!strcmp(field->name, name)) - return field; - } - - return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ - struct ftrace_event_field *field; - struct list_head *head; - - field = __find_event_field(&ftrace_common_fields, name); - if (field) - return field; - - head = trace_get_fields(call); - return __find_event_field(head, name); -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, return NULL; } - field = find_event_field(call, operand1); + field = trace_find_event_field(call, operand1); if (!field) { parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; @@ -1907,16 +1880,17 @@ out_unlock: return err; } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { + struct event_subsystem *system = dir->subsystem; struct event_filter *filter; int err = 0; mutex_lock(&event_mutex); /* Make sure the system still has events */ - if (!system->nr_events) { + if (!dir->nr_events) { err = -ENODEV; goto out_unlock; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037d..d21a74670088 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -int \ +static int __init \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct ftrace_event_class event_class_ftrace_##call = { \ +struct ftrace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 601152523326..c4d6d7191988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void); static int function_trace_init(struct trace_array *tr) { func_trace = tr; - tr->cpu = get_cpu(); + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - 
data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { @@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data) { - long *count = (long *)data; - - if (tracing_is_on()) - return; + unsigned long *count = (long *)data; if (!*count) - return; + return 0; if (*count != -1) (*count)--; - tracing_on(); + return 1; } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - long *count = (long *)data; + if (tracing_is_on()) + return; + + if (update_count(data)) + tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{ if (!tracing_is_on()) return; - if (!*count) + if (update_count(data)) + tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (tracing_is_on()) return; - if (*count != -1) - (*count)--; + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; tracing_off(); } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_list_func() + * ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { - .func = ftrace_traceon, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ + trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { - .func = ftrace_traceoff, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + if (update_count(data)) + trace_dump_stack(STACK_SKIP); +} static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, void *data) { long count = (long)data; - seq_printf(m, "%ps:", (void *)ip); - - if (ops == &traceon_probe_ops) - seq_printf(m, "traceon"); - else - seq_printf(m, "traceoff"); + seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) seq_printf(m, ":unlimited\n"); @@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, } static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) { - struct ftrace_probe_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; + return ftrace_probe_print("traceon", m, ip, data); +} - unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, data); +} - return 0; +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return 
ftrace_probe_print("stacktrace", m, ip, data); } +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; + static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) { - struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, if (!enable) return -EINVAL; - if (glob[0] == '!') - return ftrace_trace_onoff_unreg(glob+1, cmd, param); - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } if (!param) goto out_reg; @@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, return ret < 0 ? ret : 0; } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = param ? 
&stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = { .func = ftrace_trace_onoff_callback, }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) unregister_ftrace_command(&ftrace_traceoff_cmd); + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) { + unregister_ftrace_command(&ftrace_traceoff_cmd); + unregister_ftrace_command(&ftrace_traceon_cmd); + } return ret; } #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389cc..8388bc99f2ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. 
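[ Illustration: the trace_functions.c hunks above add a "stacktrace" function trigger next to traceon/traceoff and move the shared countdown into update_count(). A minimal userspace sketch of arming it, assuming the usual debugfs tracing mount; the function name and count are arbitrary examples, not taken from the patch. ]

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *filter = "/sys/kernel/debug/tracing/set_ftrace_filter";
	FILE *f = fopen(filter, "w");

	if (!f) {
		perror("set_ftrace_filter");
		return 1;
	}
	/* "<function>:stacktrace[:count]" -- dump a kernel stack the first
	 * three times schedule() is hit, then stop (the update_count()
	 * countdown).  Omit ":3" for unlimited, prefix '!' to remove. */
	fprintf(f, "schedule:stacktrace:3\n");
	fclose(f);

	/* The dumps appear in the normal trace output. */
	return system("head -40 /sys/kernel/debug/tracing/trace");
}
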
*/ - ring_buffer_consume(iter->tr->buffer, iter->cpu, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 443b25b43b4f..b19d065a28cb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -33,6 +33,7 @@ enum { static int trace_type __read_mostly; static int save_flags; +static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) per_cpu(tracing_cpu, cpu) = 0; tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) if (!tracer_enabled) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set) { - int ret = 0; + int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ + if (set) + register_irqsoff_function(is_graph(), 1); + else + unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + irqsoff_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_irqsoff_function(graph); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -561,14 +604,14 @@ 
static void __irqsoff_tracer_init(struct trace_array *tr) save_flags = trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); if (start_irqsoff_tracer(tr, is_graph())) printk(KERN_ERR "failed to start irqsoff tracer\n"); @@ -581,8 +624,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr) stop_irqsoff_tracer(tr, is_graph()); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b3..bd90e1b06088 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv) !cpu_online(cpu_file)) return KDB_BADINT; } else { - cpu_file = TRACE_PIPE_ALL_CPU; + cpu_file = 
RING_BUFFER_ALL_CPUS; } kdb_trap_printk++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e5..a5e8f4878bfa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = tr->data[smp_processor_id()]; + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 697e88d13907..bb922d9ee51b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) return ret; } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_puts(s, field->str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct ftrace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + int ret; + + event = 
container_of(trace_event, struct ftrace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + ret = trace_seq_printf(s, "%s: ", event->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; - unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; struct trace_seq *s = &iter->seq; @@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list) void trace_event_read_lock(void) { - down_read(&trace_event_mutex); + down_read(&trace_event_sem); } void trace_event_read_unlock(void) { - up_read(&trace_event_mutex); + up_read(&trace_event_sem); } /** @@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); if (WARN_ON(!event)) goto out; @@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event) ret = event->type; out: - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write. */ int __unregister_ftrace_event(struct trace_event *event) { @@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event) */ int unregister_ftrace_event(struct trace_event *event) { - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __unregister_ftrace_event(event); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return 0; } @@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = { 
&trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_bputs_event, &trace_bprint_event, &trace_print_event, NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492b..127a9d8c8357 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@ #include "trace.h" extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem; #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd8..4e98e3b257a3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fde652c9a511..fee77e15d815 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; +static bool function_enabled; #define TRACE_DISPLAY_GRAPH 1 @@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set) { int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - 
else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void wakeup_function_set(int set) +{ + if (set) + register_wakeup_function(is_graph(), 1); + else + unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + wakeup_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ + int ret; + + ret = register_wakeup_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_wakeup_function(graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +430,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) return; pc = preempt_count(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) local_save_flags(flags); - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) @@ -543,8 +586,8 @@ static int __wakeup_tracer_init(struct trace_array 
*tr) save_flags = trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -573,8 +616,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c29..55e2cf66967b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) local_irq_save(flags); arch_spin_lock(&ftrace_max_lock); - cnt = ring_buffer_entries(tr->buffer); + cnt = ring_buffer_entries(buf->buffer); /* * The trace_test_buffer_cpu runs a while loop to consume all data. 
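[ Illustration: the irqsoff and wakeup hunks above only register their function/graph callbacks while the function-trace option is set (register_wakeup_function()/unregister_wakeup_function() keyed on TRACE_ITER_FUNCTION). A hedged userspace sketch of the resulting knob; the "options/function-trace" file name is assumed from the flag name. ]

#include <stdio.h>
#include <stdlib.h>

static int tput(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	const char *t = "/sys/kernel/debug/tracing";
	char path[128];

	snprintf(path, sizeof(path), "%s/current_tracer", t);
	tput(path, "wakeup_rt");

	/* Clearing the option unregisters the latency tracer's function
	 * callback instead of leaving it armed, lowering overhead while
	 * still tracking the max latency. */
	snprintf(path, sizeof(path), "%s/options/function-trace", t);
	tput(path, "0");

	return system("cat /sys/kernel/debug/tracing/tracing_max_latency");
}
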
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) */ tracing_off(); for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); + ret = trace_test_buffer_cpu(buf, cpu); if (ret) break; } @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); if (ret) goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); tracing_start(); /* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) - __ftrace_dump(false, DUMP_ALL); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } return 0; } @@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); @@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. 
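[ Illustration: the selftest hunks now pass &tr->trace_buffer and &tr->max_buffer instead of the trace_array and the old global max_tr. A simplified, standalone C model of the layout this series moves to; field set is reduced and the in-kernel ->data is a __percpu pointer accessed via per_cpu_ptr(), which is what the tr->data[cpu] conversions above reflect. ]

/* Simplified stand-ins for the kernel types (sketch only). */
struct ring_buffer;
struct trace_array_cpu { int disabled; };

/* One "view" of trace data: the ring buffer plus its per-CPU bookkeeping. */
struct trace_buffer {
	struct ring_buffer	*buffer;
	struct trace_array_cpu	*data;		/* percpu in the kernel */
	unsigned long long	 time_start;
};

/* The former global max_tr becomes a second buffer inside each
 * trace_array, so worst-case (latency) snapshots work per instance. */
struct trace_array {
	struct trace_buffer trace_buffer;	/* live trace */
	struct trace_buffer max_buffer;		/* latency snapshot */
};
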
*/ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (ret) goto out; @@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); printk("ret = %d\n", ret); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 83a8b5b7bd35..b20428c5efe2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,24 @@ #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry 1 +#else +# define fentry 0 +#endif + static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */ static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = stack_dump_trace, + .max_entries = STACK_TRACE_ENTRIES - 1, + .entries = &stack_dump_trace[1], }; static unsigned long max_stack_size; @@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = ACCESS_ONCE(tracer_frame); int i; - this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; if (this_size <= max_stack_size) return; /* we do not handle interrupt stacks yet */ - if (!object_is_on_stack(&this_size)) + if (!object_is_on_stack(stack)) return; local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + /* a race could have already updated it */ if (this_size <= max_stack_size) goto out; @@ -70,10 +90,18 @@ static inline void check_stack(void) save_stack_trace(&max_stack_trace); /* + * Add the passed in ip from the function tracer. 
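[ Illustration: the trace_stack.c hunks pass the traced ip and a stack anchor into check_stack() so the tracer can measure its own frame once (tracer_frame) and subtract it. From userspace the effect is simply a tighter stack_max_size reading; the sysctl and tracefs paths below are the conventional ones and may differ by configuration. ]

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/stack_tracer_enabled", "w");

	if (f) {
		fputs("1\n", f);
		fclose(f);
	}

	/* With the tracer_frame compensation, these values no longer
	 * include the stack consumed by the stack tracer itself. */
	return system("cat /sys/kernel/debug/tracing/stack_max_size "
		      "/sys/kernel/debug/tracing/stack_trace");
}
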
+ * Searching for this on the stack will skip over + * most of the overhead from the stack tracer itself. + */ + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; + + /* * Now find where in the stack these are. */ i = 0; - start = &this_size; + start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -97,6 +125,18 @@ static inline void check_stack(void) found = 1; /* Start the search from here */ start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. + */ + if (unlikely(!tracer_frame) && i == 1) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + max_stack_size -= tracer_frame; + } } } @@ -113,6 +153,7 @@ static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { + unsigned long stack; int cpu; preempt_disable_notrace(); @@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(); + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * The ip is pretty useless because the function tracer + * was called before that function set up its stack frame. + * In this case, we use the parent ip. + * + * By adding the return address of either the parent ip + * or the current ip we can disregard most of the stack usage + * caused by the stack tracer itself. + * + * The function tracer always reports the address of where the + * mcount call was, but the stack will hold the return address. + */ + if (fentry) + ip = parent_ip; + else + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); out: per_cpu(trace_active, cpu)--; @@ -371,6 +431,8 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; trace_create_file("stack_max_size", 0644, d_tracer, &max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e73..847f88a6194b 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); + if (!d_tracing) + return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e321058..8f2ac73c7a5f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@ #include "trace.h" static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data); @@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name /* * Only compare after the "sys" prefix. Archs that use * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted * mismatch. 
*/ return !strcmp(sym + 3, name + 3); @@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { + struct trace_array *tr = data; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_enter_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { + struct trace_array *tr = data; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_exit_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, 
tr); if (!ret) { - set_bit(num, enabled_enter_syscalls); - sys_refcount_enter++; + set_bit(num, tr->enabled_enter_syscalls); + tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_enter--; - clear_bit(num, enabled_enter_syscalls); - if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter, NULL); + tr->sys_refcount_enter--; + clear_bit(num, tr->enabled_enter_syscalls); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, enabled_exit_syscalls); - sys_refcount_exit++; + set_bit(num, tr->enabled_exit_syscalls); + tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_exit--; - clear_bit(num, enabled_exit_syscalls); - if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit, NULL); + tr->sys_refcount_exit--; + clear_bit(num, tr->enabled_exit_syscalls); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } @@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .define_fields = syscall_enter_define_fields, @@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = { .raw_init = init_syscall_trace, }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .define_fields = syscall_exit_define_fields, @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_enter(event); + return reg_event_syscall_enter(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_enter(event); + unreg_event_syscall_enter(file, event); 
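[ Illustration: by moving the syscall refcounts and enabled bitmaps into the trace_array (reached through the ftrace_event_file), syscall events can be enabled per tracing instance rather than only globally. A hedged userspace sketch; the instance name is arbitrary and whether a given kernel exposes events/syscalls inside instances depends on the rest of this series. ]

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	const char *inst = "/sys/kernel/debug/tracing/instances/sysdemo";
	char path[256];
	FILE *f;

	/* Creating the directory creates the instance (its own trace_array
	 * and therefore its own enabled_enter/exit_syscalls bitmaps). */
	mkdir(inst, 0755);

	snprintf(path, sizeof(path), "%s/events/syscalls/sys_enter_open/enable", inst);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	printf("sys_enter_open events now land only in %s/trace\n", inst);
	return 0;
}
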
return 0; #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event, static int syscall_exit_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_exit(event); + return reg_event_syscall_exit(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_exit(event); + unreg_event_syscall_exit(file, event); return 0; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a92dee9..32494fb0ee64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,18 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + struct trace_uprobe_filter { rwlock_t rwlock; int nr_systemwide; @@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) return !filter->nr_systemwide && list_empty(&filter->perf_events); } +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). 
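[ Illustration: with consumer.ret_handler wired up, uprobe_events accepts 'r'-type definitions that fire on function return and record both the function address and the return site (vaddr[0]/vaddr[1] above). A hedged sketch of defining and enabling one; the binary path and 0x4a21f0 offset are placeholders that would normally come from nm/objdump output. ]

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/uprobe_events", "w");

	if (!f) {
		perror("uprobe_events");
		return 1;
	}
	/* 'r' = return probe (new with this series), 'p' = entry probe. */
	fprintf(f, "r:demo/bash_ret /bin/bash:0x4a21f0\n");
	fclose(f);

	f = fopen("/sys/kernel/debug/tracing/events/demo/bash_ret/enable", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}
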
*/ static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; @@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); return tu; @@ -180,7 +201,7 @@ end: /* * Argument syntax: - * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv) char buf[MAX_EVENT_NAME_LEN]; struct path path; unsigned long offset; - bool is_delete; + bool is_delete, is_return; int i, ret; inode = NULL; ret = 0; is_delete = false; + is_return = false; event = NULL; group = NULL; /* argc must be >= 1 */ if (argv[0][0] == '-') is_delete = true; + else if (argv[0][0] == 'r') + is_return = true; else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p' or '-'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); return -EINVAL; } @@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv) kfree(tail); } - tu = alloc_trace_uprobe(group, event, argc); + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); @@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; + char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->nr_args; i++) @@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; -/* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_trace_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; - unsigned long irq_flags; + void *data; + int size, i; struct ftrace_event_call *call = &tu->call; - local_save_flags(irq_flags); - pc = preempt_count(); - - size = sizeof(*entry) + tu->size; - + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + size + tu->size, 0, 0); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(task_pt_regs(current)); - data = (u8 *)&entry[1]; + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_buffer_unlock_commit(buffer, event, 0, 0); +} +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + if (!is_ret_probe(tu)) + uprobe_trace_print(tu, 0, regs); return 0; } +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs) +{ + uprobe_trace_print(tu, func, regs); +} + /* Event entry printers */ static enum print_line_t print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { - struct uprobe_trace_entry_head *field; + struct uprobe_trace_entry_head *entry; struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; int i; - field = (struct uprobe_trace_entry_head *)iter->ent; + entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, call.event); - if (!trace_seq_printf(s, "%s: (", tu->call.name)) - goto partial; - - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, ")")) - goto partial; + if (is_ret_probe(tu)) { + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + entry->vaddr[1], entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, false); + } - data = (u8 *)&field[1]; for (i = 0; i < tu->nr_args; i++) { if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, field)) + data + tu->args[i].offset, entry)) goto partial; } @@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) static int uprobe_event_define_fields(struct ftrace_event_call *event_call) { - int ret, i; + int ret, i, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = (struct 
trace_uprobe *)event_call->data; + struct trace_uprobe *tu = event_call->data; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } /* Set argument names as fields */ for (i = 0; i < tu->nr_args; i++) { ret = trace_define_field(event_call, tu->args[i].type->fmttype, tu->args[i].name, - sizeof(field) + tu->args[i].offset, + size + tu->args[i].offset, tu->args[i].type->size, tu->args[i].type->is_signed, FILTER_OTHER); @@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) int i; int pos = 0; - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + if (is_ret_probe(tu)) { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } else { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } /* When len=0, we just calculate the needed length */ @@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -/* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_perf_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; - int rctx; + void *data; + int size, rctx, i; - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) - return UPROBE_HANDLER_REMOVE; - - __size = sizeof(*entry) + tu->size; - size = ALIGN(__size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return 0; + return; preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) goto out; - entry->ip = instruction_pointer(task_pt_regs(current)); - data = (u8 *)&entry[1]; + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); - + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + if (!is_ret_probe(tu)) + uprobe_perf_print(tu, 0, regs); return 0; } + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs) +{ + uprobe_perf_print(tu, func, regs); +} #endif /* CONFIG_PERF_EVENTS */ static int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = (struct trace_uprobe *)event->data; + struct trace_uprobe *tu = event->data; switch (type) { case 
TRACE_REG_REGISTER: @@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) return ret; } +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + + tu = container_of(con, struct trace_uprobe, consumer); + + if (tu->flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs); + +#ifdef CONFIG_PERF_EVENTS + if (tu->flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs); +#endif + return 0; +} + static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0c05a4592047..29f26540e9c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, int nr_probes = 0; struct tracepoint_func *old, *new; - WARN_ON(!probe); + if (WARN_ON(!probe)) + return ERR_PTR(-EINVAL); debug_print_probes(entry); old = entry->funcs; @@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (!probe || - (old[nr_probes].func == probe && - old[nr_probes].data == data)) - nr_del++; + if (probe) { + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (old[nr_probes].func == probe && + old[nr_probes].data == data) + nr_del++; + } } + /* + * If probe is NULL, then nr_probes = nr_del = 0, and then the + * entire entry will be removed. + */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ entry->funcs = NULL; @@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (probe && - (old[i].func != probe || old[i].data != data)) + if (old[i].func != probe || old[i].data != data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - nr_del; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a944676358e..05039e348f07 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write, return ret; set_sample_period(); + /* + * Watchdog threads shouldn't be enabled if they are + * disabled. The 'watchdog_disabled' variable check in + * watchdog_*_all_cpus() function takes care of this. + */ if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b48cd597145d..154aa12af48e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,7 +41,11 @@ #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> +#include <linux/jhash.h> #include <linux/hashtable.h> +#include <linux/rculist.h> +#include <linux/nodemask.h> +#include <linux/moduleparam.h> #include "workqueue_internal.h" @@ -58,12 +62,11 @@ enum { * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * - * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex to avoid changing binding state while + * Note that DISASSOCIATED should be flipped only while holding + * manager_mutex to avoid changing binding state while * create_worker() is in progress. 
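[ Illustration: the tracepoint.c hunk above makes the "probe == NULL removes every probe" case explicit instead of folding it into the match loop. A standalone C model of the resulting remove logic, with simplified types and plain allocation in place of the kernel's RCU handling. ]

#include <stdio.h>
#include <stdlib.h>

struct tracepoint_func { void (*func)(void); void *data; };

/* Count the matches first (NULL probe -> nothing counted -> the whole
 * entry is dropped), then copy the survivors into a fresh,
 * NULL-terminated array, mirroring tracepoint_entry_remove_probe(). */
static struct tracepoint_func *
remove_probe(struct tracepoint_func *old, void (*probe)(void), void *data)
{
	int nr_probes = 0, nr_del = 0, i, j = 0;
	struct tracepoint_func *new;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
			if (old[nr_probes].func == probe &&
			    old[nr_probes].data == data)
				nr_del++;
	}

	if (nr_probes - nr_del == 0)
		return NULL;			/* N -> 0: drop the entry */

	new = calloc(nr_probes - nr_del + 1, sizeof(*new));
	if (!new)
		return NULL;
	for (i = 0; old[i].func; i++)
		if (old[i].func != probe || old[i].data != data)
			new[j++] = old[i];
	return new;				/* new[j].func == NULL */
}

static void f1(void) { }
static void f2(void) { }

int main(void)
{
	struct tracepoint_func funcs[] = { { f1, NULL }, { f2, NULL }, { NULL, NULL } };
	struct tracepoint_func *rest = remove_probe(funcs, f1, NULL);

	printf("remaining after removing f1: %s\n",
	       rest && rest[0].func == f2 ? "f2" : "(none)");
	free(rest);
	return 0;
}
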
*/ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -74,12 +77,14 @@ enum { WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + WORKER_REBOUND = 1 << 8, /* worker was rebound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | - WORKER_CPU_INTENSIVE, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | + WORKER_UNBOUND | WORKER_REBOUND, NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ + UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ @@ -97,6 +102,8 @@ enum { */ RESCUER_NICE_LEVEL = -20, HIGHPRI_NICE_LEVEL = -20, + + WQ_NAME_LEN = 24, }; /* @@ -115,16 +122,26 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * F: wq->flush_mutex protected. + * MG: pool->manager_mutex and pool->lock protected. Writes require both + * locks. Reads can happen under either lock. + * + * PL: wq_pool_mutex protected. + * + * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * + * WQ: wq->mutex protected. * - * W: workqueue_lock protected. + * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * + * MD: wq_mayday_lock protected. */ /* struct worker is defined in workqueue_internal.h */ struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ + int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -138,12 +155,18 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - /* workers are chained either in busy_hash or idle_list */ + /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ - struct ida worker_ida; /* L: for worker IDs */ + /* see manage_workers() for details on the two manager mutexes */ + struct mutex manager_arb; /* manager arbitration */ + struct mutex manager_mutex; /* manager exclusion */ + struct idr worker_idr; /* MG: worker IDs and iteration */ + + struct workqueue_attrs *attrs; /* I: worker attributes */ + struct hlist_node hash_node; /* PL: unbound_pool_hash node */ + int refcnt; /* PL: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -151,6 +174,12 @@ struct worker_pool { * cacheline. */ atomic_t nr_running ____cacheline_aligned_in_smp; + + /* + * Destruction of pool is sched-RCU protected to allow dereferences + * from get_work_pool(). 
+ */ + struct rcu_head rcu; } ____cacheline_aligned_in_smp; /* @@ -164,75 +193,107 @@ struct pool_workqueue { struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ + int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ -}; + struct list_head pwqs_node; /* WR: node on wq->pwqs */ + struct list_head mayday_node; /* MD: node on wq->maydays */ + + /* + * Release of unbound pwq is punted to system_wq. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue + * itself is also sched-RCU protected so that the first pwq can be + * determined without grabbing wq->mutex. + */ + struct work_struct unbound_release_work; + struct rcu_head rcu; +} __aligned(1 << WORK_STRUCT_FLAG_BITS); /* * Structure used to wait for workqueue flush. */ struct wq_flusher { - struct list_head list; /* F: list of flushers */ - int flush_color; /* F: flush color waiting for */ + struct list_head list; /* WQ: list of flushers */ + int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) \ - cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask) free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp) true -#define free_mayday_mask(mask) do { } while (0) -#endif +struct wq_device; /* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: + * The externally visible workqueue. It relays the issued work items to + * the appropriate worker_pool through its pool_workqueues. 
*/ struct workqueue_struct { - unsigned int flags; /* W: WQ_* flags */ - union { - struct pool_workqueue __percpu *pcpu; - struct pool_workqueue *single; - unsigned long v; - } pool_wq; /* I: pwq's */ - struct list_head list; /* W: list of all workqueues */ - - struct mutex flush_mutex; /* protects wq flushing */ - int work_color; /* F: current work color */ - int flush_color; /* F: current flush color */ + struct list_head pwqs; /* WR: all pwqs of this wq */ + struct list_head list; /* PL: list of all workqueues */ + + struct mutex mutex; /* protects this wq */ + int work_color; /* WQ: current work color */ + int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ - struct wq_flusher *first_flusher; /* F: first flusher */ - struct list_head flusher_queue; /* F: flush waiters */ - struct list_head flusher_overflow; /* F: flush overflow list */ + struct wq_flusher *first_flusher; /* WQ: first flusher */ + struct list_head flusher_queue; /* WQ: flush waiters */ + struct list_head flusher_overflow; /* WQ: flush overflow list */ - mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* W: drain in progress */ - int saved_max_active; /* W: saved pwq max_active */ + int nr_drainers; /* WQ: drain in progress */ + int saved_max_active; /* WQ: saved pwq max_active */ + + struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif - char name[]; /* I: workqueue name */ + char name[WQ_NAME_LEN]; /* I: workqueue name */ + + /* hot fields used during command issue, aligned to cacheline */ + unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ }; +static struct kmem_cache *pwq_cache; + +static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ +static cpumask_var_t *wq_numa_possible_cpumask; + /* possible CPUs of each node */ + +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + +static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ + +/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; + +static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ +static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ + +static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static bool workqueue_freezing; /* PL: have wqs started freezing? 
*/ + +/* the per-cpu worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); + +static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ + +/* PL: hash of all unbound pools keyed by pool->attrs */ +static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); + +/* I: attributes used when instantiating standard unbound pools on demand */ +static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -244,64 +305,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); + #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> -#define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &std_worker_pools(cpu)[0]; \ - (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) +#define assert_rcu_or_pool_mutex() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") -#define for_each_busy_worker(worker, i, pool) \ - hash_for_each(pool->busy_hash, i, worker, hentry) +#define assert_rcu_or_wq_mutex(wq) \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) -{ - if (cpu < nr_cpu_ids) { - if (sw & 1) { - cpu = cpumask_next(cpu, mask); - if (cpu < nr_cpu_ids) - return cpu; - } - if (sw & 2) - return WORK_CPU_UNBOUND; - } - return WORK_CPU_END; -} +#ifdef CONFIG_LOCKDEP +#define assert_manager_or_pool_lock(pool) \ + WARN_ONCE(debug_locks && \ + !lockdep_is_held(&(pool)->manager_mutex) && \ + !lockdep_is_held(&(pool)->lock), \ + "pool->manager_mutex or ->lock should be held") +#else +#define assert_manager_or_pool_lock(pool) do { } while (0) +#endif -static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) -{ - return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} +#define for_each_cpu_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) -/* - * CPU iterators +/** + * for_each_pool - iterate through all worker_pools in the system + * @pool: iteration cursor + * @pi: integer used for iteration * - * An extra cpu number is defined using an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to for_each_*_cpu() - * iterators but also considers the unbound CPU. + * This must be called either with wq_pool_mutex held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. * - * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_pwq_cpu() : possible CPUs for bound workqueues, - * WORK_CPU_UNBOUND for unbound workqueues + * The if/else clause exists only for the lockdep assertion and can be + * ignored. 
*/ -#define for_each_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) +#define for_each_pool(pool, pi) \ + idr_for_each_entry(&worker_pool_idr, pool, pi) \ + if (({ assert_rcu_or_pool_mutex(); false; })) { } \ + else -#define for_each_online_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) +/** + * for_each_pool_worker - iterate through all workers of a worker_pool + * @worker: iteration cursor + * @wi: integer used for iteration + * @pool: worker_pool to iterate workers of + * + * This must be called with either @pool->manager_mutex or ->lock held. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pool_worker(worker, wi, pool) \ + idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ + if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + else -#define for_each_pwq_cpu(cpu, wq) \ - for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) +/** + * for_each_pwq - iterate through all pool_workqueues of the specified workqueue + * @pwq: iteration cursor + * @wq: the target workqueue + * + * This must be called either with wq->mutex held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pwq(pwq, wq) \ + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ + else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -419,77 +503,35 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ - -/* - * The CPU and unbound standard worker pools. The unbound ones have - * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_std_worker_pools); -static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; - -/* idr of all pools */ -static DEFINE_MUTEX(worker_pool_idr_mutex); -static DEFINE_IDR(worker_pool_idr); - -static int worker_thread(void *__worker); - -static struct worker_pool *std_worker_pools(int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return per_cpu(cpu_std_worker_pools, cpu); - else - return unbound_std_worker_pools; -} - -static int std_worker_pool_pri(struct worker_pool *pool) -{ - return pool - std_worker_pools(pool->cpu); -} - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - mutex_lock(&worker_pool_idr_mutex); + lockdep_assert_held(&wq_pool_mutex); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); - if (ret >= 0) + if (ret >= 0) { pool->id = ret; - mutex_unlock(&worker_pool_idr_mutex); - - return ret < 0 ? ret : 0; + return 0; + } + return ret; } -/* - * Lookup worker_pool by id. The idr currently is built during boot and - * never modified. Don't worry about locking for now. 
+/** + * unbound_pwq_by_node - return the unbound pool_workqueue for the given node + * @wq: the target workqueue + * @node: the node ID + * + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. */ -static struct worker_pool *worker_pool_by_id(int pool_id) +static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, + int node) { - return idr_find(&worker_pool_idr, pool_id); -} - -static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) -{ - struct worker_pool *pools = std_worker_pools(cpu); - - return &pools[highpri]; -} - -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) -{ - if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->pool_wq.pcpu, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->pool_wq.single; - return NULL; + assert_rcu_or_wq_mutex(wq); + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } static unsigned int work_color_to_flags(int color) @@ -531,7 +573,7 @@ static int work_next_color(int color) static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { - BUG_ON(!work_pending(work)); + WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work)); } @@ -583,13 +625,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Return the worker_pool @work was last associated with. %NULL if none. + * + * Pools are created and destroyed under wq_pool_mutex, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under wq_pool_mutex or with preemption disabled. + * + * All fields of the returned pool are accessible as long as the above + * mentioned locking is in effect. If the returned pool needs to be used + * beyond the critical section, the caller is responsible for ensuring the + * returned pool is and stays online. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - struct worker_pool *pool; int pool_id; + assert_rcu_or_pool_mutex(); + if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool; @@ -598,9 +650,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - pool = worker_pool_by_id(pool_id); - WARN_ON_ONCE(!pool); - return pool; + return idr_find(&worker_pool_idr, pool_id); } /** @@ -689,7 +739,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_arb); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -744,7 +794,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -769,8 +819,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. 
*/ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -786,7 +835,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, pool = worker->pool; /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + return NULL; /* * The counterpart of the following dec_and_test, implied mb, @@ -891,13 +941,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * - * This function checks the work item address, work function and workqueue - * to avoid false positives. Note that this isn't complete as one may - * construct a work function which can introduce dependency onto itself - * through a recycled work item. Well, if somebody wants to shoot oneself - * in the foot that badly, there's only so much we can do, and if such - * deadlock actually occurs, it should be easy to locate the culprit work - * function. + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -961,6 +1010,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +/** + * get_pwq - get an extra reference on the specified pool_workqueue + * @pwq: pool_workqueue to get + * + * Obtain an extra reference on @pwq. The caller should guarantee that + * @pwq has positive refcnt and be holding the matching pool->lock. + */ +static void get_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + WARN_ON_ONCE(pwq->refcnt <= 0); + pwq->refcnt++; +} + +/** + * put_pwq - put a pool_workqueue reference + * @pwq: pool_workqueue to put + * + * Drop a reference of @pwq. If its refcnt reaches zero, schedule its + * destruction. The caller should be holding the matching pool->lock. + */ +static void put_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + if (likely(--pwq->refcnt)) + return; + if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) + return; + /* + * @pwq can't be released under pool->lock, bounce to + * pwq_unbound_release_workfn(). This never recurses on the same + * pool->lock as this path is taken only for unbound workqueues and + * the release work item is scheduled on a per-cpu workqueue. To + * avoid lockdep warning, unbound pool->locks are given lockdep + * subclass of 1 in get_unbound_pool(). + */ + schedule_work(&pwq->unbound_release_work); +} + +/** + * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock + * @pwq: pool_workqueue to put (can be %NULL) + * + * put_pwq() with locking. This function also allows %NULL @pwq. + */ +static void put_pwq_unlocked(struct pool_workqueue *pwq) +{ + if (pwq) { + /* + * As both pwqs and pools are sched-RCU protected, the + * following lock operations are safe. 
+ */ + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); + } +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -992,9 +1099,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { - /* ignore uncolored works */ + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) - return; + goto out_put; pwq->nr_in_flight[color]--; @@ -1007,11 +1114,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) - return; + goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) - return; + goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; @@ -1022,6 +1129,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); +out_put: + put_pwq(pwq); } /** @@ -1144,11 +1253,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); + get_pwq(pwq); /* - * Ensure either worker_sched_deactivated() sees the above - * list_add_tail() or we see zero nr_running to avoid workers - * lying around lazily while there are works to be processed. + * Ensure either wq_worker_sleeping() sees the above + * list_add_tail() or we see zero nr_running to avoid workers lying + * around lazily while there are works to be processed. */ smp_mb(); @@ -1172,10 +1282,11 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; + struct worker_pool *last_pool; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1191,48 +1302,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; +retry: + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); - /* determine the pwq to use */ - if (!(wq->flags & WQ_UNBOUND)) { - struct worker_pool *last_pool; - - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); - - /* - * It's multi cpu. If @work was previously on a different - * cpu, it might still be running there, in which case the - * work needs to be queued on that cpu to guarantee - * non-reentrancy. - */ - pwq = get_pwq(cpu, wq); - last_pool = get_work_pool(work); + /* pwq which will be used unless @work is executing elsewhere */ + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - if (last_pool && last_pool != pwq->pool) { - struct worker *worker; + /* + * If @work was previously on a different pool, it might still be + * running there, in which case the work needs to be queued on that + * pool to guarantee non-reentrancy. 
+ */ + last_pool = get_work_pool(work); + if (last_pool && last_pool != pwq->pool) { + struct worker *worker; - spin_lock(&last_pool->lock); + spin_lock(&last_pool->lock); - worker = find_worker_executing_work(last_pool, work); + worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_pwq->wq == wq) { - pwq = get_pwq(last_pool->cpu, wq); - } else { - /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); - } + if (worker && worker->current_pwq->wq == wq) { + pwq = worker->current_pwq; } else { + /* meh... not running there, queue here */ + spin_unlock(&last_pool->lock); spin_lock(&pwq->pool->lock); } } else { - pwq = get_pwq(WORK_CPU_UNBOUND, wq); spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it in the numa_pwq_tbl or while + * work items are executing on it, so the retrying is guaranteed to + * make forward-progress. + */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -1287,22 +1412,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1378,21 +1487,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL_GPL(queue_delayed_work_on); /** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use @@ -1431,21 +1525,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL_GPL(mod_delayed_work_on); /** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. 
- */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - -/** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * @@ -1459,9 +1538,10 @@ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(worker->flags & WORKER_IDLE); - BUG_ON(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev)); + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; @@ -1498,22 +1578,25 @@ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(!(worker->flags & WORKER_IDLE)); + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool - * @worker: self + * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it + * @pool: target worker_pool + * + * Bind %current to the cpu of @pool if it is associated and lock @pool. * * Works which are scheduled while the cpu is online must at least be * scheduled to a worker which is bound to the cpu so that if they are * flushed from cpu callbacks while cpu is going down, they are * guaranteed to execute on the cpu. * - * This function is to be used by rogue workers and rescuers to bind + * This function is to be used by unbound workers and rescuers to bind * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used @@ -1534,12 +1617,9 @@ static void worker_leave_idle(struct worker *worker) * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ -static bool worker_maybe_bind_and_lock(struct worker *worker) +static bool worker_maybe_bind_and_lock(struct worker_pool *pool) __acquires(&pool->lock) { - struct worker_pool *pool = worker->pool; - struct task_struct *task = worker->task; - while (true) { /* * The following call may fail, succeed or succeed @@ -1548,14 +1628,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + if (task_cpu(current) == pool->cpu && + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1570,108 +1649,6 @@ __acquires(&pool->lock) } } -/* - * Rebind an idle @worker to its CPU. worker_thread() will test - * list_empty(@worker->entry) before leaving idle and call this function. 
- */ -static void idle_worker_rebind(struct worker *worker) -{ - /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_UNBOUND); - - /* rebind complete, become available again */ - list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&worker->pool->lock); -} - -/* - * Function for @worker->rebind.work used to rebind unbound busy workers to - * the associated cpu which is coming back online. This is scheduled by - * cpu up but can race with other cpu hotplug operations and may be - * executed twice without intervening cpu down. - */ -static void busy_worker_rebind_fn(struct work_struct *work) -{ - struct worker *worker = container_of(work, struct worker, rebind_work); - - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_UNBOUND); - - spin_unlock_irq(&worker->pool->lock); -} - -/** - * rebind_workers - rebind all workers of a pool to the associated CPU - * @pool: pool of interest - * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. - */ -static void rebind_workers(struct worker_pool *pool) -{ - struct worker *worker, *n; - int i; - - lockdep_assert_held(&pool->assoc_mutex); - lockdep_assert_held(&pool->lock); - - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); - - /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). - */ - wake_up_process(worker->task); - } - - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; - - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - - /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. 
- */ - if (std_worker_pool_pri(worker->pool)) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(get_pwq(pool->cpu, wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1680,7 +1657,6 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1703,18 +1679,25 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; + char id_buf[16]; + + lockdep_assert_held(&pool->manager_mutex); + /* + * ID is needed to determine kthread name. Allocate ID first + * without installing the pointer. + */ + idr_preload(GFP_KERNEL); spin_lock_irq(&pool->lock); - while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&pool->lock); - if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) - goto fail; - spin_lock_irq(&pool->lock); - } + + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); + spin_unlock_irq(&pool->lock); + idr_preload_end(); + if (id < 0) + goto fail; worker = alloc_worker(); if (!worker) @@ -1723,40 +1706,46 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (pool->cpu != WORK_CPU_UNBOUND) - worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + if (pool->cpu >= 0) + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); else - worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d%s", id, pri); + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, pool->node, + "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. - * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. 
*/ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } + + /* successful, commit the pointer to idr */ + spin_lock_irq(&pool->lock); + idr_replace(&pool->worker_idr, worker, worker->id); + spin_unlock_irq(&pool->lock); return worker; + fail: if (id >= 0) { spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); + idr_remove(&pool->worker_idr, id); spin_unlock_irq(&pool->lock); } kfree(worker); @@ -1781,6 +1770,30 @@ static void start_worker(struct worker *worker) } /** + * create_and_start_worker - create and start a worker for a pool + * @pool: the target pool + * + * Grab the managership of @pool and create and start a new worker for it. + */ +static int create_and_start_worker(struct worker_pool *pool) +{ + struct worker *worker; + + mutex_lock(&pool->manager_mutex); + + worker = create_worker(pool); + if (worker) { + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + } + + mutex_unlock(&pool->manager_mutex); + + return worker ? 0 : -ENOMEM; +} + +/** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * @@ -1792,11 +1805,14 @@ static void start_worker(struct worker *worker) static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - int id = worker->id; + + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); /* sanity check frenzy */ - BUG_ON(worker->current_work); - BUG_ON(!list_empty(&worker->scheduled)); + if (WARN_ON(worker->current_work) || + WARN_ON(!list_empty(&worker->scheduled))) + return; if (worker->flags & WORKER_STARTED) pool->nr_workers--; @@ -1806,13 +1822,14 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; + idr_remove(&pool->worker_idr, worker->id); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) @@ -1841,23 +1858,21 @@ static void idle_worker_timeout(unsigned long __pool) spin_unlock_irq(&pool->lock); } -static bool send_mayday(struct work_struct *work) +static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - unsigned int cpu; - if (!(wq->flags & WQ_RESCUER)) - return false; + lockdep_assert_held(&wq_mayday_lock); + + if (!wq->rescuer) + return; /* mayday mayday mayday */ - cpu = pwq->pool->cpu; - /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ - if (cpu == WORK_CPU_UNBOUND) - cpu = 0; - if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + if (list_empty(&pwq->mayday_node)) { + list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); - return true; + } } static void pool_mayday_timeout(unsigned long __pool) @@ -1865,7 +1880,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&pool->lock); + spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ + spin_lock(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1878,7 +1894,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_unlock_irq(&wq_mayday_lock); mod_timer(&pool->mayday_timer, jiffies + 
MAYDAY_INTERVAL); } @@ -1893,8 +1910,8 @@ static void pool_mayday_timeout(unsigned long __pool) * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. + * On return, need_to_create_worker() is guaranteed to be %false and + * may_start_working() %true. * * LOCKING: * spin_lock_irq(pool->lock) which may be released and regrabbed @@ -1902,7 +1919,7 @@ static void pool_mayday_timeout(unsigned long __pool) * manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) @@ -1925,7 +1942,8 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); start_worker(worker); - BUG_ON(need_to_create_worker(pool)); + if (WARN_ON_ONCE(need_to_create_worker(pool))) + goto restart; return true; } @@ -1958,7 +1976,7 @@ restart: * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2009,42 +2027,37 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + /* + * Managership is governed by two mutexes - manager_arb and + * manager_mutex. manager_arb handles arbitration of manager role. + * Anyone who successfully grabs manager_arb wins the arbitration + * and becomes the manager. mutex_trylock() on pool->manager_arb + * failure while holding pool->lock reliably indicates that someone + * else is managing the pool and the worker which failed trylock + * can proceed to executing work items. This means that anyone + * grabbing manager_arb is responsible for actually performing + * manager duties. If manager_arb is grabbed and released without + * actual management, the pool may stall indefinitely. + * + * manager_mutex is used for exclusion of actual management + * operations. The holder of manager_mutex can be sure that none + * of management operations, including creation and destruction of + * workers, won't take place until the mutex is released. Because + * manager_mutex doesn't interfere with manager role arbitration, + * it is guaranteed that the pool's management, while may be + * delayed, won't be disturbed by someone else grabbing + * manager_mutex. + */ + if (!mutex_trylock(&pool->manager_arb)) return ret; - pool->flags |= POOL_MANAGING_WORKERS; - /* - * To simplify both worker management and CPU hotplug, hold off - * management while hotplug is in progress. CPU hotplug path can't - * grab %POOL_MANAGING_WORKERS to achieve this because that can - * lead to idle worker depletion (all become busy thinking someone - * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->assoc_mutex to synchronize - * manager against CPU hotplug. - * - * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @pool->lock. + * With manager arbitration won, manager_mutex would be free in + * most cases. trylock first without dropping @pool->lock. 
*/ - if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); - mutex_lock(&pool->assoc_mutex); - /* - * CPU hotplug could have happened while we were waiting - * for assoc_mutex. Hotplug itself can't handle us - * because manager isn't either on idle or busy list, and - * @pool's state and ours could have deviated. - * - * As hotplug is now excluded via assoc_mutex, we can - * simply try to bind. It will succeed or fail depending - * on @pool's current state. Try it and adjust - * %WORKER_UNBOUND accordingly. - */ - if (worker_maybe_bind_and_lock(worker)) - worker->flags &= ~WORKER_UNBOUND; - else - worker->flags |= WORKER_UNBOUND; - + mutex_lock(&pool->manager_mutex); ret = true; } @@ -2057,8 +2070,8 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->manager_arb); return ret; } @@ -2212,11 +2225,11 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools - * of these per each cpu. These workers process all works regardless of - * their specific target workqueue. The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). + * The worker thread function. All workers belong to a worker_pool - + * either a per-cpu one or dynamic unbound one. These workers process all + * work items regardless of their specific target workqueue. The only + * exception is work items which belong to workqueues with a rescuer which + * will be explained in rescuer_thread(). */ static int worker_thread(void *__worker) { @@ -2228,19 +2241,12 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&pool->lock); - /* we are off idle list if destruction or rebind is requested */ - if (unlikely(list_empty(&worker->entry))) { + /* am I supposed to die? */ + if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); - - /* if DIE is set, destruction is requested */ - if (worker->flags & WORKER_DIE) { - worker->task->flags &= ~PF_WQ_WORKER; - return 0; - } - - /* otherwise, rebind */ - idle_worker_rebind(worker); - goto woke_up; + WARN_ON_ONCE(!list_empty(&worker->entry)); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; } worker_leave_idle(worker); @@ -2258,14 +2264,16 @@ recheck: * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ - BUG_ON(!list_empty(&worker->scheduled)); + WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* - * When control reaches this point, we're guaranteed to have - * at least one idle worker or that someone else has already - * assumed the manager role. + * Finish PREP stage. We're guaranteed to have at least one idle + * worker or that someone else has already assumed the manager + * role. This is where @worker starts participating in concurrency + * management if applicable and concurrency management is restored + * after being rebound. See rebind_workers() for details. */ - worker_clr_flags(worker, WORKER_PREP); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = @@ -2307,7 +2315,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. 
There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2326,8 +2334,6 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; - bool is_unbound = wq->flags & WQ_UNBOUND; - unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2345,28 +2351,29 @@ repeat: return 0; } - /* - * See whether any cpu is asking for help. Unbounded - * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. - */ - for_each_mayday_cpu(cpu, wq->mayday_mask) { - unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct pool_workqueue *pwq = get_pwq(tcpu, wq); + /* see whether any pwq is asking for help */ + spin_lock_irq(&wq_mayday_lock); + + while (!list_empty(&wq->maydays)) { + struct pool_workqueue *pwq = list_first_entry(&wq->maydays, + struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); - mayday_clear_cpu(cpu, wq->mayday_mask); + list_del_init(&pwq->mayday_node); + + spin_unlock_irq(&wq_mayday_lock); /* migrate to the target cpu if possible */ + worker_maybe_bind_and_lock(pool); rescuer->pool = pool; - worker_maybe_bind_and_lock(rescuer); /* * Slurp in all works issued via this workqueue and * process'em. */ - BUG_ON(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); @@ -2381,9 +2388,13 @@ repeat: if (keep_working(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + rescuer->pool = NULL; + spin_unlock(&pool->lock); + spin_lock(&wq_mayday_lock); } + spin_unlock_irq(&wq_mayday_lock); + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); @@ -2487,7 +2498,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * advanced to @work_color. * * CONTEXT: - * mutex_lock(wq->flush_mutex). + * mutex_lock(wq->mutex). * * RETURNS: * %true if @flush_color >= 0 and there's something to flush. %false @@ -2497,21 +2508,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; - unsigned int cpu; + struct pool_workqueue *pwq; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(pwq->flush_color != -1); + WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; @@ -2521,7 +2531,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(pwq->work_color)); + WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } @@ -2538,11 +2548,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, * flush_workqueue - ensure that any scheduled work has run to completion. 
* @wq: workqueue to flush * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. + * This function sleeps until all work items which were queued on entry + * have finished execution, but it is not livelocked by new incoming ones. */ void flush_workqueue(struct workqueue_struct *wq) { @@ -2556,7 +2563,7 @@ void flush_workqueue(struct workqueue_struct *wq) lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* * Start-to-wait phase @@ -2569,13 +2576,13 @@ void flush_workqueue(struct workqueue_struct *wq) * becomes our flush_color and work_color is advanced * by one. */ - BUG_ON(!list_empty(&wq->flusher_overflow)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; @@ -2588,7 +2595,7 @@ void flush_workqueue(struct workqueue_struct *wq) } } else { /* wait in queue */ - BUG_ON(wq->flush_color == this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } @@ -2601,7 +2608,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2614,7 +2621,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) return; - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) @@ -2622,8 +2629,8 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = NULL; - BUG_ON(!list_empty(&this_flusher.list)); - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(!list_empty(&this_flusher.list)); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; @@ -2636,8 +2643,8 @@ void flush_workqueue(struct workqueue_struct *wq) complete(&next->done); } - BUG_ON(!list_empty(&wq->flusher_overflow) && - wq->flush_color != work_next_color(wq->work_color)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); @@ -2661,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq) } if (list_empty(&wq->flusher_queue)) { - BUG_ON(wq->flush_color != wq->work_color); + WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } @@ -2669,8 +2676,8 @@ void flush_workqueue(struct workqueue_struct *wq) * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. 
*/ - BUG_ON(wq->flush_color == wq->work_color); - BUG_ON(wq->flush_color != next->flush_color); + WARN_ON_ONCE(wq->flush_color == wq->work_color); + WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; @@ -2686,7 +2693,7 @@ void flush_workqueue(struct workqueue_struct *wq) } out_unlock: - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -2704,22 +2711,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue); void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; - unsigned int cpu; + struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock(&workqueue_lock); + mutex_lock(&wq->mutex); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; - spin_unlock(&workqueue_lock); + wq->flags |= __WQ_DRAINING; + mutex_unlock(&wq->mutex); reflush: flush_workqueue(wq); - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + mutex_lock(&wq->mutex); + + for_each_pwq(pwq, wq) { bool drained; spin_lock_irq(&pwq->pool->lock); @@ -2731,15 +2739,16 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); + + mutex_unlock(&wq->mutex); goto reflush; } - spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; - spin_unlock(&workqueue_lock); + wq->flags &= ~__WQ_DRAINING; + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -2750,11 +2759,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) struct pool_workqueue *pwq; might_sleep(); + + local_irq_disable(); pool = get_work_pool(work); - if (!pool) + if (!pool) { + local_irq_enable(); return false; + } - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -2776,7 +2789,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -2933,66 +2946,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. 
- */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -3085,51 +3038,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + 
rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return system_wq != NULL; + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); + mutex_unlock(&wq->mutex); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; } -static int alloc_pwqs(struct workqueue_struct *wq) +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct 
wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + /* - * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. - * Make sure that the alignment isn't lower than that of - * unsigned long long. + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. */ - const size_t size = sizeof(struct pool_workqueue); - const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, - __alignof__(unsigned long long)); + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; - if (!(wq->flags & WQ_UNBOUND)) - wq->pool_wq.pcpu = __alloc_percpu(size, align); - else { - void *ptr; + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. 
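/*
 * Sketch of the sysfs interface described above.  Passing WQ_SYSFS to
 * alloc_workqueue() registers the workqueue automatically; calling
 * workqueue_sysfs_register() by hand is only needed when attributes must
 * be applied before the workqueue becomes visible.  The workqueue name
 * is illustrative.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_unbound_wq;

static int __init example_sysfs_setup(void)
{
	/*
	 * Shows up as /sys/bus/workqueue/devices/example_unbound with
	 * per_cpu and max_active, plus pool_ids/nice/cpumask/numa since
	 * it is unbound.
	 */
	example_unbound_wq = alloc_workqueue("example_unbound",
					     WQ_UNBOUND | WQ_SYSFS, 0);
	return example_unbound_wq ? 0 : -ENOMEM;
}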
+ */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_copy(attrs->cpumask, cpu_possible_mask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) +{ + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); +} + +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) +{ + u32 hash = 0; + + hash = jhash_1word(attrs->nice, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + return hash; +} + +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) +{ + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. + */ +static int init_worker_pool(struct worker_pool *pool) +{ + spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; + pool->node = NUMA_NO_NODE; + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; + + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); + + mutex_init(&pool->manager_arb); + mutex_init(&pool->manager_mutex); + idr_init(&pool->worker_idr); + + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; + + /* shouldn't fail above this point */ + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; +} + +static void rcu_free_pool(struct rcu_head *rcu) +{ + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + + idr_destroy(&pool->worker_idr); + free_workqueue_attrs(pool->attrs); + kfree(pool); +} + +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. + */ +static void put_unbound_pool(struct worker_pool *pool) +{ + struct worker *worker; + + lockdep_assert_held(&wq_pool_mutex); + + if (--pool->refcnt) + return; + + /* sanity checks */ + if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + WARN_ON(!list_empty(&pool->worklist))) + return; + + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); + + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * manager_mutex. 
+ */ + mutex_lock(&pool->manager_arb); + mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); + + while ((worker = first_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->manager_arb); + + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); + + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); +} + +/** + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get + * + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. On failure, returns NULL. + * + * Should be called with wq_pool_mutex held. + */ +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +{ + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + int node; + + lockdep_assert_held(&wq_pool_mutex); + + /* do we already have a matching pool? */ + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + goto out_unlock; + } + } + + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + if (workqueue_freezing) + pool->flags |= POOL_FREEZING; + + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + copy_workqueue_attrs(pool->attrs, attrs); + + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; + } + } + } + + if (worker_pool_assign_id(pool) < 0) + goto fail; + + /* create and start the initial worker */ + if (create_and_start_worker(pool) < 0) + goto fail; + + /* install */ + hash_add(unbound_pool_hash, &pool->hash_node, hash); +out_unlock: + return pool; +fail: + if (pool) + put_unbound_pool(pool); + return NULL; +} + +static void rcu_free_pwq(struct rcu_head *rcu) +{ + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); +} + +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. + */ +static void pwq_unbound_release_workfn(struct work_struct *work) +{ + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool is_last; + + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; + + /* + * Unlink @pwq. Synchronization against wq->mutex isn't strictly + * necessary on release but do it anyway. It's easier to verify + * and consistent with the linking path. + */ + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + + call_rcu_sched(&pwq->rcu, rcu_free_pwq); + + /* + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Free it. 
+ */ + if (is_last) { + free_workqueue_attrs(wq->unbound_attrs); + kfree(wq); + } +} + +/** + * pwq_adjust_max_active - update a pwq's max_active to the current setting + * @pwq: target pool_workqueue + * + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. + */ +static void pwq_adjust_max_active(struct pool_workqueue *pwq) +{ + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; + + /* for @wq->saved_max_active */ + lockdep_assert_held(&wq->mutex); + + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; + + spin_lock_irq(&pwq->pool->lock); + + if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + pwq->max_active = wq->saved_max_active; + + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); /* - * Allocate enough room to align pwq and put an extra - * pointer at the end pointing back to the originally - * allocated pointer which will be used for free. + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->pool_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->pool_wq.single + 1) = ptr; + wake_up_worker(pwq->pool); + } else { + pwq->max_active = 0; + } + + spin_unlock_irq(&pwq->pool->lock); +} + +/* initialize newly alloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) +{ + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + + memset(pwq, 0, sizeof(*pwq)); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->refcnt = 1; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); + INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); +} + +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq) +{ + struct workqueue_struct *wq = pwq->wq; + + lockdep_assert_held(&wq->mutex); + + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) + return; + + /* + * Set the matching work_color. This is synchronized with + * wq->mutex to avoid confusing flush_workqueue(). 
+ */ + pwq->work_color = wq->work_color; + + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); + + /* link in @pwq */ + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} + +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq_pool_mutex); + + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; + + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); + if (!pwq) { + put_unbound_pool(pool); + return NULL; + } + + init_pwq(pwq, wq, pool); + return pwq; +} + +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kmem_cache_free(pwq_cache, pwq); + } +} + +/** + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask + * + * Calculate the cpumask a workqueue with @attrs should use on @node. If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. This function returns + * %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. + * + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. + * + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + */ +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) +{ + if (!wq_numa_enabled || attrs->no_numa) + goto use_dfl; + + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); + + if (cpumask_empty(cpumask)) + goto use_dfl; + + /* yeap, return possible CPUs in @node that @attrs wants */ + cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + return !cpumask_equal(cpumask, attrs->cpumask); + +use_dfl: + cpumask_copy(cpumask, attrs->cpumask); + return false; +} + +/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ +static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, + int node, + struct pool_workqueue *pwq) +{ + struct pool_workqueue *old_pwq; + + lockdep_assert_held(&wq->mutex); + + /* link_pwq() can handle duplicate calls */ + link_pwq(pwq); + + old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + return old_pwq; +} + +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. 
Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct workqueue_attrs *new_attrs, *tmp_attrs; + struct pool_workqueue **pwq_tbl, *dfl_pwq; + int node, ret; + + /* only unbound workqueues can change attributes */ + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + + pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); + new_attrs = alloc_workqueue_attrs(GFP_KERNEL); + tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pwq_tbl || !new_attrs || !tmp_attrs) + goto enomem; + + /* make a copy of @attrs and sanitize it */ + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + + /* + * We may create multiple pwqs with differing cpumasks. Make a + * copy of @new_attrs which will be modified and used to obtain + * pools. + */ + copy_workqueue_attrs(tmp_attrs, new_attrs); + + /* + * CPUs should stay stable across pwq creations and installations. + * Pin CPUs, determine the target cpumask for each node and create + * pwqs accordingly. + */ + get_online_cpus(); + + mutex_lock(&wq_pool_mutex); + + /* + * If something goes wrong during CPU up/down, we'll fall back to + * the default pwq covering whole @attrs->cpumask. Always create + * it even if we don't use it immediately. + */ + dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!dfl_pwq) + goto enomem_pwq; + + for_each_node(node) { + if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { + pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!pwq_tbl[node]) + goto enomem_pwq; + } else { + dfl_pwq->refcnt++; + pwq_tbl[node] = dfl_pwq; } } - /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); - return wq->pool_wq.v ? 0 : -ENOMEM; + mutex_unlock(&wq_pool_mutex); + + /* all pwqs have been created successfully, let's install'em */ + mutex_lock(&wq->mutex); + + copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + + /* save the previous pwq and install the new one */ + for_each_node(node) + pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + + /* @dfl_pwq might not have been used, ensure it's linked */ + link_pwq(dfl_pwq); + swap(wq->dfl_pwq, dfl_pwq); + + mutex_unlock(&wq->mutex); + + /* put the old pwqs */ + for_each_node(node) + put_pwq_unlocked(pwq_tbl[node]); + put_pwq_unlocked(dfl_pwq); + + put_online_cpus(); + ret = 0; + /* fall through */ +out_free: + free_workqueue_attrs(tmp_attrs); + free_workqueue_attrs(new_attrs); + kfree(pwq_tbl); + return ret; + +enomem_pwq: + free_unbound_pwq(dfl_pwq); + for_each_node(node) + if (pwq_tbl && pwq_tbl[node] != dfl_pwq) + free_unbound_pwq(pwq_tbl[node]); + mutex_unlock(&wq_pool_mutex); + put_online_cpus(); +enomem: + ret = -ENOMEM; + goto out_free; } -static void free_pwqs(struct workqueue_struct *wq) +/** + * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * @wq: the target workqueue + * @cpu: the CPU coming up or going down + * @online: whether @cpu is coming up or going down + * + * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and + * %CPU_DOWN_FAILED. 
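/*
 * Usage sketch for apply_workqueue_attrs() as documented above: build a
 * workqueue_attrs, tweak it, apply it to an unbound workqueue and free
 * it.  The nice value and the node-0 cpumask are illustrative choices.
 */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

static int example_tune_unbound_wq(struct workqueue_struct *unbound_wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;					/* higher priority workers */
	cpumask_copy(attrs->cpumask, cpumask_of_node(0));	/* restrict to node 0 */

	/* creates or reuses matching pools and swaps in per-node pwqs */
	ret = apply_workqueue_attrs(unbound_wq, attrs);

	free_workqueue_attrs(attrs);
	return ret;
}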
@cpu is being hot[un]plugged, update NUMA affinity of + * @wq accordingly. + * + * If NUMA affinity can't be adjusted due to memory allocation failure, it + * falls back to @wq->dfl_pwq which may not be optimal but is always + * correct. + * + * Note that when the last allowed CPU of a NUMA node goes offline for a + * workqueue with a cpumask spanning multiple nodes, the workers which were + * already executing the work items for the workqueue will lose their CPU + * affinity and may execute on any CPU. This is similar to how per-cpu + * workqueues behave on CPU_DOWN. If a workqueue user wants strict + * affinity, it's the user's responsibility to flush the work item from + * CPU_DOWN_PREPARE. + */ +static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, + bool online) { - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->pool_wq.pcpu); - else if (wq->pool_wq.single) { - /* the pointer to free is stored right after the pwq */ - kfree(*(void **)(wq->pool_wq.single + 1)); + int node = cpu_to_node(cpu); + int cpu_off = online ? -1 : cpu; + struct pool_workqueue *old_pwq = NULL, *pwq; + struct workqueue_attrs *target_attrs; + cpumask_t *cpumask; + + lockdep_assert_held(&wq_pool_mutex); + + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + return; + + /* + * We don't wanna alloc/free wq_attrs for each wq for each CPU. + * Let's use a preallocated one. The following buf is protected by + * CPU hotplug exclusion. + */ + target_attrs = wq_update_unbound_numa_attrs_buf; + cpumask = target_attrs->cpumask; + + mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; + + copy_workqueue_attrs(target_attrs, wq->unbound_attrs); + pwq = unbound_pwq_by_node(wq, node); + + /* + * Let's determine what needs to be done. If the target cpumask is + * different from wq's, we need to compare it to @pwq's and create + * a new one if they don't match. If the target cpumask equals + * wq's, the default pwq should be used. If @pwq is already the + * default one, nothing to do; otherwise, install the default one. + */ + if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + goto out_unlock; + } else { + if (pwq == wq->dfl_pwq) + goto out_unlock; + else + goto use_dfl_pwq; + } + + mutex_unlock(&wq->mutex); + + /* create a new pwq */ + pwq = alloc_unbound_pwq(wq, target_attrs); + if (!pwq) { + pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); + goto out_unlock; + } + + /* + * Install the new pwq. As this function is called only from CPU + * hotplug callbacks and applying a new attrs is wrapped with + * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed + * inbetween. 
+ */ + mutex_lock(&wq->mutex); + old_pwq = numa_pwq_tbl_install(wq, node, pwq); + goto out_unlock; + +use_dfl_pwq: + spin_lock_irq(&wq->dfl_pwq->pool->lock); + get_pwq(wq->dfl_pwq); + spin_unlock_irq(&wq->dfl_pwq->pool->lock); + old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); +out_unlock: + mutex_unlock(&wq->mutex); + put_pwq_unlocked(old_pwq); +} + +static int alloc_and_link_pwqs(struct workqueue_struct *wq) +{ + bool highpri = wq->flags & WQ_HIGHPRI; + int cpu; + + if (!(wq->flags & WQ_UNBOUND)) { + wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwqs) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = + per_cpu_ptr(wq->cpu_pwqs, cpu); + struct worker_pool *cpu_pools = + per_cpu(cpu_worker_pools, cpu); + + init_pwq(pwq, wq, &cpu_pools[highpri]); + + mutex_lock(&wq->mutex); + link_pwq(pwq); + mutex_unlock(&wq->mutex); + } + return 0; + } else { + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } } @@ -3151,30 +4078,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { - va_list args, args1; + size_t tbl_size = 0; + va_list args; struct workqueue_struct *wq; - unsigned int cpu; - size_t namelen; + struct pool_workqueue *pwq; - /* determine namelen, allocate wq and format name */ - va_start(args, lock_name); - va_copy(args1, args); - namelen = vsnprintf(NULL, 0, fmt, args) + 1; + /* allocate wq and format name */ + if (flags & WQ_UNBOUND) + tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); - wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) - goto err; + return NULL; - vsnprintf(wq->name, namelen, fmt, args1); - va_end(args); - va_end(args1); + if (flags & WQ_UNBOUND) { + wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!wq->unbound_attrs) + goto err_free_wq; + } - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; + va_start(args, lock_name); + vsnprintf(wq->name, sizeof(wq->name), fmt, args); + va_end(args); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3182,71 +4107,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; - mutex_init(&wq->flush_mutex); + mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); + INIT_LIST_HEAD(&wq->maydays); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_pwqs(wq) < 0) - goto err; - - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + if (alloc_and_link_pwqs(wq) < 0) + goto err_free_wq; - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); - pwq->wq = wq; - pwq->flush_color = -1; - pwq->max_active = max_active; - INIT_LIST_HEAD(&pwq->delayed_works); - } - - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. 
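/*
 * Sketch of the WQ_MEM_RECLAIM rule stated above: a workqueue that sits
 * in a memory-reclaim path gets a rescuer thread, so at least one work
 * item can always make forward progress even when new workers cannot be
 * created.  The name is illustrative.
 */
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_reclaim_wq;

static int example_reclaim_setup(void)
{
	example_reclaim_wq = alloc_workqueue("example_reclaim",
					     WQ_MEM_RECLAIM, 1);
	return example_reclaim_wq ? 0 : -ENOMEM;
}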
+ */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) - goto err; - - wq->rescuer = rescuer = alloc_worker(); + rescuer = alloc_worker(); if (!rescuer) - goto err; + goto err_destroy; rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - if (IS_ERR(rescuer->task)) - goto err; + if (IS_ERR(rescuer->task)) { + kfree(rescuer); + goto err_destroy; + } - rescuer->task->flags |= PF_THREAD_BOUND; + wq->rescuer = rescuer; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* - * workqueue_lock protects global freeze state and workqueues - * list. Grab it, set max_active accordingly and add the new - * workqueue to workqueues list. + * wq_pool_mutex protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new @wq to workqueues + * list. */ - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq_cpu(cpu, wq) - get_pwq(cpu, wq)->max_active = 0; + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); return wq; -err: - if (wq) { - free_pwqs(wq); - free_mayday_mask(wq->mayday_mask); - kfree(wq->rescuer); - kfree(wq); - } + +err_free_wq: + free_workqueue_attrs(wq->unbound_attrs); + kfree(wq); + return NULL; +err_destroy: + destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(__alloc_workqueue_key); @@ -3259,60 +4183,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int cpu; + struct pool_workqueue *pwq; + int node; /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* sanity checks */ + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) { + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) { + if (WARN_ON(pwq->nr_in_flight[i])) { + mutex_unlock(&wq->mutex); + return; + } + } + + if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || + WARN_ON(pwq->nr_active) || + WARN_ON(!list_empty(&pwq->delayed_works))) { + mutex_unlock(&wq->mutex); + return; + } + } + mutex_unlock(&wq->mutex); + /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock(&workqueue_lock); - list_del(&wq->list); - spin_unlock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); + list_del_init(&wq->list); + mutex_unlock(&wq_pool_mutex); - /* sanity check */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - int i; + workqueue_sysfs_unregister(wq); - for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(pwq->nr_in_flight[i]); - BUG_ON(pwq->nr_active); - BUG_ON(!list_empty(&pwq->delayed_works)); - } - - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); + wq->rescuer = NULL; } - free_pwqs(wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -/** - * pwq_set_max_active - adjust max_active of a pwq - * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). 
- */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) -{ - pwq->max_active = max_active; + if (!(wq->flags & WQ_UNBOUND)) { + /* + * The base ref is never dropped on per-cpu pwqs. Directly + * free the pwqs and wq. + */ + free_percpu(wq->cpu_pwqs); + kfree(wq); + } else { + /* + * We're the sole accessor of @wq at this point. Directly + * access numa_pwq_tbl[] and dfl_pwq to put the base refs. + * @wq will be freed when the last pwq is released. + */ + for_each_node(node) { + pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); + put_pwq_unlocked(pwq); + } - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + /* + * Put dfl_pwq. @wq may be freed any time after dfl_pwq is + * put. Don't access it afterwards. + */ + pwq = wq->dfl_pwq; + wq->dfl_pwq = NULL; + put_pwq_unlocked(pwq); + } } +EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue @@ -3326,30 +4268,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - unsigned int cpu; + struct pool_workqueue *pwq; + + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock(&workqueue_lock); + mutex_lock(&wq->mutex); wq->saved_max_active = max_active; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - struct worker_pool *pool = pwq->pool; - - spin_lock_irq(&pool->lock); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); - if (!(wq->flags & WQ_FREEZABLE) || - !(pool->flags & POOL_FREEZING)) - pwq_set_max_active(pwq, max_active); + mutex_unlock(&wq->mutex); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); - spin_unlock_irq(&pool->lock); - } +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); - spin_unlock(&workqueue_lock); + return worker && worker->rescue_wq; } -EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * workqueue_congested - test whether a workqueue is congested @@ -3363,11 +4312,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. 
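/*
 * Sketch of the two interfaces above.  workqueue_set_max_active()
 * updates saved_max_active and lets pwq_adjust_max_active() propagate it
 * to every pwq; current_is_workqueue_rescuer() lets a work function
 * detect that it is running off the rescuer, e.g. to skip optional work
 * under memory pressure.  Function names here are illustrative.
 */
#include <linux/workqueue.h>

static void example_work_fn(struct work_struct *work)
{
	if (current_is_workqueue_rescuer()) {
		/* memory pressure: do only the essential part */
		return;
	}
	/* normal processing */
}

static void example_set_concurrency(struct workqueue_struct *wq, int limit)
{
	/* the value is clamped internally, see wq_clamp_max_active() */
	workqueue_set_max_active(wq, limit);
}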
*/ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq; + bool ret; - return !list_empty(&pwq->delayed_works); + rcu_read_lock_sched(); + + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + + ret = !list_empty(&pwq->delayed_works); + rcu_read_unlock_sched(); + + return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); @@ -3384,19 +4344,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested); */ unsigned int work_busy(struct work_struct *work) { - struct worker_pool *pool = get_work_pool(work); + struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; + local_irq_save(flags); + pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + spin_lock(&pool->lock); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock(&pool->lock); } + local_irq_restore(flags); return ret; } @@ -3422,31 +4385,28 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int i; + int wi; - for_each_std_worker_pool(pool, cpu) { - BUG_ON(cpu != smp_processor_id()); + for_each_cpu_worker_pool(pool, cpu) { + WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); /* - * We've claimed all manager positions. Make all workers + * We've blocked all manager operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_UNBOUND; - - for_each_busy_worker(worker, i, pool) + for_each_pool_worker(worker, wi, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -3477,6 +4437,103 @@ static void wq_unbind_fn(struct work_struct *work) } } +/** + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest + * + * @pool->cpu is coming online. Rebind all workers to the CPU. + */ +static void rebind_workers(struct worker_pool *pool) +{ + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); + + /* + * Restore CPU affinity of all workers. As all idle workers should + * be on the run-queue of the associated CPU before any local + * wake-ups for concurrency management happen, restore CPU affinty + * of all workers first and then clear UNBOUND. As we're called + * from CPU_ONLINE, the following shouldn't fail. + */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); + + spin_lock_irq(&pool->lock); + + for_each_pool_worker(worker, wi, pool) { + unsigned int worker_flags = worker->flags; + + /* + * A bound idle worker should actually be on the runqueue + * of the associated CPU for local wake-ups targeting it to + * work. Kick all idle workers so that they migrate to the + * associated CPU. 
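/*
 * Sketch of the query helpers touched above.  Both report a snapshot
 * that may be stale by the time the caller acts on it, so they are
 * hints for throttling rather than guarantees.  Names are illustrative.
 */
#include <linux/workqueue.h>

static bool example_should_back_off(struct workqueue_struct *wq,
				    struct work_struct *work, int cpu)
{
	unsigned int busy = work_busy(work);

	if (busy & (WORK_BUSY_PENDING | WORK_BUSY_RUNNING))
		return true;

	/* for WQ_UNBOUND workqueues @cpu is mapped to its NUMA node's pwq */
	return workqueue_congested(cpu, wq);
}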
Doing this in the same loop as + * replacing UNBOUND with REBOUND is safe as no worker will + * be bound before @pool->lock is released. + */ + if (worker_flags & WORKER_IDLE) + wake_up_process(worker->task); + + /* + * We want to clear UNBOUND but can't directly call + * worker_clr_flags() or adjust nr_running. Atomically + * replace UNBOUND with another NOT_RUNNING flag REBOUND. + * @worker will clear REBOUND using worker_clr_flags() when + * it initiates the next execution cycle thus restoring + * concurrency management. Note that when or whether + * @worker clears REBOUND doesn't affect correctness. + * + * ACCESS_ONCE() is necessary because @worker->flags may be + * tested without holding any lock in + * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * fail incorrectly leading to premature concurrency + * management operations. + */ + WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); + worker_flags |= WORKER_REBOUND; + worker_flags &= ~WORKER_UNBOUND; + ACCESS_ONCE(worker->flags) = worker_flags; + } + + spin_unlock_irq(&pool->lock); +} + +/** + * restore_unbound_workers_cpumask - restore cpumask of unbound workers + * @pool: unbound pool of interest + * @cpu: the CPU which is coming up + * + * An unbound pool may end up with a cpumask which doesn't have any online + * CPUs. When a worker of such pool get scheduled, the scheduler resets + * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any + * online CPU before, cpus_allowed of all its workers should be restored. + */ +static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) +{ + static cpumask_t cpumask; + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); + + /* is @cpu allowed for @pool? */ + if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) + return; + + /* is @cpu the only online CPU? */ + cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); + if (cpumask_weight(&cpumask) != 1) + return; + + /* as we're called from CPU_ONLINE, the following shouldn't fail */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); +} + /* * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. 
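/*
 * For context, a minimal sketch of the (pre-cpu_hotplug-state) notifier
 * scheme the callbacks below plug into.  register_cpu_notifier(),
 * struct notifier_block and the CPU_* actions are standard interfaces of
 * this kernel generation, not part of this patch; the workqueue
 * callbacks use cpu_notifier()/hotcpu_notifier() with the elevated
 * CPU_PRI_WORKQUEUE_UP/DOWN priorities so pools are ready before
 * ordinary notifiers run.  The callback body is illustrative.
 */
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		pr_info("example: cpu%d came online\n", cpu);
		break;
	case CPU_DOWN_PREPARE:
		pr_info("example: cpu%d is going down\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nb = {
	.notifier_call	= example_cpu_callback,
};

/* registered from an __init function with register_cpu_notifier(&example_cpu_nb) */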
@@ -3485,39 +4542,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; + struct workqueue_struct *wq; + int pi; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_std_worker_pool(pool, cpu) { - struct worker *worker; - + for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; - - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) return NOTIFY_BAD; - - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); } break; case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_std_worker_pool(pool, cpu) { - mutex_lock(&pool->assoc_mutex); - spin_lock_irq(&pool->lock); + mutex_lock(&wq_pool_mutex); - pool->flags &= ~POOL_DISASSOCIATED; - rebind_workers(pool); + for_each_pool(pool, pi) { + mutex_lock(&pool->manager_mutex); + + if (pool->cpu == cpu) { + spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); + + rebind_workers(pool); + } else if (pool->cpu < 0) { + restore_unbound_workers_cpumask(pool, cpu); + } - spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } + + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + + mutex_unlock(&wq_pool_mutex); break; } return NOTIFY_OK; @@ -3531,14 +4595,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; + struct workqueue_struct *wq; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ + /* unbinding per-cpu workers should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); break; } @@ -3571,7 +4644,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3589,44 +4662,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of + * workqueues will queue new works to their delayed_works list instead of * pool->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
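/*
 * Usage sketch for work_on_cpu(), whose @cpu parameter becomes a plain
 * int above.  It runs @fn synchronously on @cpu in process context and
 * returns the function's return value; the caller keeps @cpu online.
 * The helper names are illustrative.
 */
#include <linux/smp.h>
#include <linux/workqueue.h>

static long example_on_cpu_fn(void *arg)
{
	/* executes on the requested CPU */
	return (long)raw_smp_processor_id();
}

static long example_run_on(int cpu)
{
	/* e.g. wrapped in get_online_cpus()/put_online_cpus() by the caller */
	return work_on_cpu(cpu, example_on_cpu_fn, NULL);
}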
*/ void freeze_workqueues_begin(void) { - unsigned int cpu; + struct worker_pool *pool; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + int pi; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - BUG_ON(workqueue_freezing); + WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; - - for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); - - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; - } + /* set FREEZING */ + for_each_pool(pool, pi) { + spin_lock_irq(&pool->lock); + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; + spin_unlock_irq(&pool->lock); + } - spin_unlock_irq(&pool->lock); - } + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); } /** @@ -3636,7 +4705,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases workqueue_lock. + * Grabs and releases wq_pool_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -3644,34 +4713,34 @@ void freeze_workqueues_begin(void) */ bool freeze_workqueues_busy(void) { - unsigned int cpu; bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - BUG_ON(!workqueue_freezing); + WARN_ON_ONCE(!workqueue_freezing); - for_each_wq_cpu(cpu) { - struct workqueue_struct *wq; + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || !(wq->flags & WQ_FREEZABLE)) - continue; - - BUG_ON(pwq->nr_active < 0); + rcu_read_lock_sched(); + for_each_pwq(pwq, wq) { + WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; + rcu_read_unlock_sched(); goto out_unlock; } } + rcu_read_unlock_sched(); } out_unlock: - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); return busy; } @@ -3682,104 +4751,141 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
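/*
 * Sketch of how a freezer-side caller (see kernel/power/process.c)
 * drives the three entry points around here when CONFIG_FREEZER is set;
 * the retry loop and timeout below are simplified and illustrative.
 */
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static int example_freeze_workqueues(void)
{
	int tries = 100;

	freeze_workqueues_begin();	/* freezable pwqs drop to max_active = 0 */

	while (freeze_workqueues_busy()) {
		if (!--tries) {
			thaw_workqueues();	/* give up and undo */
			return -EBUSY;
		}
		msleep(10);
	}
	return 0;			/* thaw_workqueues() undoes this later */
}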
*/ void thaw_workqueues(void) { - unsigned int cpu; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + struct worker_pool *pool; + int pi; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; + /* clear FREEZING */ + for_each_pool(pool, pi) { + spin_lock_irq(&pool->lock); + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; + spin_unlock_irq(&pool->lock); + } - for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; + workqueue_freezing = false; +out_unlock: + mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); +static void __init wq_numa_init(void) +{ + cpumask_var_t *tbl; + int node, cpu; - if (!pwq || pwq->pool != pool || - !(wq->flags & WQ_FREEZABLE)) - continue; + /* determine NUMA pwq table len - highest node id + 1 */ + for_each_node(node) + wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); - /* restore max_active and repopulate worklist */ - pwq_set_max_active(pwq, wq->saved_max_active); - } + if (num_possible_nodes() <= 1) + return; - wake_up_worker(pool); + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + BUG_ON(!wq_update_unbound_numa_attrs_buf); - spin_unlock_irq(&pool->lock); + /* + * We want masks of possible CPUs of each node which isn't readily + * available. Build one from cpu_to_node() which should have been + * fully initialized by now. 
+ */ + tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + BUG_ON(!tbl); + + for_each_node(node) + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + if (WARN_ON(node == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + /* happens iff arch is bonkers, let's just proceed */ + return; } + cpumask_set_cpu(cpu, tbl[node]); } - workqueue_freezing = false; -out_unlock: - spin_unlock(&workqueue_lock); + wq_numa_possible_cpumask = tbl; + wq_numa_enabled = true; } -#endif /* CONFIG_FREEZER */ static int __init init_workqueues(void) { - unsigned int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < WORK_CPU_END * NR_STD_WORKER_POOLS); + WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + wq_numa_init(); + /* initialize CPU pools */ - for_each_wq_cpu(cpu) { + for_each_possible_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { - spin_lock_init(&pool->lock); + i = 0; + for_each_cpu_worker_pool(pool, cpu) { + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; - - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); - - mutex_init(&pool->assoc_mutex); - ida_init(&pool->worker_ida); + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + pool->attrs->nice = std_nice[i++]; + pool->node = cpu_to_node(cpu); /* alloc pool ID */ + mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); + mutex_unlock(&wq_pool_mutex); } } /* create the initial worker */ - for_each_online_wq_cpu(cpu) { + for_each_online_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { - struct worker *worker; + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(create_and_start_worker(pool) < 0); + } + } - if (cpu != WORK_CPU_UNBOUND) - pool->flags &= ~POOL_DISASSOCIATED; + /* create default unbound wq attrs */ + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct workqueue_attrs *attrs; - worker = create_worker(pool); - BUG_ON(!worker); - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - } + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + unbound_std_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec15..84ab6e1dc6fb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,14 +32,12 @@ struct worker { struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ + /* L: for rescuers */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - /* 
for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ - /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; @@ -58,8 +56,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
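/*
 * Finally, a sketch of what the NUMA-affinity changes in this patch mean
 * for an ordinary WQ_UNBOUND user: no API change is needed.  Each node
 * gets its own pool_workqueue, so work queued from a CPU is normally
 * executed by workers affine to that CPU's node (unless NUMA support is
 * disabled).  Names below are illustrative.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_numa_wq;

static void example_queue_local(struct work_struct *work)
{
	/* lands in the pwq of the queueing CPU's node, not one global pwq */
	queue_work(example_numa_wq, work);
}

static int __init example_numa_setup(void)
{
	example_numa_wq = alloc_workqueue("example_numa", WQ_UNBOUND, 0);
	return example_numa_wq ? 0 : -ENOMEM;
}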