From 1ff6bbfd13ca2c114a5cb58e1a92d1e5d68ce0b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Jan 2014 18:10:37 -0500 Subject: arm, pm, vmpressure: add missing slab.h includes arch/arm/mach-tegra/pm.c, kernel/power/console.c and mm/vmpressure.c were somehow getting slab.h indirectly through cgroup.h, which in turn was getting it indirectly through xattr.h. A scheduled cgroup change drops the xattr.h inclusion from cgroup.h and breaks compilation of these three files. Add explicit slab.h includes to the three files. A pending cgroup patch depends on this change and it'd be great if this can be routed through the cgroup/for-3.14-fixes branch. Signed-off-by: Tejun Heo Acked-by: Stephen Warren Cc: Thierry Reding Cc: linux-tegra@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org --- kernel/power/console.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab..aba9c545a0e 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include #include #include +#include <linux/slab.h> #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) -- cgit v1.2.3 From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 6 Feb 2014 15:56:01 -0800 Subject: cgroup: use an ordered workqueue for cgroup destruction Sometimes the cleanup after memcg hierarchy testing gets stuck in mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0. There may turn out to be several causes, but a major cause is this: the workitem to offline the parent can get run before the workitem to offline the child; the parent's mem_cgroup_reparent_charges() circles around waiting for the child's pages to be reparented to its lrus, but it's holding cgroup_mutex, which prevents the child from reaching its mem_cgroup_reparent_charges(). Just use an ordered workqueue for cgroup_destroy_wq. tj: Committing as the temporary fix until the reverse dependency can be removed from memcg. Comment updated accordingly. Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction") Suggested-by: Filipe Brandenburger Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f7..aa95591c143 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. + * + * XXX: Must be ordered to make sure parent is offlined after + * children. The ordering requirement is for memcg where a + * parent's offline may wait for a child's leading to deadlock. In + * the long term, this should be fixed from memcg side. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after.
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.2.3 From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return value in cgroup_mount() When cgroup_mount() fails to allocate an id for the root, it doesn't set ret before jumping to unlock_drop, ending up returning 0 after a failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa95591c143..793f3717607 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) goto unlock_drop; + root_cgrp->id = ret; /* Check for name clashes with existing mounts */ ret = -EBUSY; -- cgit v1.2.3 From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return from cgroup_create() cgroup_create() was returning 0 after allocation failures. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 793f3717607..0eb7b868e1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int ssid, err = 0; + int ssid, err; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return -ENOMEM; name = cgroup_alloc_name(dentry); - if (!name) + if (!name) { + err = -ENOMEM; goto err_free_cgrp; + } rcu_assign_pointer(cgrp->name, name); /* @@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * a half-baked cgroup. */ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) + if (cgrp->id < 0) { + err = -ENOMEM; goto err_free_name; + } /* * Only live parents can have children. Note that the liveliness -- cgit v1.2.3 From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:34 -0500 Subject: cgroup: fix locking in cgroup_cfts_commit() cgroup_cfts_commit() walks the cgroup hierarchy that the target subsystem is attached to and tries to apply the file changes. Due to the convolution with inode locking, it can't keep cgroup_mutex locked while iterating. It currently holds only the RCU read lock around the actual iteration and then pins the found cgroup using dget(). Unfortunately, this is incorrect. Although the iteration does check cgroup_is_dead() before invoking dget(), there's nothing which prevents the dentry from going away in between.
Note that this is different from the usual css iterations where css_tryget() is used to pin the css - css_tryget() tests whether the css can be pinned and fails if not. The problem can be solved by simply holding cgroup_mutex instead of the RCU read lock around the iteration, which actually reduces LOC. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0eb7b868e1a..3edf7163b84 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) */ update_before = cgroup_serial_nr_next; - mutex_unlock(&cgroup_mutex); - /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; @@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) inode = cgrp->dentry->d_inode; dget(cgrp->dentry); - rcu_read_unlock(); - dput(prev); prev = cgrp->dentry; + mutex_unlock(&cgroup_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); if (ret) break; } - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); dput(prev); deactivate_super(sb); return ret; -- cgit v1.2.3 From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 11 Feb 2014 16:05:46 +0800 Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex Set up cgroupfs like this:

# mount -t cgroup -o cpuacct xxx /cgroup
# mkdir /cgroup/sub1
# mkdir /cgroup/sub2

Then run these two commands:

# for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /cgroup/sub1/tmp; } &
# for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /cgroup/sub2/tmp; } &

After a few seconds you may see this warning:

------------[ cut here ]------------
WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0()
idr_remove called for id=6 which is not allocated.
...
Call Trace:
 [] dump_stack+0x7a/0x96
 [] warn_slowpath_common+0x8c/0xc0
 [] warn_slowpath_fmt+0x46/0x50
 [] sub_remove+0x87/0x1b0
 [] ? css_killed_work_fn+0x32/0x1b0
 [] idr_remove+0x25/0xd0
 [] cgroup_destroy_css_killed+0x5b/0xc0
 [] css_killed_work_fn+0x130/0x1b0
 [] process_one_work+0x26c/0x550
 [] worker_thread+0x12e/0x3b0
 [] kthread+0xe6/0xf0
 [] ret_from_fork+0x7c/0xb0
---[ end trace 2d1577ec10cf80d0 ]---

It's because allocating/removing cgroup IDs is not properly synchronized. The bug was introduced when we converted cgroup_ida to cgroup_idr. While ida_simple_{get,remove}() do their synchronization internally, idr_{alloc,remove}() leave it to their users to synchronize concurrent calls. tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from cgroup_create()").
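To make the locking rule concrete, here is a minimal sketch of the discipline the fix applies (a sketch, not the patch itself; error handling elided): every idr_alloc()/idr_remove() on root->cgroup_idr must run under the same lock, since the idr code does no internal locking for these calls:

	/* create path -- cgroup_mutex is the agreed-upon lock */
	mutex_lock(&cgroup_mutex);
	id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
	mutex_unlock(&cgroup_mutex);

	/* destroy path -- must take the same lock, or the idr tree corrupts */
	mutex_lock(&cgroup_mutex);
	idr_remove(&root->cgroup_idr, id);
	mutex_unlock(&cgroup_mutex);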
Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") Cc: #3.12+ Reported-by: Michal Hocko Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 34 ++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104..9450f025fe0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -166,6 +166,8 @@ struct cgroup { * * The ID of the root cgroup is always 0, and a new cgroup * will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. */ int id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3edf7163b84..52719ce55dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ + mutex_lock(&cgroup_mutex); idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -4167,16 +4169,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; + } + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_unlock; + goto err_free_id; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +err_unlock: + mutex_unlock(&cgroup_mutex); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: -- cgit v1.2.3 From 1a11533fbd71792e8c5d36f6763fbce8df0d231d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 19:06:19 -0500 Subject: Revert "cgroup: use an ordered workqueue for cgroup destruction" This reverts commit ab3f5faa6255a0eb4f832675507d9e295ca7e9ba. Explanation from Hugh: It's because more thorough testing, by others here, found that it wasn't always solving the problem: so I asked Tejun privately to hold off from sending it in, until we'd worked out why not. 
With most of our testing being on a v3.11-based kernel, it was perfectly possible that the problem was merely our own, e.g. missing Tejun's 8a2b75384444 ("workqueue: fix ordered workqueues in NUMA setups"). But that turned out not to be enough to fix it either. Then Filipe pointed out how percpu_ref_kill_and_confirm() uses call_rcu_sched() before we ever get to put the offline on to the workqueue: by the time we get to the workqueue, the ordering has already been lost. So, thanks for the Acks, but I'm afraid that this ordered workqueue solution is just not good enough: we should simply forget that patch and provide a different answer. Signed-off-by: Tejun Heo Cc: Hugh Dickins --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52719ce55dd..68d87103b49 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4844,16 +4844,12 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * - * XXX: Must be ordered to make sure parent is offlined after - * children. The ordering requirement is for memcg where a - * parent's offline may wait for a child's leading to deadlock. In - * the long term, this should be fixed from memcg side. + * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.2.3 From e4178d809fdaee32a56833fff1f5056c99e90a1a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Feb 2014 12:24:45 -0800 Subject: printk: fix syslog() overflowing user buffer This is not a buffer overflow in the traditional sense: we don't overflow any *kernel* buffers, but we do mis-count the amount of data we copy back to user space for the SYSLOG_ACTION_READ_ALL case. In particular, if the user buffer is too small to hold everything, and *if* there is a continuation line at just the right place, we can end up giving the user more data than he asked for. The reason is that we first count up the number of bytes all the log records contain, then we walk the records again until we've skipped the records at the beginning that won't fit, and then we walk the rest of the records and copy them to the user space buffer. And in between that "skip the initial records that won't fit" and the "copy the records that *will* fit to user space", we reset the 'prev' variable that contained the record information for the last record not copied. That meant that when we started copying to user space, we now had a different character count than what we had originally calculated in the first record walk-through.
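To see why the two counts diverge, here is a hedged sketch of the counting pass, modeled on the 3.14-era syslog_print_all() (condensed; exact details may differ): the rendered length of each record depends on the previous record's flags, so every pass must feed msg_print_text() the same 'prev' sequence:

	/* pass 1: compute the total length of all records */
	len += msg_print_text(msg, prev, true, NULL, 0);	/* NULL buf: count only */
	prev = msg->flags;

	/* pass 3 renders with the same helper; resetting prev = 0 in between
	 * makes a continuation record render with a different prefix, so the
	 * copied length no longer matches what pass 1 counted */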
The fix is to simply not clear the 'prev' flags value (in both cases where we had the same logic: syslog_print_all and kmsg_dump_get_buffer: the latter is used for pstore-like dumping) Reported-and-tested-by: Debabrata Banerjee Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jeff Mahoney Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b1d255f0413..4dae9cbe925 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); -- cgit v1.2.3 From 45a22f4c11fef4ecd5c61c0a299cd3f23d77be8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Feb 2014 13:09:50 +0100 Subject: inotify: Fix reporting of cookies for inotify events My rework of handling of notification events (namely commit 7053aee26a35 "fsnotify: do not share events between notification groups") broke sending of cookies with inotify events. We didn't propagate the value passed to fsnotify() properly and passed 4 uninitialized bytes to userspace instead (so it is also an information leak). Sadly I didn't notice this during my testing because inotify cookies aren't used very much and LTP inotify tests ignore them. Fix the problem by passing the cookie value properly. Fixes: 7053aee26a3548ebaba046ae2e52396ccf56ac6c Reported-by: Vegard Nossum Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 3 ++- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 0b9ff4395e6..abc8cbcfe90 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -86,7 +86,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0e792f5e314..205dc216382 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -147,7 +147,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { int ret = 0; struct fanotify_event_info *event; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 1d4e1ea2f37..9d3e9c50066 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -179,7 +179,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name); + file_name, 
cookie); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 485eef3f440..ed855ef6f07 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,6 +27,6 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d5ee56348bb..43ab1e1a07a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -67,7 +67,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -103,6 +103,7 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); event->wd = i_mark->wd; + event->sync_cookie = cookie; event->name_len = len; if (len) strcpy(event->name, file_name); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 497395c8274..6528b5a54ca 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -495,7 +495,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3d286ff49ab..c84bc7c2bfc 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -99,7 +99,7 @@ struct fsnotify_ops { struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e7cca..135944a7b28 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac5dcb..70b4554d2fb 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; -- cgit v1.2.3 From 5bdfff96c69a4d5ab9c49e60abf9e070ecd2acbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2014 22:02:28 +0800 
Subject: workqueue: ensure @task is valid across kthread_stop() When a kworker should die, the kworker is notified through the WORKER_DIE flag instead of kthread_should_stop(). This, IIRC, is primarily to keep the test synchronized under the worker_pool lock. WORKER_DIE is first set while holding pool->lock, the lock is dropped and kthread_stop() is called. Unfortunately, this means that there's a slight chance that the target kworker may see WORKER_DIE before kthread_stop() finishes, exit on its own, and free the target task before or during kthread_stop(). Fix it by pinning the target task before setting WORKER_DIE and putting it after kthread_stop() is done. tj: Improved patch description and comment. Moved the pinning above WORKER_DIE to better signify what it's protecting. CC: stable@vger.kernel.org Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b747..193e977a10e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker) if (worker->flags & WORKER_IDLE) pool->nr_idle--; + /* + * Once WORKER_DIE is set, the kworker may destroy itself at any + * point. Pin to ensure the task stays until we're done with it. + */ + get_task_struct(worker->task); + list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker) spin_unlock_irq(&pool->lock); kthread_stop(worker->task); + put_task_struct(worker->task); kfree(worker); spin_lock_irq(&pool->lock); -- cgit v1.2.3 From 532de3fc72adc2a6525c4d53c07bf81e1732083d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 13:29:31 -0500 Subject: cgroup: update cgroup_enable_task_cg_lists() to grab siglock Currently, there's nothing preventing cgroup_enable_task_cg_lists() from missing a freshly set PF_EXITING and racing against cgroup_exit(). Depending on the timing, cgroup_exit() may finish with the task still linked on the css_set, leading to list corruption. Fix it by grabbing siglock in cgroup_enable_task_cg_lists() so that PF_EXITING is guaranteed to be visible. This whole on-demand cg_list optimization is extremely fragile and has ample potential to lead to bugs which can cause things like a once-a-year oops during boot. I'm wondering whether the better approach would be just adding "cgroup_disable=all" handling which disables cgroup as a whole rather than tempting fate with this on-demand craziness. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68d87103b49..105f273b6f8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2905,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void) * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit().
*/ + spin_lock_irq(&p->sighand->siglock); if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v1.2.3 From 5ae8aabeaec3fe69c4fb21cbe5b17b72b35b5892 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Feb 2014 10:45:36 -0800 Subject: sched_clock: Prevent callers from seeing half-updated data The generic sched_clock registration function was previously done lockless, due to the fact that it was expected to be called only once. However, now there are systems that may register multiple sched_clock sources, for which the lack of locking has caused problems: If two sched_clock sources are registered we may end up in a situation where a call to sched_clock() may be accessing the epoch cycle count for the old counter and the cycle count for the new counter. This can lead to confusing results where sched_clock() values jump and then are reset to 0 (due to the way the registration function forces the epoch_ns to be 0). Fix this by reorganizing the registration function to hold the seqlock for as short a time as possible while we update the clock_data structure for a new counter. We also put any accumulated time into epoch_ns instead of resetting the time to 0 so that the clock doesn't reset after each successful registration. [jstultz: Added extra context to the commit message] Reported-by: Will Deacon Signed-off-by: Stephen Boyd Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Josh Cartwright Link: http://lkml.kernel.org/r/1392662736-7803-2-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/sched_clock.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb3646428..4d23dc4d813 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns.
*/ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); -- cgit v1.2.3 From b080e047a61f7050246ff3081f87832997170d29 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sun, 16 Feb 2014 22:58:12 -0500 Subject: user_namespace.c: Remove duplicated word in comment Signed-off-by: Brian Campbell Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf39..dd06439b9c8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) -- cgit v1.2.3 From 3d5f35bdfdef5fd627afe9b4bf9c4f32d17f4593 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 20 Feb 2014 09:19:39 +0100 Subject: sched/deadline: Fix bad accounting of nr_running Rostedt writes: My test suite was locking up hard when enabling mmiotracer. This was due to the mmiotracer placing all but one CPU offline. I found this out when I was able to reproduce the bug with just my stress-cpu-hotplug test. This bug baffled me because it would not always trigger, and would only trigger on the first run after boot up. The stress-cpu-hotplug test would crash hard the first run, or never crash at all. But a new reboot may cause it to crash on the first run again. I spent all week bisecting this, as I couldn't find a consistent reproducer. I finally narrowed it down to the sched deadline patches, and even more peculiar, to the commit that added the sched deadline boot up self test to the latency tracer. 
Then it dawned on me what the bug was. All it took was to run a task under sched deadline to screw up the CPU hot plugging. This explained why it would lock up only on the first run of the stress-cpu-hotplug test. The bug happened when the boot up self test of the schedule latency tracer would test a deadline task. The deadline task would corrupt something that would cause CPU hotplug to fail. If it didn't corrupt it, the stress test would always work (there are no other sched deadline tasks that would run to cause problems). If it did corrupt on boot up, the first test would lock up hard. I proved this theory by running my deadline test program on another box, and then running the stress-cpu-hotplug test, and it would now consistently lock up. I could run stress-cpu-hotplug over and over with no problem, but once I ran the deadline test, the next run of the stress-cpu-hotplug would lock hard. After adding lots of tracing to the code, I found the cause. The function tracer showed that migrate_tasks() was stuck in an infinite loop, where rq->nr_running never equaled 1 to break out of it. When I added a trace_printk() to see what that number was, it was 335 and never decrementing! Looking at the deadline code I found:

static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	dequeue_dl_entity(&p->dl);
	dequeue_pushable_dl_task(rq, p);
}

static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
	update_curr_dl(rq);
	__dequeue_task_dl(rq, p, flags);
	dec_nr_running(rq);
}

And this:

	if (dl_runtime_exceeded(rq, dl_se)) {
		__dequeue_task_dl(rq, curr, 0);
		if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
			dl_se->dl_throttled = 1;
		else
			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);

		if (!is_leftmost(curr, &rq->dl))
			resched_task(curr);
	}

Notice how we call __dequeue_task_dl() and in the else case we call enqueue_task_dl()? Also notice that the dequeue call is the underscored variant, __dequeue_task_dl(), while the enqueue call is the plain enqueue_task_dl(). The enqueue_task_dl() calls inc_nr_running(rq), but __dequeue_task_dl() does not. This is where we get nr_running out of sync.

[snip]

Another point where nr_running can get out of sync is when the dl_timer fires:

	dl_se->dl_throttled = 0;
	if (p->on_rq) {
		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
		if (task_has_dl_policy(rq->curr))
			check_preempt_curr_dl(rq, p, 0);
		else
			resched_task(rq->curr);
	}

This patch does two things:

- correctly accounts for throttled tasks (that are now considered !running);

- fixes the bug, updating nr_running from {inc,dec}_dl_tasks(), since we risk updating it twice in some situations (e.g., a task is dequeued while it has exceeded its budget).
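For context, the loop that hung is roughly this (condensed from the 3.14-era migrate_tasks(); details elided): it spins until only the migration thread itself remains on the runqueue, so an inflated rq->nr_running that never returns to 1 spins forever:

	for ( ; ; ) {
		/* the migration thread itself is the one remaining task */
		if (rq->nr_running == 1)
			break;
		next = pick_next_task(rq);
		/* ... migrate 'next' off this CPU ... */
	}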
Cc: mingo@redhat.com Cc: torvalds@linux-foundation.org Cc: akpm@linux-foundation.org Reported-by: Steven Rostedt Reviewed-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392884379-13744-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a0..b819577c21d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -717,6 +717,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -730,6 +731,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -836,8 +838,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -850,8 +850,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* -- cgit v1.2.3 From 4df1638cfaf9b2b7ad993979a41965acab9cd156 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 19 Feb 2014 13:53:35 -0500 Subject: sched/deadline: Fix overflow to handle period==0 and deadline!=0 While debugging the crash with the bad nr_running accounting, I hit another bug where, after running my sched deadline test, I was getting failures to take a CPU offline. It was giving me a -EBUSY error. Adding a bunch of trace_printk()s around, I found that the cpu notifier that called sched_cpu_inactive() was returning a failure. The overflow value was coming up negative? Talking this over with Juri, the problem is that the total_bw update was supposed to be made by dl_overflow() which, during my tests, seemed to not be called. Adding more trace_printk()s, it wasn't that it wasn't called, but it exited out right away with the check of new_bw being equal to p->dl.dl_bw. The new_bw value is the ratio between runtime and period. The bug is that if you set a deadline, you do not need to set a period if you plan on the period being equal to the deadline. That is, if period is zero and deadline is not, then the system call should set the period to be equal to the deadline. This is done elsewhere in the code. The fix is easy: check if period is set, and if it is not, then use the deadline.
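For readers unfamiliar with it, the one-line fix (visible in the diff below) uses the GNU "a ?: b" extension, which evaluates to a when a is non-zero, without evaluating a twice. A plain-C equivalent of the fixed line:

	u64 period = attr->sched_period;
	if (!period)
		period = attr->sched_deadline;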
Cc: Juri Lelli Cc: Ingo Molnar Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219135335.7e74abd4@gandalf.local.home Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aa..24914488da4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; -- cgit v1.2.3 From e9e7cb38c21c80c82af4b16608bb4c8c5ec6a28e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:26 +0100 Subject: sched/core: Fix sched_rt_global_validate Don't compare sysctl_sched_rt_runtime against sysctl_sched_rt_period if the former is equal to RUNTIME_INF, otherwise disabling -rt bandwidth management (with CONFIG_RT_GROUP_SCHED=n) fails. Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-2-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24914488da4..98d33c10525 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7475,7 +7475,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; -- cgit v1.2.3 From 495163420ab5398c84af96ca3eae2c6aa4a140da Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:27 +0100 Subject: sched/core: Make dl_b->lock IRQ safe Fix this lockdep warning:

[ 44.804600] =========================================================
[ 44.805746] [ INFO: possible irq lock inversion dependency detected ]
[ 44.805746] 3.14.0-rc2-test+ #14 Not tainted
[ 44.805746] ---------------------------------------------------------
[ 44.805746] bash/3674 just changed the state of lock:
[ 44.805746]  (&dl_b->lock){+.....}, at: [] sched_rt_handler+0x132/0x248
[ 44.805746] but this lock was taken by another, HARDIRQ-safe lock in the past:
[ 44.805746]  (&rq->lock){-.-.-.}

and interrupts could create inverse lock ordering between them.

[ 44.805746]
[ 44.805746] other info that might help us debug this:
[ 44.805746]  Possible interrupt unsafe locking scenario:
[ 44.805746]
[ 44.805746]        CPU0                    CPU1
[ 44.805746]        ----                    ----
[ 44.805746]   lock(&dl_b->lock);
[ 44.805746]                               local_irq_disable();
[ 44.805746]                               lock(&rq->lock);
[ 44.805746]                               lock(&dl_b->lock);
[ 44.805746]  <Interrupt>
[ 44.805746]    lock(&rq->lock);

by making dl_b->lock acquisition always IRQ safe.
Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98d33c10525..33d030a133d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7422,6 +7422,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7435,10 +7436,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7451,6 +7452,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7464,9 +7466,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } -- cgit v1.2.3 From 3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 18 Feb 2014 17:12:44 -0500 Subject: sched,numa: add cond_resched to task_numa_work Normally task_numa_work scans over a fairly small amount of memory, but it is possible to run into a large unpopulated part of virtual memory, with no pages mapped. In that case, task_numa_work can run for a while, and it may make sense to reschedule as required. Cc: akpm@linux-foundation.org Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Reported-by: Xing Gang Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-riel@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2bfcb7..78157099b16 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } -- cgit v1.2.3 From 4efbc454ba68def5ef285b26ebfcfdb605b52755 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 16 Feb 2014 22:24:17 +0100 Subject: sched: Fix information leak in sys_sched_getattr() We're copying the on-stack structure to userspace, but forgot to give the right number of bytes to copy. This allows the calling process to obtain up to PAGE_SIZE bytes from the stack (and possibly adjacent kernel memory). This fix copies only as much as we actually have on the stack (attr->size defaults to the size of the struct) and leaves the rest of the userspace-provided buffer untouched. Found using kmemcheck + trinity. 
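A sketch of why bounding the copy by attr->size is safe (names from the 3.14-era sched_read_attr(); condensed, not the verbatim code): the function clamps attr->size to usize beforehand, so attr->size is always min(usize, sizeof(*attr)) and the bounded copy can neither over-read the on-stack struct nor over-write the user buffer:

	if (usize < sizeof(*attr))
		attr->size = usize;	/* truncate for old userspace */

	ret = copy_to_user(uattr, attr, attr->size);	/* was: usize */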
Fixes: d50dde5a10f30 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Cc: Dario Faggioli Cc: Juri Lelli Cc: Ingo Molnar Signed-off-by: Vegard Nossum Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392585857-10725-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33d030a133d..a6e7470166c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3786,7 +3786,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; -- cgit v1.2.3 From 6d35ab48090b10c5ea5604ed5d6e91f302dc6060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 17:19:29 +0100 Subject: sched: Add 'flags' argument to sched_{set,get}attr() syscalls Because of a recent syscall design debate, it's deemed appropriate for each syscall to have a flags argument for future extension, without immediately requiring new syscalls. Cc: juri.lelli@gmail.com Cc: Ingo Molnar Suggested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140214161929.GL27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/syscalls.h | 6 ++++-- kernel/sched/core.c | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40ed9e9a77e..a747a77ea58 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -281,13 +281,15 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_setattr(pid_t pid, - struct sched_attr __user *attr); + struct sched_attr __user *attr, + unsigned int flags); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_getattr(pid_t pid, struct sched_attr __user *attr, - unsigned int size); + unsigned int size, + unsigned int flags); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6e7470166c..6edbef296ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3804,8 +3805,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp.
*/ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); -- cgit v1.2.3 From 82b95800b256205cff2eeab5bbd03430d2d0f20d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 17 Feb 2014 09:12:33 -0500 Subject: sched/deadline: Test for CPU's presence explicitly A hot-removed CPU may have an ID that is numerically larger than the number of existing CPUs in the system (e.g. we can unplug CPU 4 from a system that has CPUs 0, 1 and 4). Thus the WARN_ONs should check whether the CPU in question is currently present, not whether its ID value is less than num_present_cpus(). Cc: Ingo Molnar Cc: Juri Lelli Cc: Steven Rostedt Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Boris Ostrovsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392646353-1874-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/cpudeadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74e3f0..5b8838b56d1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); return best_cpu; } @@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->cpu_to_idx[cpu]; -- cgit v1.2.3 From 995b9ea440862def83e8fcb1b498e68f93d4af59 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 18 Feb 2014 02:24:13 +0400 Subject: sched/deadline: Remove useless dl_nr_total In the deadline class we do not have group scheduling like in RT. dl_nr_total is the same as dl_nr_running. So, one of them should be removed.
Cc: Ingo Molnar Cc: Juri Lelli Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/368631392675853@web20h.yandex.ru Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 4 +--- kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b819577c21d..15cbc17fbf8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -137,7 +137,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -149,7 +148,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8..f964add50f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* -- cgit v1.2.3 From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 is calling disable_irq(), but hangs at synchronize_irq() forever; the corresponding irq thread is in sleeping state; and all CPUs are in idle state. After analysis, we found there is one possible scenario which causes T1 to wait there forever:

        CPU0                            CPU1
 synchronize_irq()
  wait_event()
   spin_lock()
                                 atomic_dec_and_test(&threads_active)
   insert the __wait into queue
   spin_unlock()
                                 if (waitqueue_active)
   atomic_read(&threads_active)
                                   wake_up()

Here, after CPU0 has inserted the __wait into the queue, and before CPU1 tests whether the queue is empty, there is no barrier; even though CPU0 has updated the queue list, the update may not be visible to CPU1 immediately. The same applies to CPU0's atomic_read() of threads_active. So we'd need one smp_mb() before waitqueue_active(), but removing the waitqueue_active() check solves it as well, and it makes things simple and clear.
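Why is the unconditional wake_up() safe? A hedged sketch of the pairing (not from the patch): wake_up() takes the waitqueue's internal spinlock, which the waiter also takes when queueing itself inside wait_event(), so the waker can no longer miss a concurrently-added waiter:

	/* waiter side (synchronize_irq()): queues itself under q->lock */
	wait_event(desc->wait_for_threads,
		   !atomic_read(&desc->threads_active));

	/* waker side: wake_up() serializes on the same q->lock, so no
	 * extra barrier or waitqueue_active() pre-check is needed */
	if (atomic_dec_and_test(&desc->threads_active))
		wake_up(&desc->wait_for_threads);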
Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b1..d3bf660cb57 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.2.3 From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0, which appears to guarantee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule(), the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at a time to a serial tty (16550 UART) in a tight loop. I'm also able to verify that making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78157099b16..9b4c4f32013 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.2.3 From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In the deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations.
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17fbf8..aecf93030e0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.2.3 From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by the cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b56d1..5b9bb42b2d4 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.2.3 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about the bandwidth timer being set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT tasks were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when the rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill's problem, it has a drawback.
Indeed, Kirill noted again:

  It looks like we may get into a situation where all CPU time is
  shared between RT and DL tasks:

    rt_runtime = n
    rt_period  = 2n

    |  RT working, DL sleeping  |  DL working, RT sleeping      |
    -------------------------------------------------------------
    | (1) duration = n          | (2) duration = n              | (repeat)
    |---------------------------|-------------------------------|
    | (rt_bw timer is running)  | (rt_bw timer is not running)  |

  No time for fair tasks at all.

While this can happen during the first period, if the rq is always
backlogged, RT tasks won't have the opportunity to execute anymore:
rt_time reached rt_runtime during (1); suppose that after (2) RT is
enqueued back, it gets throttled since the rt timer didn't fire, and
replenishment is from now on eaten up by DL tasks that accrue their
execution on rt_time (while the rt timer is active, we have an RT task
waiting for replenishment). FAIR tasks are not touched after this first
period.

Ok, this is not ideal, and the situation is even worse! What is
described above (the nice case) practically never happens in reality,
where your rt timer is not aligned to task periods, tasks are in
general not periodic, etc. Long story short, you always risk
overloading your system.

This patch is based on Peter's idea, but exploits an additional fact:
if you don't have RT tasks enqueued, it makes little sense to continue
incrementing rt_time once you've reached the upper limit (DL tasks have
their own mechanism for throttling).

This cures both problems:

 - no matter how many DL instances ran in the past, you'll have an
   rt_time slightly above rt_runtime when an RT task is enqueued, and
   from that point on (after the first replenishment) the task will
   execute normally;

 - you can still eat up all the bandwidth during the first period, but
   not anymore after that; remember that DL execution will increment
   rt_time till the upper limit is reached.

The situation is still not perfect! But we have a simple solution for
now, one that limits how much you can jeopardize your system, as we
keep working towards the right answer: RT groups scheduled using
deadline servers.

Reported-by: Kirill Tkhai
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra
Cc: Steven Rostedt
Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/deadline.c | 8 ++++++--
 kernel/sched/rt.c       | 8 ++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aecf93030e0..6e79b3faa4c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
 	return 1;
 }

+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq)
 		struct rt_rq *rt_rq = &rq->rt;

 		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
 		/*
 		 * We'll let actual RT tasks worry about the overflow here, we
-		 * have our own CBS to keep us inline -- see above.
+		 * have our own CBS to keep us inline; only account when RT
+		 * bandwidth is relevant.
		 */
+		if (sched_rt_bandwidth_account(rt_rq))
+			rt_rq->rt_time += delta_exec;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b4..1999021042c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)

 #endif /* CONFIG_RT_GROUP_SCHED */

+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+	return (hrtimer_active(&rt_b->rt_period_timer) ||
+		rt_rq->rt_time < rt_b->rt_runtime);
+}
+
 #ifdef CONFIG_SMP
 /*
  * We ran out of runtime, see if we can borrow some from our neighbours.
--
cgit v1.2.3

From e3703f8cdfcf39c25c4338c3ad8e68891cca3731 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 24 Feb 2014 12:06:12 +0100
Subject: perf: Fix hotplug splat

Drew Richardson reported that he could make the kernel go *boom* when
hotplugging while having perf events active.

It turned out that when you have a group event, the code in
__perf_event_exit_context() fails to remove the group siblings from
the context.

We then proceed with destroying and freeing the event, and when you
re-plug the CPU and try to add another event to that CPU, things go
*boom* because you've still got dead entries there.

Reported-by: Drew Richardson
Signed-off-by: Peter Zijlstra
Cc: Will Deacon
Cc:
Link: http://lkml.kernel.org/n/tip-k6v5wundvusvcseqj1si0oz0@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd..fa0b2d4ad83 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7856,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 static void __perf_event_exit_context(void *__info)
 {
 	struct perf_event_context *ctx = __info;
-	struct perf_event *event, *tmp;
+	struct perf_event *event;

 	perf_pmu_rotate_stop(ctx->pmu);

-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		__perf_remove_from_context(event);
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event);
+	rcu_read_unlock();
 }

 static void perf_event_exit_cpu_context(int cpu)
@@ -7887,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

+	perf_event_exit_cpu_context(cpu);
+
 	mutex_lock(&swhash->hlist_mutex);
 	swevent_hlist_release(swhash);
 	mutex_unlock(&swhash->hlist_mutex);
-
-	perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
--
cgit v1.2.3

From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001
From: Rashika Kheria
Date: Thu, 27 Feb 2014 17:10:12 +0530
Subject: genirq: Include missing header file in irqdomain.c

Include the header file include/linux/of_irq.h in
kernel/irq/irqdomain.c, because it contains the prototype of a function
defined in kernel/irq/irqdomain.c.
This eliminates the following warning in kernel/irq/irqdomain.c:

  kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes]

Signed-off-by: Rashika Kheria
Reviewed-by: Josh Triplett
Cc: Benjamin Herrenschmidt
Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com
Signed-off-by: Thomas Gleixner
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb36fe5..f14033700c2 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
--
cgit v1.2.3

From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 27 Feb 2014 18:19:03 +0800
Subject: cpuset: fix a locking issue in cpuset_migrate_mm()

I can trigger a lockdep warning:

  # mount -t cgroup -o cpuset xxx /cgroup
  # mkdir /cgroup/cpuset
  # mkdir /cgroup/tmp
  # echo 0 > /cgroup/tmp/cpuset.cpus
  # echo 0 > /cgroup/tmp/cpuset.mems
  # echo 1 > /cgroup/tmp/cpuset.memory_migrate
  # echo $$ > /cgroup/tmp/tasks
  # echo 1 > /cgroup/tmp/cpuset.mems

  ===============================
  [ INFO: suspicious RCU usage. ]
  3.14.0-rc1-0.1-default+ #32 Not tainted
  -------------------------------
  include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage!
  ...
  [] dump_stack+0x72/0x86
  [] lockdep_rcu_suspicious+0x101/0x140
  [] cpuset_migrate_mm+0xb1/0xe0
  ...

We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now
we hold cpuset_mutex, which causes task_css() to complain.

This is not a false-positive but a real issue. Holding cpuset_mutex
won't prevent a task from migrating to another cpuset, and it won't
prevent the original task->cgroup from being destroyed during this
change.

Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking)
Cc: # 3.9+
Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4410ac6a55f..dba9e4aef69 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * Temporarilly set tasks mems_allowed to target nodes of migration,
  * so that the migration code can allocate pages on these nodes.
  *
- * Call holding cpuset_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
  * While the mm_struct we are migrating is typically from some
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,

 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

+	rcu_read_lock();
 	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
 	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+	rcu_read_unlock();
 }

 /*
--
cgit v1.2.3

From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 27 Feb 2014 18:19:36 +0800
Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall()

It's not safe to access a task's cpuset after releasing task_lock().
Holding callback_mutex won't help.

Cc:
Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index dba9e4aef69..e6b1b66afe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)

 	task_lock(current);
 	cs = nearest_hardwall_ancestor(task_cs(current));
+	allowed = node_isset(node, cs->mems_allowed);
 	task_unlock(current);

-	allowed = node_isset(node, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 	return allowed;
 }
--
cgit v1.2.3

From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 26 Feb 2014 13:37:38 -0500
Subject: tracing: Do not add event files for modules that fail tracepoints

If a module fails to add its tracepoints due to module tainting, do not
create the module event infrastructure in the debugfs directory. The
events will not work and, worse yet, they will fail silently, leaving
the user to wonder why the events they enable do not display anything.

A warning on module load, combined with the events not being visible at
all, makes the cause of the problem much clearer.

Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org

Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT"
Acked-by: Mathieu Desnoyers
Cc: stable@vger.kernel.org # 2.6.31+
Cc: Rusty Russell
Signed-off-by: Steven Rostedt
---
 include/linux/tracepoint.h  |  6 ++++++
 kernel/trace/trace_events.c | 10 ++++++++++
 kernel/tracepoint.c         |  7 ++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index accc497f8d7..7159a0a933d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -60,6 +60,12 @@ struct tp_module {
 	unsigned int num_tracepoints;
 	struct tracepoint * const *tracepoints_ptrs;
 };
+bool trace_module_has_bad_taint(struct module *mod);
+#else
+static inline bool trace_module_has_bad_taint(struct module *mod)
+{
+	return false;
+}
 #endif /* CONFIG_MODULES */

 struct tracepoint_iter {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e71ffd4eccb..f3989ceb5cd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_event_call **call, **start, **end;

+	if (!mod->num_trace_events)
+		return;
+
+	/* Don't add infrastructure for mods without tracepoints */
+	if (trace_module_has_bad_taint(mod)) {
+		pr_err("%s: module has bad taint, not creating trace events\n",
+		       mod->name);
+		return;
+	}
+
 	start = mod->trace_events;
 	end = mod->trace_events + mod->num_trace_events;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c..031cc5655a5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);

 #ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
+{
+	return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
+}
+
 static int tracepoint_module_coming(struct module *mod)
 {
 	struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
 	 * module headers (for forced load), to make sure we don't cause a crash.
 	 * Staging and out-of-tree GPL modules are fine.
	 */
-	if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
+	if (trace_module_has_bad_taint(mod))
 		return 0;
 	mutex_lock(&tracepoints_mutex);
 	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
--
cgit v1.2.3

From c1bacbae8192dd2a9ebadd22d793b68054f6c6e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 8 Mar 2014 08:59:58 +0100
Subject: genirq: Provide irq_request/release_resources chip callbacks

For certain irq types, e.g. gpios, it's necessary to request resources
before starting up the irq. This might fail, so we cannot use the
irq_startup() callback, because we might call the irq_set_type()
callback before that, which does not make sense when the resource is
not available. Calling irq_startup() before irq_set_type() can lead to
spurious interrupts, which is not desired either.

Signed-off-by: Thomas Gleixner
Cc: Jean-Jacques Hiblot
Cc: Grant Likely
Cc: linux-arm-kernel@lists.infradead.org
Reviewed-by: Linus Walleij
Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1403080857160.18573@ionos.tec.linutronix.de
Signed-off-by: Thomas Gleixner
---
 include/linux/irq.h |  6 ++++++
 kernel/irq/manage.c | 28 +++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7dc10036eff..e675971bdc3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -303,6 +303,10 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_pm_shutdown:	function called from core code on shutdown once per chip
  * @irq_calc_mask:	Optional function to set irq_data.mask for special cases
  * @irq_print_chip:	optional to print special chip info in show_interrupts
+ * @irq_request_resources:	optional to request resources before calling
+ *				any other callback related to this irq
+ * @irq_release_resources:	optional to release resources acquired with
+ *				irq_request_resources
  * @flags:		chip specific flags
  */
 struct irq_chip {
@@ -336,6 +340,8 @@ struct irq_chip {
 	void		(*irq_calc_mask)(struct irq_data *data);

 	void		(*irq_print_chip)(struct irq_data *data, struct seq_file *p);
+	int		(*irq_request_resources)(struct irq_data *data);
+	void		(*irq_release_resources)(struct irq_data *data);

 	unsigned long	flags;
 };
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d3bf660cb57..5d35cbe2289 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -896,6 +896,23 @@ static void irq_setup_forced_threading(struct irqaction *new)
 	}
 }

+static int irq_request_resources(struct irq_desc *desc)
+{
+	struct irq_data *d = &desc->irq_data;
+	struct irq_chip *c = d->chip;
+
+	return c->irq_request_resources ? c->irq_request_resources(d) : 0;
+}
+
+static void irq_release_resources(struct irq_desc *desc)
+{
+	struct irq_data *d = &desc->irq_data;
+	struct irq_chip *c = d->chip;
+
+	if (c->irq_release_resources)
+		c->irq_release_resources(d);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -1091,6 +1108,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}

 	if (!shared) {
+		ret = irq_request_resources(desc);
+		if (ret) {
+			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+			       new->name, irq, desc->irq_data.chip->name);
+			goto out_mask;
+		}
+
 		init_waitqueue_head(&desc->wait_for_threads);

 		/* Setup the type (level, edge polarity) if configured: */
@@ -1261,8 +1285,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	*action_ptr = action->next;

 	/* If this was the last handler, shut down the IRQ line: */
-	if (!desc->action)
+	if (!desc->action) {
 		irq_shutdown(desc);
+		irq_release_resources(desc);
+	}

 #ifdef CONFIG_SMP
 	/* make sure affinity_hint is cleaned up */
--
cgit v1.2.3
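The new hooks follow the kernel's usual optional-callback convention: a
NULL chip method simply means there is nothing to request or release. A
minimal userspace sketch of that guarded-invocation pattern follows (the
types and the gpio_request_resources() hook below are illustrative
stand-ins, not kernel code):

	#include <stdio.h>

	/* simplified stand-ins for the kernel's struct irq_data / struct irq_chip */
	struct irq_data {
		int irq;
	};

	struct irq_chip {
		const char *name;
		int  (*irq_request_resources)(struct irq_data *d);
		void (*irq_release_resources)(struct irq_data *d);
	};

	/* hypothetical chip hook: claim whatever the line needs before startup */
	static int gpio_request_resources(struct irq_data *d)
	{
		printf("irq %d: claiming the gpio line\n", d->irq);
		return 0;	/* a nonzero return would abort setup */
	}

	/* mirrors the core helpers above: a NULL hook means "nothing to do" */
	static int irq_request_resources(struct irq_chip *c, struct irq_data *d)
	{
		return c->irq_request_resources ? c->irq_request_resources(d) : 0;
	}

	static void irq_release_resources(struct irq_chip *c, struct irq_data *d)
	{
		if (c->irq_release_resources)
			c->irq_release_resources(d);
	}

	int main(void)
	{
		struct irq_data d = { .irq = 42 };
		struct irq_chip gpio  = { "gpio",  gpio_request_resources, NULL };
		struct irq_chip plain = { "plain", NULL,                   NULL };

		if (irq_request_resources(&gpio, &d))	/* runs the chip hook */
			return 1;
		if (irq_request_resources(&plain, &d))	/* no hook: succeeds silently */
			return 1;
		irq_release_resources(&gpio, &d);	/* NULL release hook: no-op */
		return 0;
	}

A chip that needs no resources leaves both pointers NULL and pays only a
pointer test; the core never has to special-case individual chips.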