From 57439f878afafefad8836ebf5c49da2a0a746105 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Thu, 24 Jun 2010 13:02:14 +1000 Subject: fs: fix superblock iteration race list_for_each_entry_safe is not suitable to protect against concurrent modification of the list. 6754af6 introduced a race in sb walking. list_for_each_entry can use the trick of pinning the current entry in the list before we drop and retake the lock because it subsequently follows cur->next. However list_for_each_entry_safe saves n=cur->next for following before entering the loop body, so when the lock is dropped, n may be deleted. Signed-off-by: Nick Piggin Cc: Christoph Hellwig Cc: John Stultz Cc: Frank Mayhar Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 5c35bc7a499e..938119ab8dcb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -374,6 +374,8 @@ void sync_supers(void) up_read(&sb->s_umount); spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } } @@ -405,6 +407,8 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) up_read(&sb->s_umount); spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } spin_unlock(&sb_lock); @@ -585,6 +589,8 @@ static void do_emergency_remount(struct work_struct *work) } up_write(&sb->s_umount); spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); __put_super(sb); } spin_unlock(&sb_lock); -- cgit v1.2.3 From 4f331f01b9c43bf001d3ffee578a97a1e0633eac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Jul 2010 15:18:07 -0700 Subject: vfs: don't hold s_umount over close_bdev_exclusive() call Fix an obscure AB-BA deadlock in get_sb_bdev(). When a superblock is mounted more than once get_sb_bdev() calls close_bdev_exclusive() to drop the extra bdev reference while holding s_umount. However, sb->s_umount nests inside bd_mutex during __invalidate_device() and close_bdev_exclusive() acquires bd_mutex during blkdev_put(); thus creating an AB-BA deadlock. This condition doesn't trigger frequently. For this condition to be visible to lockdep, the filesystem must occupy the whole device (as __invalidate_device() only grabs bd_mutex for the whole device), the FS must be mounted more than once and partition rescan should be issued while the FS is still mounted. Fix it by dropping s_umount over close_bdev_exclusive(). Signed-off-by: Tejun Heo Reported-by: Ciprian Docan Cc: Al Viro Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 938119ab8dcb..3479ca6f005f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -773,7 +773,16 @@ int get_sb_bdev(struct file_system_type *fs_type, goto error_bdev; } + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). close_bdev_exclusive() + * acquires bd_mutex and can't be called under + * s_umount. Drop s_umount temporarily. This is safe + * as we're holding an active reference. + */ + up_write(&s->s_umount); close_bdev_exclusive(bdev, mode); + down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; -- cgit v1.2.3 From 7a4dec53897ecd3367efb1e12fe8a4edc47dc0e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Aug 2010 12:05:43 -0400 Subject: Fix sget() race with failing mount If sget() finds a matching superblock being set up, it'll grab an active reference to it and grab s_umount. That's fine - we'll wait for completion of foofs_get_sb() that way. However, if said foofs_get_sb() fails we'll end up holding the halfway-created superblock. deactivate_locked_super() called by foofs_get_sb() will just unlock the sucker since we are holding another active reference to it. What we need is a way to tell if superblock has been successfully set up. Unfortunately, neither ->s_root nor the check for MS_ACTIVE quite fit. Cheap and easy way, suitable for backport: new flag set by the (only) caller of ->get_sb(). If that flag isn't present by the time sget() grabbed s_umount on preexisting superblock it has found, it's seeing a stillborn and should just bury it with deactivate_locked_super() (and repeat the search). Longer term we want to set that flag in ->get_sb() instances (and check for it to distinguish between "sget() found us a live sb" and "sget() has allocated an sb, we need to set it up" in there, instead of checking ->s_root as we do now). Signed-off-by: Al Viro Cc: stable@kernel.org --- fs/namespace.c | 2 +- fs/super.c | 6 ++++++ include/linux/fs.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/super.c') diff --git a/fs/namespace.c b/fs/namespace.c index 88058de59c7c..32dcd24bbc9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1984,7 +1984,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); diff --git a/fs/super.c b/fs/super.c index 3479ca6f005f..bd9eea4bb2bb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -305,8 +305,13 @@ retry: if (s) { up_write(&s->s_umount); destroy_super(s); + s = NULL; } down_write(&old->s_umount); + if (unlikely(!(old->s_flags & MS_BORN))) { + deactivate_locked_super(old); + goto retry; + } return old; } } @@ -918,6 +923,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void goto out_free_secdata; BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); + mnt->mnt_sb->s_flags |= MS_BORN; error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); if (error) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9bedf4219f83..58e4b035e282 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -209,6 +209,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) -- cgit v1.2.3 From dca332528bc69e05f67161e1ed59929633d5e63d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 25 Jul 2010 02:31:46 +0400 Subject: no need for list_for_each_entry_safe()/resetting with superblock list just delay __put_super() a bit Signed-off-by: Al Viro --- fs/dcache.c | 12 +++++++----- fs/super.c | 36 +++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 20 deletions(-) (limited to 'fs/super.c') diff --git a/fs/dcache.c b/fs/dcache.c index caf08574982f..9f2c13417969 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -536,7 +536,7 @@ restart: */ static void prune_dcache(int count) { - struct super_block *sb, *n; + struct super_block *sb, *p = NULL; int w_count; int unused = dentry_stat.nr_unused; int prune_ratio; @@ -550,7 +550,7 @@ static void prune_dcache(int count) else prune_ratio = unused / count; spin_lock(&sb_lock); - list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + list_for_each_entry(sb, &super_blocks, s_list) { if (list_empty(&sb->s_instances)) continue; if (sb->s_nr_dentry_unused == 0) @@ -590,14 +590,16 @@ static void prune_dcache(int count) up_read(&sb->s_umount); } spin_lock(&sb_lock); - /* lock was dropped, must reset next */ - list_safe_reset_next(sb, n, s_list); + if (p) + __put_super(p); count -= pruned; - __put_super(sb); + p = sb; /* more work left to do? */ if (count <= 0) break; } + if (p) + __put_super(p); spin_unlock(&sb_lock); spin_unlock(&dcache_lock); } diff --git a/fs/super.c b/fs/super.c index bd9eea4bb2bb..9674ab2c8718 100644 --- a/fs/super.c +++ b/fs/super.c @@ -363,10 +363,10 @@ EXPORT_SYMBOL(drop_super); */ void sync_supers(void) { - struct super_block *sb, *n; + struct super_block *sb, *p = NULL; spin_lock(&sb_lock); - list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + list_for_each_entry(sb, &super_blocks, s_list) { if (list_empty(&sb->s_instances)) continue; if (sb->s_op->write_super && sb->s_dirt) { @@ -379,11 +379,13 @@ void sync_supers(void) up_read(&sb->s_umount); spin_lock(&sb_lock); - /* lock was dropped, must reset next */ - list_safe_reset_next(sb, n, s_list); - __put_super(sb); + if (p) + __put_super(p); + p = sb; } } + if (p) + __put_super(p); spin_unlock(&sb_lock); } @@ -397,10 +399,10 @@ void sync_supers(void) */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { - struct super_block *sb, *n; + struct super_block *sb, *p = NULL; spin_lock(&sb_lock); - list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + list_for_each_entry(sb, &super_blocks, s_list) { if (list_empty(&sb->s_instances)) continue; sb->s_count++; @@ -412,10 +414,12 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) up_read(&sb->s_umount); spin_lock(&sb_lock); - /* lock was dropped, must reset next */ - list_safe_reset_next(sb, n, s_list); - __put_super(sb); + if (p) + __put_super(p); + p = sb; } + if (p) + __put_super(p); spin_unlock(&sb_lock); } @@ -577,10 +581,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) static void do_emergency_remount(struct work_struct *work) { - struct super_block *sb, *n; + struct super_block *sb, *p = NULL; spin_lock(&sb_lock); - list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + list_for_each_entry(sb, &super_blocks, s_list) { if (list_empty(&sb->s_instances)) continue; sb->s_count++; @@ -594,10 +598,12 @@ static void do_emergency_remount(struct work_struct *work) } up_write(&sb->s_umount); spin_lock(&sb_lock); - /* lock was dropped, must reset next */ - list_safe_reset_next(sb, n, s_list); - __put_super(sb); + if (p) + __put_super(p); + p = sb; } + if (p) + __put_super(p); spin_unlock(&sb_lock); kfree(work); printk("Emergency Remount complete\n"); -- cgit v1.2.3