diff options
author | Mikulas Patocka <mpatocka@redhat.com> | 2012-09-26 07:46:40 +0200 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2012-09-26 07:46:40 +0200 |
commit | b87570f5d349661814b262dd5fc40787700f80d6 (patch) | |
tree | d06ef6c95ed114e19c864ebe0240c788dd75e85c | |
parent | 60ea8226cbd5c8301f9a39edc574ddabcb8150e0 (diff) |
Fix a crash when block device is read and block size is changed at the same time
The kernel may crash when block size is changed and I/O is issued
simultaneously.
Because some subsystems (udev or lvm) may read any block device at any time,
the bug actually puts any code that changes a block device's block size in
jeopardy.
The crash can be reproduced if you place "msleep(1000)" in
blkdev_get_blocks just before "bh->b_size = max_blocks <<
inode->i_blkbits;".
Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct".
While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0".
You get a BUG.
The direct and non-direct I/O code is written with the assumption that the
block size does not change. It doesn't seem practical to fix these crashes
one by one: there may be many crash possibilities when the block size changes
at a certain place, and it is impossible to find them all and verify the
code.
This patch introduces a new rw-lock bd_block_size_semaphore. The lock is
taken for read during I/O. It is taken for write when changing block
size. Consequently, block size can't be changed while I/O is being
submitted.
For asynchronous I/O, the patch only prevents block size change while
the I/O is being submitted. The block size can change when the I/O is in
progress or when the I/O is being finished. This is acceptable because
there are no accesses to block size when asynchronous I/O is being
finished.
The patch also prevents the block size from being changed while the device is
mapped with mmap.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- | drivers/char/raw.c | 2 | ||||
-rw-r--r-- | fs/block_dev.c | 62 | ||||
-rw-r--r-- | include/linux/fs.h | 4 |
3 files changed, 65 insertions, 3 deletions
diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 54a3a6d09819..0bb207eaef2f 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = do_sync_read, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .write = do_sync_write, .aio_write = blkdev_aio_write, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index 38e721b35d45..cdfb625824e2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { + struct address_space *mapping; + /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; + /* Prevent starting I/O or mapping the device */ + down_write(&bdev->bd_block_size_semaphore); + + /* Check that the block device is not memory mapped */ + mapping = bdev->bd_inode->i_mapping; + mutex_lock(&mapping->i_mmap_mutex); + if (!prio_tree_empty(&mapping->i_mmap) || + !list_empty(&mapping->i_mmap_nonlinear)) { + mutex_unlock(&mapping->i_mmap_mutex); + up_write(&bdev->bd_block_size_semaphore); + return -EBUSY; + } + mutex_unlock(&mapping->i_mmap_mutex); + /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } + + up_write(&bdev->bd_block_size_semaphore); + return 0; } @@ -472,6 +491,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. 
*/ mutex_init(&bdev->bd_fsfreeze_mutex); + init_rwsem(&bdev->bd_block_size_semaphore); } static inline void __bd_forget(struct inode *inode) @@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + + down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + + up_read(&bdev->bd_block_size_semaphore); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_read); + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); + + down_read(&bdev->bd_block_size_semaphore); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + + up_read(&bdev->bd_block_size_semaphore); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); +int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + + down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_mmap(file, vma); + + up_read(&bdev->bd_block_size_semaphore); + + return ret; +} + /* * Try to release a page associated with block device when the system * is under memory pressure. 
@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .aio_write = blkdev_aio_write, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/fs.h b/include/linux/fs.h index bd6f6e7ca48e..e60bbd0225d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -725,6 +725,8 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + /* A semaphore that prevents I/O while block size is being changed */ + struct rw_semaphore bd_block_size_semaphore; }; /* @@ -2565,6 +2567,8 @@ extern int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags); /* fs/block_dev.c */ +extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, |