summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 10:13:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 10:13:35 -0700
commit4de13d7aa8f4d02f4dc99d4609575659f92b3c5a (patch)
tree3bc9729eabe79c6164cd29a5d605000bc82bf837 /mm
parent5af43c24ca59a448c9312dd4a4a51d27ec3b9a73 (diff)
parentb8d4a5bf6a049303a29a3275f463f09a490b50ea (diff)
Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
Pull block core updates from Jens Axboe: - Major bit is Kents prep work for immutable bio vecs. - Stable candidate fix for a scheduling-while-atomic in the queue bypass operation. - Fix for the hang on exceeded rq->datalen 32-bit unsigned when merging discard bios. - Tejuns changes to convert the writeback thread pool to the generic workqueue mechanism. - Runtime PM framework, SCSI patches exists on top of these in James' tree. - A few random fixes. * 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits) relay: move remove_buf_file inside relay_close_buf partitions/efi.c: replace useless kzalloc's by kmalloc's fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read() block: fix max discard sectors limit blkcg: fix "scheduling while atomic" in blk_queue_bypass_start Documentation: cfq-iosched: update documentation help for cfq tunables writeback: expose the bdi_wq workqueue writeback: replace custom worker pool implementation with unbound workqueue writeback: remove unused bdi_pending_list aoe: Fix unitialized var usage bio-integrity: Add explicit field for owner of bip_buf block: Add an explicit bio flag for bios that own their bvec block: Add bio_alloc_pages() block: Convert some code to bio_for_each_segment_all() block: Add bio_for_each_segment_all() bounce: Refactor __blk_queue_bounce to not use bi_io_vec raid1: use bio_copy_data() pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage pktcdvd: use bio_copy_data() block: Add bio_copy_data() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c259
-rw-r--r--mm/bounce.c75
-rw-r--r--mm/page_io.c1
3 files changed, 49 insertions, 286 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41733c5dc82..50251749225 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
* locking.
*/
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
{
int err;
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!bdi_wq)
+ return -ENOMEM;
+
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
return wb_has_dirty_io(&bdi->wb);
}
-static void wakeup_timer_fn(unsigned long data)
-{
- struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
- spin_lock_bh(&bdi->wb_lock);
- if (bdi->wb.task) {
- trace_writeback_wake_thread(bdi);
- wake_up_process(bdi->wb.task);
- } else if (bdi->dev) {
- /*
- * When bdi tasks are inactive for long time, they are killed.
- * In this case we have to wake-up the forker thread which
- * should create and run the bdi thread.
- */
- trace_writeback_wake_forker_thread(bdi);
- wake_up_process(default_backing_dev_info.wb.task);
- }
- spin_unlock_bh(&bdi->wb_lock);
-}
-
/*
* This function is used when the first inode for this bdi is marked dirty. It
* wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
- unsigned long interval;
-
- interval = msecs_to_jiffies(dirty_writeback_interval * 10);
- return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
- struct bdi_writeback *me = ptr;
-
- current->flags |= PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(current, 0);
-
- for (;;) {
- struct task_struct *task = NULL;
- struct backing_dev_info *bdi;
- enum {
- NO_ACTION, /* Nothing to do */
- FORK_THREAD, /* Fork bdi thread */
- KILL_THREAD, /* Kill inactive bdi thread */
- } action = NO_ACTION;
-
- /*
- * Temporary measure, we want to make sure we don't see
- * dirty data on the default backing_dev_info
- */
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
- del_timer(&me->wakeup_timer);
- wb_do_writeback(me, 0);
- }
-
- spin_lock_bh(&bdi_lock);
- /*
- * In the following loop we are going to check whether we have
- * some work to do without any synchronization with tasks
- * waking us up to do work for them. Set the task state here
- * so that we don't miss wakeups after verifying conditions.
- */
- set_current_state(TASK_INTERRUPTIBLE);
-
- list_for_each_entry(bdi, &bdi_list, bdi_list) {
- bool have_dirty_io;
-
- if (!bdi_cap_writeback_dirty(bdi) ||
- bdi_cap_flush_forker(bdi))
- continue;
-
- WARN(!test_bit(BDI_registered, &bdi->state),
- "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
- have_dirty_io = !list_empty(&bdi->work_list) ||
- wb_has_dirty_io(&bdi->wb);
-
- /*
- * If the bdi has work to do, but the thread does not
- * exist - create it.
- */
- if (!bdi->wb.task && have_dirty_io) {
- /*
- * Set the pending bit - if someone will try to
- * unregister this bdi - it'll wait on this bit.
- */
- set_bit(BDI_pending, &bdi->state);
- action = FORK_THREAD;
- break;
- }
-
- spin_lock(&bdi->wb_lock);
-
- /*
- * If there is no work to do and the bdi thread was
- * inactive long enough - kill it. The wb_lock is taken
- * to make sure no-one adds more work to this bdi and
- * wakes the bdi thread up.
- */
- if (bdi->wb.task && !have_dirty_io &&
- time_after(jiffies, bdi->wb.last_active +
- bdi_longest_inactive())) {
- task = bdi->wb.task;
- bdi->wb.task = NULL;
- spin_unlock(&bdi->wb_lock);
- set_bit(BDI_pending, &bdi->state);
- action = KILL_THREAD;
- break;
- }
- spin_unlock(&bdi->wb_lock);
- }
- spin_unlock_bh(&bdi_lock);
-
- /* Keep working if default bdi still has things to do */
- if (!list_empty(&me->bdi->work_list))
- __set_current_state(TASK_RUNNING);
-
- switch (action) {
- case FORK_THREAD:
- __set_current_state(TASK_RUNNING);
- task = kthread_create(bdi_writeback_thread, &bdi->wb,
- "flush-%s", dev_name(bdi->dev));
- if (IS_ERR(task)) {
- /*
- * If thread creation fails, force writeout of
- * the bdi from the thread. Hopefully 1024 is
- * large enough for efficient IO.
- */
- writeback_inodes_wb(&bdi->wb, 1024,
- WB_REASON_FORKER_THREAD);
- } else {
- /*
- * The spinlock makes sure we do not lose
- * wake-ups when racing with 'bdi_queue_work()'.
- * And as soon as the bdi thread is visible, we
- * can start it.
- */
- spin_lock_bh(&bdi->wb_lock);
- bdi->wb.task = task;
- spin_unlock_bh(&bdi->wb_lock);
- wake_up_process(task);
- }
- bdi_clear_pending(bdi);
- break;
-
- case KILL_THREAD:
- __set_current_state(TASK_RUNNING);
- kthread_stop(task);
- bdi_clear_pending(bdi);
- break;
-
- case NO_ACTION:
- if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
- /*
- * There are no dirty data. The only thing we
- * should now care about is checking for
- * inactive bdi threads and killing them. Thus,
- * let's sleep for longer time, save energy and
- * be friendly for battery-driven devices.
- */
- schedule_timeout(bdi_longest_inactive());
- else
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
- try_to_freeze();
- break;
- }
- }
-
- return 0;
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
}
/*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
spin_unlock_bh(&bdi_lock);
synchronize_rcu_expedited();
+
+ /* bdi_list is now unused, clear it to mark @bdi dying */
+ INIT_LIST_HEAD(&bdi->bdi_list);
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
bdi->dev = dev;
- /*
- * Just start the forker thread for our default backing_dev_info,
- * and add other bdi's to the list. They will get a thread created
- * on-demand when they need it.
- */
- if (bdi_cap_flush_forker(bdi)) {
- struct bdi_writeback *wb = &bdi->wb;
-
- wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
- dev_name(dev));
- if (IS_ERR(wb->task))
- return PTR_ERR(wb->task);
- }
-
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct task_struct *task;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
bdi_remove_from_list(bdi);
/*
- * If setup is pending, wait for that to complete first
+ * Drain work list and shutdown the delayed_work. At this point,
+ * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
+ * is dying and its work_list needs to be drained no matter what.
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ flush_delayed_work(&bdi->wb.dwork);
+ WARN_ON(!list_empty(&bdi->work_list));
/*
- * Finally, kill the kernel thread. We don't need to be RCU
- * safe anymore, since the bdi is gone from visibility.
+ * This shouldn't be necessary unless @bdi for some reason has
+ * unflushed dirty IO after work_list is drained. Do it anyway
+ * just in case.
*/
- spin_lock_bh(&bdi->wb_lock);
- task = bdi->wb.task;
- bdi->wb.task = NULL;
- spin_unlock_bh(&bdi->wb_lock);
-
- if (task)
- kthread_stop(task);
+ cancel_delayed_work_sync(&bdi->wb.dwork);
}
/*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
- del_timer_sync(&bdi->wb.wakeup_timer);
- if (!bdi_cap_flush_forker(bdi))
- bdi_wb_shutdown(bdi);
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
spin_lock_init(&wb->list_lock);
- setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+ INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
/*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
/*
- * If bdi_unregister() had already been called earlier, the
- * wakeup_timer could still be armed because bdi_prune_sb()
- * can race with the bdi_wakeup_thread_delayed() calls from
- * __mark_inode_dirty().
+ * If bdi_unregister() had already been called earlier, the dwork
+ * could still be pending because bdi_prune_sb() can race with the
+ * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
*/
- del_timer_sync(&bdi->wb.wakeup_timer);
+ cancel_delayed_work_sync(&bdi->wb.dwork);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/bounce.c b/mm/bounce.c
index a5c2ec3589c..c9f0a4339a7 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
struct bio_vec *tovec, *fromvec;
int i;
- __bio_for_each_segment(tovec, to, i, 0) {
+ bio_for_each_segment(tovec, to, i) {
fromvec = from->bi_io_vec + i;
/*
@@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
/*
* free up bounce indirect pages used
*/
- __bio_for_each_segment(bvec, bio, i, 0) {
+ bio_for_each_segment_all(bvec, bio, i) {
org_vec = bio_orig->bi_io_vec + i;
if (bvec->bv_page == org_vec->bv_page)
continue;
@@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
mempool_t *pool, int force)
{
- struct page *page;
- struct bio *bio = NULL;
- int i, rw = bio_data_dir(*bio_orig);
+ struct bio *bio;
+ int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, *from;
+ unsigned i;
- bio_for_each_segment(from, *bio_orig, i) {
- page = from->bv_page;
+ bio_for_each_segment(from, *bio_orig, i)
+ if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+ goto bounce;
- /*
- * is destination page below bounce pfn?
- */
- if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
- continue;
-
- /*
- * irk, bounce it
- */
- if (!bio) {
- unsigned int cnt = (*bio_orig)->bi_vcnt;
+ return;
+bounce:
+ bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
- bio = bio_alloc(GFP_NOIO, cnt);
- memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
- }
-
+ bio_for_each_segment_all(to, bio, i) {
+ struct page *page = to->bv_page;
- to = bio->bi_io_vec + i;
+ if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+ continue;
- to->bv_page = mempool_alloc(pool, q->bounce_gfp);
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ to->bv_page = mempool_alloc(pool, q->bounce_gfp);
if (rw == WRITE) {
char *vto, *vfrom;
- flush_dcache_page(from->bv_page);
+ flush_dcache_page(page);
+
vto = page_address(to->bv_page) + to->bv_offset;
- vfrom = kmap(from->bv_page) + from->bv_offset;
+ vfrom = kmap_atomic(page) + to->bv_offset;
memcpy(vto, vfrom, to->bv_len);
- kunmap(from->bv_page);
+ kunmap_atomic(vfrom);
}
}
- /*
- * no pages bounced
- */
- if (!bio)
- return;
-
trace_block_bio_bounce(q, *bio_orig);
- /*
- * at least one page was bounced, fill in possible non-highmem
- * pages
- */
- __bio_for_each_segment(from, *bio_orig, i, 0) {
- to = bio_iovec_idx(bio, i);
- if (!to->bv_page) {
- to->bv_page = from->bv_page;
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
- }
- }
-
- bio->bi_bdev = (*bio_orig)->bi_bdev;
bio->bi_flags |= (1 << BIO_BOUNCED);
- bio->bi_sector = (*bio_orig)->bi_sector;
- bio->bi_rw = (*bio_orig)->bi_rw;
-
- bio->bi_vcnt = (*bio_orig)->bi_vcnt;
- bio->bi_idx = (*bio_orig)->bi_idx;
- bio->bi_size = (*bio_orig)->bi_size;
if (pool == page_pool) {
bio->bi_end_io = bounce_end_io_write;
diff --git a/mm/page_io.c b/mm/page_io.c
index 06a8842a6ec..a8a3ef45fed 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -36,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
bio->bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}