diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-07-31 15:42:53 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-09-25 11:04:05 -0400 |
commit | 61b4944018449003ac5f9757f4d125dce519cf51 (patch) | |
tree | 553855996c641a945344db870b6dfd0d2d02086e | |
parent | 37d1aeee3990385e9bb436c50c2f7e120a668df6 (diff) |
Btrfs: Fix streaming read performance with checksumming on
Large streaming reads make for large bios, which means each entry on the
list async work queues represents a large amount of data. IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.
The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.
This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of a per-page. This results in many
extra btree lookups on large streaming reads. Doing the checksum lookup
right before bio submit allows us to reuse searches while processing
adjacent offsets.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | fs/btrfs/async-thread.c | 2 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 2 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 15 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 77 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 57 |
5 files changed, 99 insertions, 54 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fe6a0d532ed..bc2980c433ef 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max) INIT_LIST_HEAD(&workers->idle_list); spin_lock_init(&workers->lock); workers->max_workers = max; - workers->idle_thresh = 64; + workers->idle_thresh = 32; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index be16cd49ef69..d788ab0dcd96 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key *location, int mod); /* file-item.c */ +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 disk_offset, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e826730d750f..d2d1cc87e8ad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb, */ btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + btrfs_init_workers(&fs_info->fixup_workers, 1); btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); btrfs_init_workers(&fs_info->endio_write_workers, fs_info->thread_pool_size); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 4; + btrfs_start_workers(&fs_info->workers, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 2311061f070e..a5ff19b34b21 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u32 diff; + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + while(bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, &sum); + if (ret == 0) + goto found; + + if (!item || offset < item_start_offset || + offset >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root, path, + inode->i_ino, offset, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + printk("no csum found for inode %lu start " + "%llu\n", inode->i_ino, + (unsigned long long)offset); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / BTRFS_CRC32_SIZE) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = offset - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * BTRFS_CRC32_SIZE; + + read_extent_buffer(path->nodes[0], &sum, + (unsigned long)item + diff, + BTRFS_CRC32_SIZE); +found: + set_state_private(io_tree, offset, sum); + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4afa9d78da9..31d52c51acc3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, BUG_ON(ret); if (!(rw & (1 << BIO_RW))) { + if (!btrfs_test_opt(root, NODATASUM) && + !btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, bio); + } goto mapit; } @@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return btrfs_finish_ordered_io(page->mapping->host, start, end); } -int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) -{ - int ret = 0; - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_csum_item *item; - struct btrfs_path *path = NULL; - u32 csum; - - if (btrfs_test_opt(root, NODATASUM) || - btrfs_test_flag(inode, NODATASUM)) - return 0; - - /* - * It is possible there is an ordered extent that has - * not yet finished for this range in the file. If so, - * that extent will have a csum cached, and it will insert - * the sum after all the blocks in the extent are fully - * on disk. So, look for an ordered extent and use the - * sum if found. We have to do this before looking in the - * btree because csum items are pre-inserted based on - * the file size. btrfs_lookup_csum might find an item - * that still hasn't been fully filled. - */ - ret = btrfs_find_ordered_sum(inode, start, &csum); - if (ret == 0) - goto found; - - ret = 0; - path = btrfs_alloc_path(); - item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0); - if (IS_ERR(item)) { - ret = PTR_ERR(item); - /* a csum that isn't present is a preallocated region. */ - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - csum = 0; - printk("no csum found for inode %lu start %Lu\n", inode->i_ino, - start); - goto out; - } - read_extent_buffer(path->nodes[0], &csum, (unsigned long)item, - BTRFS_CRC32_SIZE); -found: - set_state_private(io_tree, start, csum); -out: - if (path) - btrfs_free_path(path); - return ret; -} - struct io_failure_record { struct page *page; u64 start; @@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, - .readpage_io_hook = btrfs_readpage_io_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, |