diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-07-16 13:48:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-07-16 13:48:55 -0700 |
commit | 972a278fe60c361eb8f37619f562f092e8786d7c (patch) | |
tree | f80ef197b2adcab1bf504b255ddf46398e13f39f /fs | |
parent | c5fe7a97f20c7f3070ac870144515c0fabc6b999 (diff) | |
parent | 088aea3b97e0ae5a2a86f5d142ad10fec8a1b80f (diff) |
Merge tag 'for-5.19-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs reverts from David Sterba:
"Due to a recent report [1] we need to revert the radix tree to xarray
conversion patches.
There's a problem with sleeping under spinlock, when xa_insert could
allocate memory under pressure. We use GFP_NOFS so this is a real
problem that we unfortunately did not discover during review.
I'm sorry to do such change at rc6 time but the revert is IMO the
safer option, there are patches to use mutex instead of the spin locks
but that would need more testing. The revert branch has been tested on
a few setups, all seem ok.
The conversion to xarray will be revisited in the future"
Link: https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/ [1]
* tag 'for-5.19-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
Revert "btrfs: turn delayed_nodes_tree into an XArray"
Revert "btrfs: turn name_cache radix tree into XArray in send_ctx"
Revert "btrfs: turn fs_info member buffer_radix into XArray"
Revert "btrfs: turn fs_roots_radix in btrfs_fs_info into an XArray"
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.h | 18 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 84 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 179 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 2 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 122 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 15 | ||||
-rw-r--r-- | fs/btrfs/send.c | 40 | ||||
-rw-r--r-- | fs/btrfs/tests/btrfs-tests.c | 24 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 112 |
9 files changed, 340 insertions, 256 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 415bf1823fb3..9c21e214d29e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,9 +675,8 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - /* The xarray that holds all the FS roots */ - spinlock_t fs_roots_lock; - struct xarray fs_roots; + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; /* block group cache stuff */ rwlock_t block_group_cache_lock; @@ -995,10 +994,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer xarray */ + /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct xarray extent_buffers; + struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; @@ -1119,8 +1118,7 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - /* The root is tracked in fs_info::fs_roots */ - BTRFS_ROOT_REGISTERED, + BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1224,10 +1222,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by inode_lock + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock */ - struct xarray delayed_nodes; + struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 66779ab3ed4a..748bf6b0d860 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = xa_load(&root->delayed_nodes, ino); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the xarray. In this case, the refcount + * this node from the radix tree. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the xarray at all; our release + * NULL like it was never in the radix at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the xarray, we want to bump the + * If this node is properly in the radix, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; - do { - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed */ - refcount_set(&node->refs, 2); + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); - spin_lock(&root->inode_lock); - ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); - if (ret) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - if (ret != -EBUSY) - return ERR_PTR(ret); - } - } while (ret); + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); + radix_tree_preload_end(); return node; } @@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - unsigned long index = 0; - struct btrfs_delayed_node *delayed_node; + u64 inode_id = 0; struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; while (1) { - int n = 0; - spin_lock(&root->inode_lock); - if (xa_empty(&root->delayed_nodes)) { + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { spin_unlock(&root->inode_lock); - return; + break; } - xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (refcount_inc_not_zero(&delayed_node->refs)) { - delayed_nodes[n] = delayed_node; - n++; - } - if (n >= ARRAY_SIZE(delayed_nodes)) - break; + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; } - index++; spin_unlock(&root->inode_lock); - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ba005c41983..de440ebf5648 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -485,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from extent_buffers */ + /* A dirty eb shouldn't disappear from buffer_radix */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -1158,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1210,9 +1211,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); #endif } @@ -1659,11 +1660,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_lock); - root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1705,14 +1707,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - spin_lock(&fs_info->fs_roots_lock); - ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, - root, GFP_NOFS); + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_REGISTERED, &root->state); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); return ret; } @@ -2342,9 +2350,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_lock); + spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_lock); + spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } @@ -2352,21 +2360,28 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root; - unsigned long index = 0; + int ret; + struct btrfs_root *gang[8]; + int i; while (!list_empty(&fs_info->dead_roots)) { - root = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&root->root_list); + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) - btrfs_drop_and_free_fs_root(fs_info, root); - btrfs_put_root(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); } - xa_for_each(&fs_info->fs_roots, index, root) { - btrfs_drop_and_free_fs_root(fs_info, root); + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -3134,8 +3149,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); - xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3143,7 +3158,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3374,7 +3389,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots. This must be done before + * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -4499,11 +4514,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_lock); - xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4519,48 +4535,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *roots[8]; - unsigned long index = 0; - int i; + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; int err = 0; - int grabbed; + unsigned int ret = 0; while (1) { - struct btrfs_root *root; - - spin_lock(&fs_info->fs_roots_lock); - if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { - spin_unlock(&fs_info->fs_roots_lock); - return err; + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; } + root_objectid = gang[ret - 1]->root_key.objectid + 1; - grabbed = 0; - xa_for_each_start(&fs_info->fs_roots, index, root, index) { - /* Avoid grabbing roots in dead_roots */ - if (btrfs_root_refs(&root->root_item) > 0) - roots[grabbed++] = btrfs_grab_root(root); - if (grabbed >= ARRAY_SIZE(roots)) - break; + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_root(gang[i]); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - err = btrfs_orphan_cleanup(roots[i]); + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); if (err) - goto out; - btrfs_put_root(roots[i]); + break; + btrfs_put_root(gang[i]); } - index++; + root_objectid++; } -out: - /* Release the roots that remain uncleaned due to error */ - for (; i < grabbed; i++) { - if (roots[i]) - btrfs_put_root(roots[i]); + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); } return err; } @@ -4879,28 +4897,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - unsigned long index = 0; - int grabbed = 0; - struct btrfs_root *roots[8]; + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; - spin_lock(&fs_info->fs_roots_lock); - while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, - ULONG_MAX, 8, XA_PRESENT))) { - for (int i = 0; i < grabbed; i++) - roots[i] = btrfs_grab_root(roots[i]); - spin_unlock(&fs_info->fs_roots_lock); + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (int i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - btrfs_free_log(NULL, roots[i]); - btrfs_put_root(roots[i]); + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); } - index++; - spin_lock(&fs_info->fs_roots_lock); + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4157ecc27d4b..a3afc15430ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5829,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 04e36343da3a..f03ab5dbda7a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a given bytenr. + * Find extent buffer for a givne bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage( return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup extent buffer xarray */ - eb = xa_load(&fs_info->extent_buffers, - bytenr >> fs_info->sectorsize_bits); + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); ASSERT(eb); return eb; } @@ -4435,8 +4437,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = xa_load(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -6129,22 +6131,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6319,22 +6323,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6379,8 +6386,10 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - xa_erase(&fs_info->extent_buffers, - eb->start >> fs_info->sectorsize_bits); + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } @@ -7325,25 +7334,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *eb; - unsigned long index; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; u64 page_start = page_offset(page); + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - xa_for_each_start(&fs_info->extent_buffers, index, eb, - page_start >> fs_info->sectorsize_bits) { - if (in_range(eb->start, page_start, PAGE_SIZE)) - return eb; - else if (eb->start >= page_start + PAGE_SIZE) - /* Already beyond page end */ - return NULL; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } - return NULL; +out: + return found; } static int try_release_subpage_extent_buffer(struct page *page) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0172f75e051a..d50448bf8eed 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3578,7 +3578,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3661,17 +3660,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3911,7 +3910,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa56890ff81f..c7dea639a56f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> +#include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> @@ -127,7 +128,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct xarray name_cache; + struct radix_tree_root name_cache; struct list_head name_cache_list; int name_cache_size; @@ -268,13 +269,14 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * On 32bit kernels, xarray has only 32bit indices, but we need to - * handle 64bit inums. We use the lower 32bit of the 64bit inum to store - * it in the tree. If more than one inum would fall into the same entry, - * we use inum_aliases to store the additional entries. inum_aliases is - * also used to store entries with the same inum but different generations. + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. */ - struct list_head inum_aliases; + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -2024,9 +2026,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::inum_aliases. + * takes care of this with the help of name_cache_entry::radix_list. * In case of error, nce is kfreed. */ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,7 +2037,8 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2044,14 +2047,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->inum_aliases, nce_head); + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2063,14 +2066,15 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->inum_aliases); + list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; @@ -2078,7 +2082,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - xa_erase(&sctx->name_cache, (unsigned long)nce->ino); + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2089,11 +2093,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, inum_aliases) { + list_for_each_entry(cur, nce_head, radix_list) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -7518,7 +7522,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - xa_init_flags(&sctx->name_cache, GFP_KERNEL); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1591bfa55bcc..d8e56edd6991 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - unsigned long index; - struct extent_buffer *eb; + struct radix_tree_iter iter; + void **slot; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - xa_for_each(&fs_info->extent_buffers, index, eb) { + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); } + spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -186,7 +202,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 06c0a958d114..875b801ab3d7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,7 +23,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 +#define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions @@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - xa_set_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to @@ -487,9 +487,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1402,8 +1404,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root; - unsigned long index; + struct btrfs_root *gang[8]; + int i; + int ret; /* * At this point no one can be using this transaction to modify any tree @@ -1411,46 +1414,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_lock); - xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { - int ret; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); - - btrfs_free_log(trans, root); - ret = btrfs_update_reloc_root(trans, root); - if (ret) - return ret; - - /* See comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, root->node); + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - - ret = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, &root->root_item); - if (ret) - return ret; - spin_lock(&fs_info->fs_roots_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } |