summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-03-18 13:38:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-03-18 13:38:42 -0700
commit81aa0968b7ea6dbabcdcda37dc8434dca6e1565b (patch)
tree9cbc9fc23517c3dd6b6f89c0b0557edbb438b8b3
parentdc0337999d87a5e749ef1ac0bcc1a06d2a3f9ec0 (diff)
parent485df75554257e883d0ce39bb886e8212349748e (diff)
Merge tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "There are still regressions being found and fixed in the zoned mode and subpage code, the rest are fixes for bugs reported by users. Regressions: - subpage block support: - readahead works on the proper block size - fix last page zeroing - zoned mode: - linked list corruption for tree log Fixes: - qgroup leak after falloc failure - tree mod log and backref resolving: - extent buffer cloning race when resolving backrefs - pin deleted leaves with active tree mod log users - drop debugging flag from slab cache" * tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: always pin deleted leaves when there are active tree mod log users btrfs: fix race when cloning extent buffer during rewind of an old root btrfs: fix slab cache flags for free space tree bitmap btrfs: subpage: make readahead work properly btrfs: subpage: fix wild pointer access during metadata read failure btrfs: zoned: fix linked list corruption after log root tree allocation failure btrfs: fix qgroup data rsv leak caused by falloc failure btrfs: track qgroup released data in own variable in insert_prealloc_file_extent btrfs: fix wrong offset to zero out range beyond i_size
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/extent-tree.c23
-rw-r--r--fs/btrfs/extent_io.c33
-rw-r--r--fs/btrfs/inode.c37
-rw-r--r--fs/btrfs/reada.c35
-rw-r--r--fs/btrfs/tree-log.c8
6 files changed, 103 insertions, 35 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d56730a67885..34b929bd5c1a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
"failed to read tree block %llu from get_old_root",
logical);
} else {
+ btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old);
+ btrfs_tree_read_unlock(old);
free_extent_buffer(old);
}
} else if (old_root) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 78ad31a59e59..36a3c973fda1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group *cache;
+ bool must_pin = false;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (btrfs_is_zoned(fs_info)) {
+ /*
+ * If this is a leaf and there are tree mod log users, we may
+ * have recorded mod log operations that point to this leaf.
+ * So we must make sure no one reuses this leaf's extent before
+ * mod log operations are applied to a node, otherwise after
+ * rewinding a node using the mod log operations we get an
+ * inconsistent btree, as the leaf's extent may now be used as
+ * a node or leaf for another different btree.
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins, it will not be able to
+ * find a node pointing to this leaf and record operations that
+ * point to this leaf.
+ */
+ if (btrfs_header_level(buf) == 0) {
+ read_lock(&fs_info->tree_mod_log_lock);
+ must_pin = !list_empty(&fs_info->tree_mod_seq_list);
+ read_unlock(&fs_info->tree_mod_log_lock);
+ }
+
+ if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);
pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 191e358f1322..910769d5fcdb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2886,6 +2886,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
}
/*
+ * Find extent buffer for a givne bytenr.
+ *
+ * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_readpage(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *eb;
+
+ /*
+ * For regular sectorsize, we can use page->private to grab extent
+ * buffer
+ */
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ ASSERT(PagePrivate(page) && page->private);
+ return (struct extent_buffer *)page->private;
+ }
+
+ /* For subpage case, we need to lookup buffer radix tree */
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ bytenr >> fs_info->sectorsize_bits);
+ rcu_read_unlock();
+ ASSERT(eb);
+ return eb;
+}
+
+/*
* after a readpage IO is done, we need to:
* clear the uptodate bits on error
* set the uptodate bits if things worked
@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
} else {
struct extent_buffer *eb;
- eb = (struct extent_buffer *)page->private;
+ eb = find_extent_buffer_readpage(fs_info, page, start);
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
@@ -3020,7 +3049,7 @@ readpage_ok:
*/
if (page->index == end_index && i_size <= end) {
u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(end));
+ offset_in_page(start));
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 35bfa0533f23..7cdf65be3707 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void)
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_RED_ZONE, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep)
goto fail;
@@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
+ int qgroup_released;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- ret = btrfs_qgroup_release_data(inode, file_offset, len);
- if (ret < 0)
- return ERR_PTR(ret);
+ qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
+ if (qgroup_released < 0)
+ return ERR_PTR(qgroup_released);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
- true, ret);
+ true, qgroup_released);
if (ret)
- return ERR_PTR(ret);
+ goto free_qgroup;
return trans;
}
@@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
- extent_info.qgroup_reserved = ret;
+ extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
path = btrfs_alloc_path();
- if (!path)
- return ERR_PTR(-ENOMEM);
+ if (!path) {
+ ret = -ENOMEM;
+ goto free_qgroup;
+ }
ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
if (ret)
- return ERR_PTR(ret);
-
+ goto free_qgroup;
return trans;
+
+free_qgroup:
+ /*
+ * We have released qgroup data range at the beginning of the function,
+ * and normally qgroup_released bytes will be freed when committing
+ * transaction.
+ * But if we error out early, we have to free what we have released
+ * or we leak qgroup data reservation.
+ */
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid, qgroup_released,
+ BTRFS_QGROUP_RSV_DATA);
+ return ERR_PTR(ret);
}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 20fd4aa48a8c..06713a8fe26b 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
/* find extent */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
if (re)
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
zone = NULL;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_SHIFT, 1);
+ logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end) {
kref_get(&zone->refcnt);
spin_unlock(&fs_info->reada_lock);
@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> PAGE_SHIFT),
- zone);
+ (unsigned long)(zone->end >> fs_info->sectorsize_bits),
+ zone);
if (ret == -EEXIST) {
kfree(zone);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_SHIFT, 1);
+ logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end)
kref_get(&zone->refcnt);
else
@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 length;
int real_stripes;
int nzones = 0;
- unsigned long index = logical >> PAGE_SHIFT;
+ unsigned long index = logical >> fs_info->sectorsize_bits;
int dev_replace_is_ongoing;
int have_zone = 0;
@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
int i;
- unsigned long index = re->logical >> PAGE_SHIFT;
+ unsigned long index = re->logical >> fs_info->sectorsize_bits;
spin_lock(&fs_info->reada_lock);
if (--re->refcnt) {
@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
static void reada_zone_release(struct kref *kref)
{
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+ struct btrfs_fs_info *fs_info = zone->device->fs_info;
- lockdep_assert_held(&zone->device->fs_info->reada_lock);
+ lockdep_assert_held(&fs_info->reada_lock);
radix_tree_delete(&zone->device->reada_zones,
- zone->end >> PAGE_SHIFT);
+ zone->end >> fs_info->sectorsize_bits);
kfree(zone);
}
@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
int i;
- unsigned long index = zone->end >> PAGE_SHIFT;
+ unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer;
@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
(void **)&zone, index, 1);
if (ret == 0)
break;
- index = (zone->end >> PAGE_SHIFT) + 1;
+ index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
if (zone->locked) {
if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems;
@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
* plugging to speed things up
*/
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_SHIFT, 1);
+ dev->reada_next >> fs_info->sectorsize_bits, 1);
if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev);
if (!ret) {
@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
}
re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_SHIFT, 1);
+ dev->reada_next >> fs_info->sectorsize_bits, 1);
}
if (ret == 0) {
spin_unlock(&fs_info->reada_lock);
@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
pr_cont(" curr off %llu",
device->reada_next - zone->start);
pr_cont("\n");
- index = (zone->end >> PAGE_SHIFT) + 1;
+ index = (zone->end >> fs_info->sectorsize_bits) + 1;
}
cnt = 0;
index = 0;
@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
pr_cont("\n");
- index = (re->logical >> PAGE_SHIFT) + 1;
+ index = (re->logical >> fs_info->sectorsize_bits) + 1;
if (++cnt > 15)
break;
}
@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0)
break;
if (!re->scheduled) {
- index = (re->logical >> PAGE_SHIFT) + 1;
+ index = (re->logical >> fs_info->sectorsize_bits) + 1;
continue;
}
pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
pr_cont("\n");
- index = (re->logical >> PAGE_SHIFT) + 1;
+ index = (re->logical >> fs_info->sectorsize_bits) + 1;
}
spin_unlock(&fs_info->reada_lock);
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2f1acc9aea9e..92a368627791 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
- index2 = log_root_tree->log_transid % 2;
- list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
- root_log_ctx.log_transid = log_root_tree->log_transid;
-
if (btrfs_is_zoned(fs_info)) {
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
@@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
}
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit