summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-29 09:17:44 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-29 09:17:44 -0700
commit9f9a53472452b83d44d5e1d77b6dea6eaa043204 (patch)
treee95e3b7e4e2f715f4b01d30f91602a2e208b8d45
parentd37421e655cec032084bba2601e46ea61e6f9044 (diff)
parent3a5895e3ac2bb4b252a4e816575eeec6ac3deeec (diff)
Merge tag 'bcachefs-2024-09-28' of git://evilpiepirate.org/bcachefs
Pull more bcachefs updates from Kent Overstreet: "Assorted minor syzbot fixes, and for bigger stuff: Fix two disk accounting rewrite bugs: - Disk accounting keys use the version field of bkey so that journal replay can tell which updates have been applied to the btree. This is set in the transaction commit path, after we've gotten our journal reservation (and our time ordering), but the BCH_TRANS_COMMIT_skip_accounting_apply flag that journal replay uses was incorrectly skipping this for new updates generated prior to journal replay. This fixes the underlying cause of an assertion pop in disk_accounting_read. - A couple of fixes for disk accounting + device removal. Checking if accounting replicas entries were marked in the superblock was being done at the wrong point, when deltas in the journal could still zero them out, and then additionally we'd try to add a missing replicas entry to the superblock without checking if it referred to an invalid (removed) device. A whole slew of repair fixes: - fix infinite loop in propagate_key_to_snapshot_leaves(), this fixes an infinite loop when repairing a filesystem with many snapshots - fix incorrect transaction restart handling leading to occasional "fsck counted ..." 
warnings - fix warning in __bch2_fsck_err() for bkey fsck errors - check_inode() in fsck now correctly checks if the filesystem was clean - there shouldn't be pending logged ops if the fs was clean, we now check for this - remove_backpointer() doesn't remove a dirent that doesn't actually point to the inode - many more fsck errors are AUTOFIX" * tag 'bcachefs-2024-09-28' of git://evilpiepirate.org/bcachefs: (35 commits) bcachefs: check_subvol_path() now prints subvol root inode bcachefs: remove_backpointer() now checks if dirent points to inode bcachefs: dirent_points_to_inode() now warns on mismatch bcachefs: Fix lost wake up bcachefs: Check for logged ops when clean bcachefs: BCH_FS_clean_recovery bcachefs: Convert disk accounting BUG_ON() to WARN_ON() bcachefs: Fix BCH_TRANS_COMMIT_skip_accounting_apply bcachefs: Check for accounting keys with bversion=0 bcachefs: rename version -> bversion bcachefs: Don't delete unlinked inodes before logged op resume bcachefs: Fix BCH_SB_ERRS() so we can reorder bcachefs: Fix fsck warnings from bkey validation bcachefs: Move transaction commit path validation to as late as possible bcachefs: Fix disk accounting attempting to mark invalid replicas entry bcachefs: Fix unlocked access to c->disk_sb.sb in bch2_replicas_entry_validate() bcachefs: Fix accounting read + device removal bcachefs: bch_accounting_mode bcachefs: fix transaction restart handling in check_extents(), check_dirents() bcachefs: kill inode_walker_entry.seen_this_pos ...
-rw-r--r--fs/bcachefs/backpointers.c2
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/bcachefs_format.h6
-rw-r--r--fs/bcachefs/bkey.h8
-rw-r--r--fs/bcachefs/bkey_methods.c2
-rw-r--r--fs/bcachefs/bkey_methods.h2
-rw-r--r--fs/bcachefs/btree_gc.c8
-rw-r--r--fs/bcachefs/btree_io.c6
-rw-r--r--fs/bcachefs/btree_node_scan.c2
-rw-r--r--fs/bcachefs/btree_trans_commit.c108
-rw-r--r--fs/bcachefs/btree_update.h3
-rw-r--r--fs/bcachefs/data_update.c2
-rw-r--r--fs/bcachefs/disk_accounting.c82
-rw-r--r--fs/bcachefs/disk_accounting.h29
-rw-r--r--fs/bcachefs/disk_accounting_types.h2
-rw-r--r--fs/bcachefs/error.c14
-rw-r--r--fs/bcachefs/error.h2
-rw-r--r--fs/bcachefs/fsck.c295
-rw-r--r--fs/bcachefs/inode.c12
-rw-r--r--fs/bcachefs/inode.h1
-rw-r--r--fs/bcachefs/io_read.c4
-rw-r--r--fs/bcachefs/io_write.c4
-rw-r--r--fs/bcachefs/journal_io.c2
-rw-r--r--fs/bcachefs/logged_ops.c13
-rw-r--r--fs/bcachefs/recovery.c7
-rw-r--r--fs/bcachefs/recovery_passes_types.h2
-rw-r--r--fs/bcachefs/reflink.c2
-rw-r--r--fs/bcachefs/replicas.c18
-rw-r--r--fs/bcachefs/replicas.h2
-rw-r--r--fs/bcachefs/sb-clean.c1
-rw-r--r--fs/bcachefs/sb-downgrade.c9
-rw-r--r--fs/bcachefs/sb-errors.c6
-rw-r--r--fs/bcachefs/sb-errors.h2
-rw-r--r--fs/bcachefs/sb-errors_format.h39
-rw-r--r--fs/bcachefs/six.c12
-rw-r--r--fs/bcachefs/snapshot.c3
-rw-r--r--fs/bcachefs/subvolume.c54
-rw-r--r--fs/bcachefs/super-io.c7
-rw-r--r--fs/bcachefs/tests.c2
39 files changed, 469 insertions, 309 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index e11989a57ca0..47455a85c909 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -501,7 +501,7 @@ found:
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
bch2_bkey_val_to_text(&buf, c, extent2);
- struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
trans, dup_backpointer_to_bad_csum_extent,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index c711d4c27a03..f4151ee51b03 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -594,6 +594,7 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \
+ x(clean_recovery) \
x(btree_running) \
x(accounting_replay_done) \
x(may_go_rw) \
@@ -776,7 +777,7 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 8c4addddd07e..203ee627cab5 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -217,7 +217,7 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
- struct bversion version;
+ struct bversion bversion;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@@ -328,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, version.hi), \
- bkey_format_field(VERSION_LO, version.lo), \
+ bkey_format_field(VERSION_HI, bversion.hi), \
+ bkey_format_field(VERSION_LO, bversion.lo), \
}, \
})
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index e34cb2bf329c..41df24a53d97 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-static __always_inline int bversion_zero(struct bversion v)
+static __always_inline bool bversion_zero(struct bversion v)
{
- return !bversion_cmp(v, ZERO_VERSION);
+ return bversion_cmp(v, ZERO_VERSION) == 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {}
x(BKEY_FIELD_OFFSET, p.offset) \
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
+ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
+ x(BKEY_FIELD_VERSION_LO, bversion.lo)
struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS];
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 88d8958281e8..e7ac227ba7e8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
bch2_bpos_to_text(out, k->p);
- prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
} else {
prt_printf(out, "(null)");
}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3df3dd2723a1..018fb72e32d3 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
return l->type == r->type &&
- !bversion_cmp(l->version, r->version) &&
+ !bversion_cmp(l->bversion, r->bversion) &&
bpos_eq(l->p, bkey_start_pos(r));
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index b5e0692f03c6..660d2fa02da2 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c)
struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
+ bch2_trans_srcu_unlock(trans);
+
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
@@ -599,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k.k->version.lo > atomic64_read(&c->journal.seq));
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->version.lo > atomic64_read(&c->key_version),
+ k.k->bversion.lo > atomic64_read(&c->key_version),
trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->version.lo);
+ atomic64_set(&c->key_version, k.k->bversion.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index cb48a9477514..1c1448b52207 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset(b, b->set, &b->data->keys);
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1219,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bkey_val_validate(c, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
- !bversion_cmp(u.k->version, MAX_VERSION))) {
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index b28c649c6838..1e694fedc5da 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
w->ca = ca;
t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = IS_ERR_OR_NULL(t);
+ ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
closure_put(&cl);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 91884da4e30a..1a74a1a252ee 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
+ i->k->k.bversion.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys)
trans_for_each_update(trans, i)
- i->k->k.version = MAX_VERSION;
+ i->k->k.bversion = MAX_VERSION;
}
h = trans->hooks;
@@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct jset_entry *entry = trans->journal_entries;
- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- percpu_down_read(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry->start->k.type == KEY_TYPE_accounting) {
+ BUG_ON(!trans->journal_res.ref);
+
+ struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) entry - (u64 *) trans->journal_entries);
+ BUG_ON(bversion_zero(a->k.bversion));
- a->k.version = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) entry - (u64 *) trans->journal_entries);
- BUG_ON(bversion_zero(a->k.version));
- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false);
+ if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
if (ret)
goto revert_fs_usage;
}
- percpu_up_read(&c->mark_lock);
+ }
+ percpu_up_read(&c->mark_lock);
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
trans_for_each_update(trans, i)
if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
@@ -735,6 +739,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
+ trans_for_each_update(trans, i) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags);
+ if (unlikely(ret)){
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
+ goto fatal_err;
+ }
+ }
+
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -798,7 +836,7 @@ revert_fs_usage:
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, false, false);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
bch2_accounting_neg(a);
}
percpu_up_read(&c->mark_lock);
@@ -1019,40 +1057,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (ret)
goto out_reset;
- trans_for_each_update(trans, i) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags);
- if (unlikely(ret)){
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- return ret;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i)) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- return ret;
- }
- }
-
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 60393e98084d..6a454f2fa005 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
if (type && k.k->type != type)
return ERR_PTR(-ENOENT);
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ /* extra padding for varint_decode_fast... */
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
if (!IS_ERR(mut)) {
bkey_reassemble(mut, k);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 757b9884ef55..462b1a2fe1ad 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->version;
+ m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index e972e2bca546..9f3133e3e7e5 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
void *end = &acc_k + 1;
int ret = 0;
+ bkey_fsck_err_on(bversion_zero(k.k->bversion),
+ c, accounting_key_version_0,
+ "accounting key with version=0");
+
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_nr_inodes:
end = field_end(acc_k, nr_inodes);
@@ -291,7 +295,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
struct accounting_mem_entry n = {
.pos = a.k->p,
- .version = a.k->version,
+ .bversion = a.k->bversion,
.nr_counters = bch2_accounting_counters(a.k),
.v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
sizeof(u64), GFP_KERNEL),
@@ -319,11 +323,13 @@ err:
return -BCH_ERR_ENOMEM_disk_accounting;
}
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_replicas_padded r;
- if (accounting_to_replicas(&r.e, a.k->p) &&
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
return -BCH_ERR_btree_insert_need_mark_replicas;
@@ -566,7 +572,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false);
+ bch2_accounting_mem_mod_locked(trans,
+ bkey_i_to_s_c_accounting(&k_i.k),
+ BCH_ACCOUNTING_normal);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -589,30 +597,14 @@ fsck_err:
static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
if (k.k->type != KEY_TYPE_accounting)
return 0;
percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
+ BCH_ACCOUNTING_read);
percpu_up_read(&c->mark_lock);
-
- if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
- ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = 0;
-
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (bch2_accounting_key_to_text(&buf, &acc),
- buf.buf)))
- ret = bch2_accounting_update_sb_one(c, k.k->p);
-fsck_err:
- printbuf_exit(&buf);
return ret;
}
@@ -624,6 +616,7 @@ int bch2_accounting_read(struct bch_fs *c)
{
struct bch_accounting_mem *acc = &c->accounting;
struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
int ret = for_each_btree_key(trans, iter,
BTREE_ID_accounting, POS_MIN,
@@ -647,7 +640,7 @@ int bch2_accounting_read(struct bch_fs *c)
accounting_pos_cmp, &k.k->p);
bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;
+ bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
if (applied)
continue;
@@ -655,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (i + 1 < &darray_top(*keys) &&
i[1].k->k.type == KEY_TYPE_accounting &&
!journal_key_cmp(i, i + 1)) {
- BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0);
+ WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
i[1].journal_seq = i[0].journal_seq;
@@ -674,6 +667,45 @@ int bch2_accounting_read(struct bch_fs *c)
keys->gap = keys->nr = dst - keys->data;
percpu_down_read(&c->mark_lock);
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
+ continue;
+
+ struct bch_replicas_padded r;
+ if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
+ continue;
+
+ /*
+ * If the replicas entry is invalid it'll get cleaned up by
+ * check_allocations:
+ */
+ if (bch2_replicas_entry_validate(&r.e, c, &buf))
+ continue;
+
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &k),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_read(&c->mark_lock);
+ }
+ }
+
preempt_disable();
struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
@@ -709,8 +741,10 @@ int bch2_accounting_read(struct bch_fs *c)
}
}
preempt_enable();
+fsck_err:
percpu_up_read(&c->mark_lock);
err:
+ printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f29fd0dd9581..4ea6c8a092bc 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
dst->v.d[i] += src.v->d[i];
- if (bversion_cmp(dst->k.version, src.k->version) < 0)
- dst->k.version = src.k->version;
+ if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
+ dst->k.bversion = src.k->bversion;
}
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
@@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r)
return bpos_cmp(*l, *r);
}
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool);
+enum bch_accounting_mode {
+ BCH_ACCOUNTING_normal,
+ BCH_ACCOUNTING_gc,
+ BCH_ACCOUNTING_read,
+};
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
*/
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read)
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
+ struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_fs *c = trans->c;
+ struct bch_accounting_mem *acc = &c->accounting;
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ bool gc = mode == BCH_ACCOUNTING_gc;
+
+ EBUG_ON(gc && !acc->gc_running);
if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
return 0;
- if (!gc && !read) {
+ if (mode == BCH_ACCOUNTING_normal) {
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
@@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
}
}
- struct bch_accounting_mem *acc = &c->accounting;
unsigned idx;
- EBUG_ON(gc && !acc->gc_running);
-
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, gc);
+ int ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
index 1687a45177a7..b1982131b206 100644
--- a/fs/bcachefs/disk_accounting_types.h
+++ b/fs/bcachefs/disk_accounting_types.h
@@ -6,7 +6,7 @@
struct accounting_mem_entry {
struct bpos pos;
- struct bversion version;
+ struct bversion bversion;
unsigned nr_counters;
u64 __percpu *v[2];
};
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 95afa7bf2020..3a16b535b6c3 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c,
if (!c)
c = trans->c;
- WARN_ON(!trans && bch2_current_has_btree_trans(c));
+ /*
+ * Ugly: if there's a transaction in the current task it has to be
+ * passed in to unlock if we prompt for user input.
+ *
+ * But, plumbing a transaction and transaction restarts into
+ * bkey_validate() is problematic.
+ *
+ * So:
+ * - make all bkey errors AUTOFIX, they're simple anyways (we just
+ * delete the key)
+ * - and we don't need to warn if we're not prompting
+ */
+ WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 2f1b86978f36..21ee7211b03e 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -184,7 +184,7 @@ do { \
ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \
} \
- int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \
+ int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9b3470a97546..0d8b782b63fb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -21,6 +21,49 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+/*
+ * Check whether dirent @d points at @inode: for DT_SUBVOL dirents the child
+ * subvolume is compared with inode->bi_subvol, otherwise d_inum is compared
+ * with inode->bi_inum.
+ *
+ * Returns 0 on match, -BCH_ERR_ENOENT_dirent_doesnt_match_inode on mismatch.
+ * The return type must be int, not bool: a bool would collapse the negative
+ * error code to 1, and callers that store the result in an int and propagate
+ * it would then return 1 instead of the error code.
+ */
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+ return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+}
+
+static void dirent_inode_mismatch_msg(struct printbuf *out,
+ struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ prt_str(out, "inode points to dirent that does not point back:");
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, dirent.s_c);
+ prt_newline(out);
+ bch2_inode_unpacked_to_text(out, inode);
+}
+
+static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -346,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
+ if (!inode->bi_dir)
+ return 0;
- d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0,
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0,
dirent);
- ret = bkey_err(d) ?:
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
__remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -371,7 +417,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
+ if (!bch2_err_matches(ret, ENOENT))
+ bch_err_msg(c, ret, "removing dirent");
if (ret)
return ret;
@@ -626,12 +673,12 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
- bool seen_this_pos;
u64 count;
};
struct inode_walker {
bool first_this_inode;
+ bool have_inodes;
bool recalculate_sums;
struct bpos last_pos;
@@ -669,6 +716,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ /*
+ * We no longer have inodes for w->last_pos; clear this to avoid
+ * screwing up check_i_sectors/check_subdir_count if we take a
+ * transaction restart here:
+ */
+ w->have_inodes = false;
w->recalculate_sums = false;
w->inodes.nr = 0;
@@ -686,6 +739,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
+ w->have_inodes = true;
return 0;
}
@@ -740,9 +794,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, k.k->p)) {
- darray_for_each(w->inodes, i)
- i->seen_this_pos = false;
}
w->last_pos = k.k->p;
@@ -896,21 +947,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
@@ -920,13 +956,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
return ret;
}
-static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+static int check_inode_dirent_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
- u32 inode_snapshot, bool *write_inode)
+ bool *write_inode)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ u32 inode_snapshot = inode->bi_snapshot;
struct btree_iter dirent_iter = {};
struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
int ret = bkey_err(d);
@@ -936,13 +973,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
- fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
trans, inode_points_to_wrong_dirent,
- "inode points to dirent that does not point back:\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ "%s",
+ (printbuf_reset(&buf),
+ dirent_inode_mismatch_msg(&buf, c, d, inode),
+ buf.buf))) {
/*
* We just clear the backpointer fields for now. If we find a
* dirent that points to this inode in check_dirents(), we'll
@@ -963,7 +1000,7 @@ fsck_err:
return ret;
}
-static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
+static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
@@ -972,7 +1009,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
/* snapshot tree corruption, can't safely delete */
if (!inum.subvol) {
- bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
+ bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
}
@@ -1045,30 +1082,44 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_flags & BCH_INODE_unlinked) {
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- return ret;
+ if (!test_bit(BCH_FS_started, &c->flags)) {
+ /*
+ * If we're not in online fsck, don't delete unlinked
+ * inodes, just make sure they're on the deleted list.
+ *
+ * They might be referred to by a logged operation -
+ * i.e. we might have crashed in the middle of a
+ * truncate on an unlinked but open file - so we want to
+ * let the delete_dead_inodes kill it after resuming
+ * logged ops.
+ */
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ return ret;
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
- ret = 0;
- }
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
- if (u.bi_flags & BCH_INODE_unlinked &&
- !bch2_inode_open(c, k.k->p) &&
- (!c->sb.clean ||
- fsck_err(trans, inode_unlinked_but_clean,
- "filesystem marked clean, but inode %llu unlinked",
- u.bi_inum))) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- return ret;
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
+ if (ret)
+ goto err;
+ } else {
+ if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
+ trans, inode_unlinked_and_not_open,
+ "inode %llu%u unlinked and not open",
+ u.bi_inum, u.bi_snapshot)) {
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ return ret;
+ }
+ }
}
+ /* i_size_dirty is vestigial, since we now have logged ops for truncate */
if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
@@ -1097,8 +1148,9 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ /* i_sectors_dirty is vestigial, i_sectors is always updated transactionally */
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
@@ -1126,7 +1178,7 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_dir || u.bi_dir_offset) {
- ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
}
@@ -1555,10 +1607,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
struct snapshots_seen *s,
- struct extent_ends *extent_ends)
+ struct extent_ends *extent_ends,
+ struct disk_reservation *res)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1568,7 +1620,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
}
- if (inode->last_pos.inode != k.k->p.inode) {
+ if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
ret = check_i_sectors(trans, inode);
if (ret)
goto err;
@@ -1578,12 +1630,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
+ struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(extent_i);
if (ret)
goto err;
- ret = check_key_has_inode(trans, iter, inode, i, k);
+ ret = check_key_has_inode(trans, iter, inode, extent_i, k);
if (ret)
goto err;
@@ -1592,24 +1644,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums);
if (ret)
goto err;
- }
- /*
- * Check inodes in reverse order, from oldest snapshots to newest,
- * starting from the inode that matches this extent's snapshot. If we
- * didn't have one, iterate over all inodes:
- */
- if (!i)
- i = &darray_last(inode->inodes);
-
- for (;
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
- continue;
+ /*
+ * Check inodes in reverse order, from oldest snapshots to
+ * newest, starting from the inode that matches this extent's
+ * snapshot. If we didn't have one, iterate over all inodes:
+ */
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
- if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
@@ -1629,13 +1676,25 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
iter->k.type = KEY_TYPE_whiteout;
+ break;
}
-
- if (bkey_extent_is_allocation(k.k))
- i->count += k.k->size;
}
+ }
- i->seen_this_pos = true;
+ ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (bkey_extent_is_allocation(k.k)) {
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
+
+ i->count += k.k->size;
+ }
}
if (k.k->type != KEY_TYPE_whiteout) {
@@ -1666,13 +1725,11 @@ int bch2_check_extents(struct bch_fs *c)
extent_ends_init(&extent_ends);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors_notnested(trans, &w));
@@ -1758,6 +1815,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (inode_points_to_dirent(target, d))
@@ -1770,7 +1828,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
prt_printf(&buf, "\n "),
bch2_inode_unpacked_to_text(&buf, target),
buf.buf)))
- goto out_noiter;
+ goto err;
if (!target->bi_dir &&
!target->bi_dir_offset) {
@@ -1779,7 +1837,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- struct btree_iter bp_iter = { NULL };
struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
ret = bkey_err(bp_dirent);
@@ -1840,7 +1897,6 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
-out_noiter:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -2075,7 +2131,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (dir->last_pos.inode != k.k->p.inode) {
+ if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
ret = check_subdir_count(trans, dir);
if (ret)
goto err;
@@ -2137,11 +2193,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
- i->count++;
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
out:
err:
fsck_err:
@@ -2164,12 +2224,9 @@ int bch2_check_dirents(struct bch_fs *c)
snapshots_seen_init(&s);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
check_subdir_count_notnested(trans, &dir));
@@ -2314,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v)
return false;
}
-/*
- * We've checked that inode backpointers point to valid dirents; here, it's
- * sufficient to check that the subvolume root has a dirent:
- */
-static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- return inode.bi_dir != 0;
-}
-
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -2348,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- ret = subvol_has_dirent(trans, s);
- if (ret < 0)
+ struct bch_inode_unpacked subvol_root;
+ ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &subvol_root);
+ if (ret)
break;
- if (fsck_err_on(!ret,
+ /*
+ * We've checked that inode backpointers point to valid dirents;
+ * here, it's sufficient to check that the subvolume root has a
+ * dirent:
+ */
+ if (fsck_err_on(!subvol_root.bi_dir,
trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
+ prt_newline(&buf),
+ bch2_inode_unpacked_to_text(&buf, &subvol_root),
buf.buf))) {
ret = reattach_subvol(trans, s);
break;
@@ -2450,10 +2501,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, &inode)) {
+ if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- }
if (bch2_err_matches(ret, ENOENT)) {
ret = 0;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 6ac0ff7e074b..753c208896c3 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- if (likely(k.k->type == KEY_TYPE_inode_v3))
- return bch2_inode_unpack_v3(k, unpacked);
- return bch2_inode_unpack_slowpath(k, unpacked);
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
+ return likely(k.k->type == KEY_TYPE_inode_v3)
+ ? bch2_inode_unpack_v3(k, unpacked)
+ : bch2_inode_unpack_slowpath(k, unpacked);
}
int bch2_inode_peek_nowarn(struct btree_trans *trans,
@@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- prt_printf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
__bch2_inode_unpacked_to_text(out, inode);
}
@@ -1111,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot))
goto delete;
- if (c->sb.clean &&
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index f1fcb4c58039..695abd707cb6 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -69,6 +69,7 @@ typedef u64 u96;
struct bch_inode_unpacked {
u64 bi_inum;
+ u32 bi_snapshot;
u64 bi_journal_seq;
__le64 bi_hash_seed;
u64 bi_size;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index b2f50e74bb76..e4fc17c548fd 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if ((ret = bkey_err(k)))
goto out;
- if (bversion_cmp(k.k->version, rbio->version) ||
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -1031,7 +1031,7 @@ get_bio:
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
- rbio->version = k.k->version;
+ rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index d3b5be7fd9bf..b5fe9e0dc155 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
- e->k.version = version;
+ e->k.bversion = version;
if (crc.csum_type ||
crc.compression_type ||
@@ -1544,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos;
- id->k.version = op->version;
+ id->k.bversion = op->version;
id->k.size = sectors;
iter = bio->bi_iter;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 30460bce04be..954f6a96e0f4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
goto out;
}
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) {
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index f49fdca1d07d..6f4a4e1083c9 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
+ trans, logged_op_but_clean,
+ "filesystem marked as clean but have logged op\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf));
if (!fn)
return 0;
@@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
-
- return trans_was_restarted(trans, restart_count);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index be1e7ca4362f..6db72d3bad7d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
+ if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
ret = 0;
goto out;
}
@@ -717,6 +717,8 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -862,6 +864,9 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* in case we don't run journal replay, i.e. norecovery mode */
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
/* fsync if we fixed errors */
if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 8c7dee5983d2..50406ce0e4ef 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -50,7 +50,7 @@
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index e59c0abb4772..f457925fa362 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.version = orig->k.version;
+ r_v->k.bversion = orig->k.bversion;
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 998c0bd06802..bcb3276747e0 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
+static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
@@ -94,6 +94,16 @@ bad:
return -BCH_ERR_invalid_replicas_entry;
}
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_fs *c,
+ struct printbuf *err)
+{
+ mutex_lock(&c->sb_lock);
+ int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
@@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate(e, sb, err);
+ int ret = bch2_replicas_entry_validate_locked(e, sb, err);
if (ret)
return ret;
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 622482559c3d..5aba2c1ce133 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_sb *, struct printbuf *);
+ struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 *
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 025848a9c4c0..005275281804 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
+ kfree(clean);
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index c7e4cdd3f6a5..5102059a0f1d 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
if (!first)
prt_char(out, ',');
first = false;
- unsigned e = le16_to_cpu(i->errors[j]);
- prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
}
prt_newline(out);
}
@@ -353,7 +352,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);
- downgrade_table_extra(c, &table);
+ ret = downgrade_table_extra(c, &table);
+ if (ret)
+ goto out;
if (!dst->recovery_passes[0] &&
!dst->recovery_passes[1] &&
@@ -399,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_SB_ERR_MAX)
+ if (e < BCH_FSCK_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
__set_bit_le64(e, ext->errors_silent);
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index c1270d790e43..013a96883b4e 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -7,12 +7,12 @@
const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_SB_ERRS()
- NULL
+#undef x
};
-static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
- if (id < BCH_SB_ERR_MAX)
+ if (id < BCH_FSCK_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]);
else
prt_printf(out, "(unknown error %u)", id);
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index 8889001e7db4..b2357b8e6107 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -6,6 +6,8 @@
extern const char * const bch2_sb_error_strs[];
+void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index f0c14702f9e6..ed5dca5e1161 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -210,22 +210,23 @@ enum bch_fsck_flags {
x(inode_snapshot_mismatch, 196, 0) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_unlinked_and_not_open, 281, 0) \
x(inode_checksum_type_invalid, 199, 0) \
x(inode_compression_type_invalid, 200, 0) \
x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, 0) \
- x(inode_i_sectors_dirty_but_clean, 203, 0) \
- x(inode_i_sectors_wrong, 204, 0) \
- x(inode_dir_wrong_nlink, 205, 0) \
- x(inode_dir_multiple_links, 206, 0) \
- x(inode_multiple_links_but_nlink_0, 207, 0) \
- x(inode_wrong_backpointer, 208, 0) \
- x(inode_wrong_nlink, 209, 0) \
- x(inode_unreachable, 210, 0) \
- x(deleted_inode_but_clean, 211, 0) \
- x(deleted_inode_missing, 212, 0) \
- x(deleted_inode_is_dir, 213, 0) \
- x(deleted_inode_not_unlinked, 214, 0) \
+ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
+ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
+ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
+ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
+ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
+ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
+ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
+ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
+ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
+ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \
@@ -255,7 +256,7 @@ enum bch_fsck_flags {
x(dir_loop, 241, 0) \
x(hash_table_key_duplicate, 242, 0) \
x(hash_table_key_wrong_offset, 243, 0) \
- x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
@@ -270,7 +271,7 @@ enum bch_fsck_flags {
x(subvol_children_not_set, 256, 0) \
x(subvol_children_bad, 257, 0) \
x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, 0) \
+ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
@@ -282,8 +283,8 @@ enum bch_fsck_flags {
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \
- x(alloc_key_stripe_sectors_wrong, 271, 0) \
- x(accounting_mismatch, 272, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
+ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
@@ -292,12 +293,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
+ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
+ x(MAX, 284, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
- BCH_SB_ERR_MAX
};
struct bch_sb_field_errors {
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 9cbd3c14c94f..617d07e53b20 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
ret = -1 - SIX_LOCK_write;
}
} else if (type == SIX_LOCK_write && lock->readers) {
- if (try) {
+ if (try)
atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
+ /*
+ * Make sure atomic_add happens before pcpu_read_count and
+ * six_set_bitmask in slow path happens before pcpu_read_count.
+ *
+ * Paired with the smp_mb() in read lock fast path (per-cpu mode)
+ * and the one before atomic_read in read unlock path.
+ */
+ smp_mb();
ret = !pcpu_read_count(lock);
if (try && !ret) {
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 8b18a9b483a4..1809442b00ee 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 id = snapshot_root;
u32 subvol = 0, s;
+ rcu_read_lock();
while (id) {
s = snapshot_t(c, id)->subvol;
@@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
id = bch2_snapshot_tree_next(c, id);
}
+ rcu_read_unlock();
return subvol;
}
@@ -1782,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
new->k.p.snapshot = leaf_id;
ret = bch2_trans_update(trans, &iter, new, 0);
out:
+ bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index dbe834cb349f..6845dde1b339 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans,
}
struct bch_inode_unpacked inode;
- struct btree_iter inode_iter = {};
- ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- 0);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret,
- trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode_iter.k.p.snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
- if (ret)
+ &inode);
+ if (!ret) {
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ trans, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode.bi_snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+ } else if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(trans, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ ret = ret ?: -BCH_ERR_transaction_restart_nested;
goto err;
+ }
+ } else {
+ goto err;
}
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
@@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans,
"%s: snapshot tree %u not found", __func__, snapshot_tree);
if (ret)
- return ret;
+ goto err;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
trans, subvol_not_master_and_not_snapshot,
@@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans,
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(s);
if (ret)
- return ret;
+ goto err;
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d86d5dae54c9..ce7410d72089 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -799,8 +799,10 @@ retry:
i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i);
- if (offset == opt_get(*opts, sb))
+ if (offset == opt_get(*opts, sb)) {
+ ret = -BCH_ERR_invalid;
continue;
+ }
ret = read_one_super(sb, offset, &err);
if (!ret)
@@ -1188,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
+ min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
prt_newline(out);
kfree(errors_silent);
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 01b768c9b767..b2f209743afe 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start;
- k.k_i.k.version.lo = test_version++;
+ k.k_i.k.bversion.lo = test_version++;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret);