summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/alloc_background.c77
-rw-r--r--fs/bcachefs/alloc_background_format.h2
-rw-r--r--fs/bcachefs/alloc_foreground.c2
-rw-r--r--fs/bcachefs/bcachefs_format.h5
-rw-r--r--fs/bcachefs/bset.c2
-rw-r--r--fs/bcachefs/btree_gc.c30
-rw-r--r--fs/bcachefs/btree_io.c7
-rw-r--r--fs/bcachefs/btree_iter.c6
-rw-r--r--fs/bcachefs/btree_iter.h10
-rw-r--r--fs/bcachefs/btree_node_scan.c3
-rw-r--r--fs/bcachefs/btree_trans_commit.c3
-rw-r--r--fs/bcachefs/btree_update.c4
-rw-r--r--fs/bcachefs/btree_update.h2
-rw-r--r--fs/bcachefs/btree_update_interior.c4
-rw-r--r--fs/bcachefs/buckets.c7
-rw-r--r--fs/bcachefs/buckets.h12
-rw-r--r--fs/bcachefs/chardev.c1
-rw-r--r--fs/bcachefs/darray.c15
-rw-r--r--fs/bcachefs/data_update.c1
-rw-r--r--fs/bcachefs/dirent.c7
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/disk_accounting.c155
-rw-r--r--fs/bcachefs/ec.c108
-rw-r--r--fs/bcachefs/errcode.h3
-rw-r--r--fs/bcachefs/error.c28
-rw-r--r--fs/bcachefs/error.h9
-rw-r--r--fs/bcachefs/extents.h10
-rw-r--r--fs/bcachefs/fs-io-buffered.c6
-rw-r--r--fs/bcachefs/fs-io-direct.c3
-rw-r--r--fs/bcachefs/fs-io-pagecache.c70
-rw-r--r--fs/bcachefs/fs-io.c2
-rw-r--r--fs/bcachefs/fs.c161
-rw-r--r--fs/bcachefs/fs.h9
-rw-r--r--fs/bcachefs/fsck.c813
-rw-r--r--fs/bcachefs/fsck.h1
-rw-r--r--fs/bcachefs/inode.c348
-rw-r--r--fs/bcachefs/inode.h39
-rw-r--r--fs/bcachefs/inode_format.h9
-rw-r--r--fs/bcachefs/io_misc.c63
-rw-r--r--fs/bcachefs/io_read.c8
-rw-r--r--fs/bcachefs/io_write.c4
-rw-r--r--fs/bcachefs/journal.c23
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/logged_ops.c16
-rw-r--r--fs/bcachefs/logged_ops.h2
-rw-r--r--fs/bcachefs/lru.c34
-rw-r--r--fs/bcachefs/move.c2
-rw-r--r--fs/bcachefs/movinggc.c12
-rw-r--r--fs/bcachefs/opts.c10
-rw-r--r--fs/bcachefs/opts.h3
-rw-r--r--fs/bcachefs/quota.c2
-rw-r--r--fs/bcachefs/rebalance.c4
-rw-r--r--fs/bcachefs/recovery.c5
-rw-r--r--fs/bcachefs/recovery_passes_types.h1
-rw-r--r--fs/bcachefs/replicas.c39
-rw-r--r--fs/bcachefs/sb-downgrade.c5
-rw-r--r--fs/bcachefs/sb-errors_format.h41
-rw-r--r--fs/bcachefs/sb-members.c10
-rw-r--r--fs/bcachefs/siphash.c2
-rw-r--r--fs/bcachefs/snapshot.c129
-rw-r--r--fs/bcachefs/snapshot.h3
-rw-r--r--fs/bcachefs/str_hash.h60
-rw-r--r--fs/bcachefs/subvolume.c23
-rw-r--r--fs/bcachefs/subvolume.h2
-rw-r--r--fs/bcachefs/super.c36
-rw-r--r--fs/bcachefs/tests.c4
-rw-r--r--fs/bcachefs/util.c2
-rw-r--r--fs/bcachefs/varint.c2
-rw-r--r--fs/bcachefs/xattr.c2
69 files changed, 1737 insertions, 795 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 645b5ed4babb..c84a91572a1d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -332,7 +332,6 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
- a->fragmentation_lru = swab64(a->fragmentation_lru);
a->stripe_sectors = swab32(a->stripe_sectors);
bps = alloc_v4_backpointers(a);
@@ -347,6 +346,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -364,9 +364,13 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
- prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru);
+
+ if (ca)
+ prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
printbuf_indent_sub(out, 2);
+
+ bch2_dev_put(ca);
}
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -635,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
continue;
}
+ if (k.k->p.offset < ca->mi.first_bucket) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ continue;
+ }
+
+ if (k.k->p.offset >= ca->mi.nbuckets) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
0;
@@ -882,12 +896,13 @@ int bch2_trigger_alloc(struct btree_trans *trans,
goto err;
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
- if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
+ new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
+ if (old_lru != new_lru) {
ret = bch2_lru_change(trans,
BCH_LRU_FRAGMENTATION_START,
bucket_to_u64(new.k->p),
- old_a->fragmentation_lru, new_a->fragmentation_lru);
+ old_lru, new_lru);
if (ret)
goto err;
}
@@ -1629,18 +1644,22 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
return ret;
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
+ if (!ca)
+ return 0;
+
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- if (a->fragmentation_lru) {
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+ if (lru_idx) {
ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
- a->fragmentation_lru,
- alloc_k, last_flushed);
+ lru_idx, alloc_k, last_flushed);
if (ret)
- return ret;
+ goto err;
}
if (a->data_type != BCH_DATA_cached)
- return 0;
+ goto err;
if (fsck_err_on(!a->io_time[READ],
trans, alloc_key_cached_but_read_time_zero,
@@ -1669,6 +1688,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
goto err;
err:
fsck_err:
+ bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
}
@@ -1957,7 +1977,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
ca->mi.bucket_size,
GFP_KERNEL);
- int ret = bch2_trans_do(c, NULL, NULL,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc,
bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
@@ -2117,14 +2137,15 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
if (ret)
- break;
+ goto restart_err;
if (!k.k)
break;
ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+restart_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
if (ret)
break;
@@ -2330,24 +2351,19 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
/* Bucket IO clocks: */
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- u64 now;
- int ret = 0;
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
-
- a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- ret = PTR_ERR_OR_ZERO(a);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
+ int ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
- now = bch2_current_io_time(c, rw);
+ u64 now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2360,6 +2376,15 @@ out:
return ret;
}
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
+}
+
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index f754a2951d8a..befdaa95c515 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -70,7 +70,7 @@ struct bch_alloc_v4 {
__u32 stripe;
__u32 nr_external_backpointers;
/* end of fields in original version of alloc_v4 */
- __u64 fragmentation_lru;
+ __u64 _fragmentation_lru; /* obsolete */
__u32 stripe_sectors;
__u32 pad;
} __packed __aligned(8);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index d0e0b56892e3..5836870ab882 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -684,7 +684,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
struct bch_dev_usage usage;
struct open_bucket *ob;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
data_type, cl, false, &usage)));
return ob;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 203ee627cab5..5004f6ba997c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -223,7 +223,7 @@ struct bkey {
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ struct bversion bversion;
__u8 pad[1];
#endif
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
+ x(inode_has_child_snapshots, BCH_VERSION(1, 13))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index d1f6092624d8..9a4a83d6fd2d 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -13,7 +13,7 @@
#include "trace.h"
#include "util.h"
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 660d2fa02da2..0ca3feeb42c8 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -820,15 +820,23 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
* fix that here:
*/
alloc_data_type_set(&gc, gc.data_type);
-
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors) {
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
if (ret)
return ret;
- }
- gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca);
+ /*
+ * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
+ * safe w.r.t. transaction restarts, so fixup the gc_bucket so
+ * we don't run it twice:
+ */
+ percpu_down_read(&c->mark_lock);
+ struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
+ gc_m->data_type = gc.data_type;
+ gc_m->dirty_sectors = gc.dirty_sectors;
+ percpu_up_read(&c->mark_lock);
+ }
if (fsck_err_on(new.data_type != gc.data_type,
trans, alloc_key_data_type_wrong,
@@ -857,7 +865,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
copy_bucket_field(alloc_key_stripe_wrong, stripe);
copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
- copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@@ -1227,17 +1234,20 @@ int bch2_gc_gens(struct bch_fs *c)
u64 b, start_time = local_clock();
int ret;
- /*
- * Ideally we would be using state_lock and not gc_gens_lock here, but that
- * introduces a deadlock in the RO path - we currently take the state
- * lock at the start of going RO, thus the gc thread may get stuck:
- */
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->state_lock);
+ /*
+ * We have to use trylock here. Otherwise, we would
+ * introduce a deadlock in the RO path - we take the
+ * state lock at the start of going RO.
+ */
+ if (!down_read_trylock(&c->state_lock)) {
+ mutex_unlock(&c->gc_gens_lock);
+ return 0;
+ }
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1c1448b52207..6296a11ccb09 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_trans *trans = bch2_trans_get(c);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- __btree_node_write_done(c, b);
- six_unlock_read(&b->c.lock);
+ /* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1870,7 +1871,7 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index bfe9f0c1e1be..0883cf6e1a3e 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
else
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(iter_pos, end)
- : bkey_ge(iter_pos, end)))
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
+ bkey_gt(iter_pos, end)))
goto end;
break;
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 78e63ad7d380..0bda054f80d7 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(&(_iter)))
+
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
@@ -904,6 +912,8 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
_ret; \
})
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 1e694fedc5da..a7aedb134e9f 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
return;
+ if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+ return;
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 1a74a1a252ee..9bf471fa4361 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -832,7 +832,8 @@ revert_fs_usage:
for (struct jset_entry *entry2 = trans->journal_entries;
entry2 != entry;
entry2 = vstruct_next(entry2))
- if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) {
+ if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry2->start->k.type == KEY_TYPE_accounting) {
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a);
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 514df618548e..5d809e8bd170 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -668,7 +668,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
struct disk_reservation *disk_res, int flags,
enum btree_iter_update_trigger_flags iter_flags)
{
- return bch2_trans_do(c, disk_res, NULL, flags,
+ return bch2_trans_commit_do(c, disk_res, NULL, flags,
bch2_btree_insert_trans(trans, id, k, iter_flags));
}
@@ -865,7 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
memcpy(l->d, buf.buf, buf.pos);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
- ret = bch2_trans_do(c, NULL, NULL,
+ ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_lazy_rw|commit_flags,
__bch2_trans_log_msg(trans, &buf, u64s));
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 6a454f2fa005..70b3c989fac2 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -192,7 +192,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 190bc1e81756..64f0928e1137 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2239,10 +2239,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
- async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 546cd01a72e3..ec7d9a59bea9 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1160,11 +1160,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
#define SECTORS_CACHE 1024
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, get;
- s64 sectors_available;
+ u64 sectors_available;
int ret;
percpu_down_read(&c->mark_lock);
@@ -1202,6 +1202,9 @@ recalculate:
percpu_u64_set(&c->pcpu->sectors_available, 0);
sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
+ sectors = min(sectors, sectors_available);
+
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index e2cb7b24b220..fd5e6ccad45e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -344,14 +344,16 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
}
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+enum bch_reservation_flags {
+ BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
+ BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
+};
-int __bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
+ u64, enum bch_reservation_flags);
static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
#ifdef __KERNEL__
u64 old, new;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index cbfd88f98472..2182b555c112 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -225,6 +225,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index 4f06cd8bbbe1..e86d36d23e9e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -2,6 +2,7 @@
#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "darray.h"
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
@@ -9,7 +10,19 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
+ /*
+ * This is a workaround: kvmalloc() doesn't support > INT_MAX
+ * allocations, but vmalloc() does.
+ * The limit needs to be lifted from kvmalloc, and when it does
+ * we'll go back to just using that.
+ */
+ size_t bytes;
+ if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
+ return -ENOMEM;
+
+ void *data = likely(bytes < INT_MAX)
+ ? kvmalloc_noprof(bytes, gfp)
+ : vmalloc_noprof(bytes);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 462b1a2fe1ad..a6ee0beee6b0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
if (ptr2 == ptr)
break;
+ ca = bch2_dev_have_ref(c, ptr2->dev);
bucket = PTR_BUCKET_POS(ca, ptr2);
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 84dd4a879d98..faffc98d5605 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -250,13 +250,6 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
}
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 8945145865c5..53ad99666022 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -34,6 +34,13 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+static inline void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9f3133e3e7e5..07eb8fa1b026 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
*p = swab64(*p);
}
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+ struct disk_accounting_pos acc)
+{
+ unsafe_memcpy(r, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas),
+ "variable length struct");
+}
+
static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_replicas:
- unsafe_memcpy(r, &acc_k.replicas,
- replicas_entry_bytes(&acc_k.replicas),
- "variable length struct");
+ __accounting_to_replicas(r, acc_k);
return true;
default:
return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
return ret;
}
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+ struct disk_accounting_pos acc,
+ u64 *v, unsigned nr)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, invalid_dev = -1;
+
+ switch (acc.type) {
+ case BCH_DISK_ACCOUNTING_replicas: {
+ struct bch_replicas_padded r;
+ __accounting_to_replicas(&r.e, acc);
+
+ for (unsigned i = 0; i < r.e.nr_devs; i++)
+ if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r.e.devs[i])) {
+ invalid_dev = r.e.devs[i];
+ goto invalid_device;
+ }
+
+ /*
+ * All replicas entry checks except for invalid device are done
+ * in bch2_accounting_validate
+ */
+ BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_write(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_write(&c->mark_lock);
+ }
+ break;
+ }
+
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+ invalid_dev = acc.dev_data_type.dev;
+ goto invalid_device;
+ }
+ break;
+ }
+
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+invalid_device:
+ if (fsck_err(trans, accounting_to_invalid_device,
+ "accounting entry points to invalid device %i\n %s",
+ invalid_dev,
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf))) {
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+ -BCH_ERR_remove_disk_accounting_entry;
+ } else {
+ ret = -BCH_ERR_remove_disk_accounting_entry;
+ }
+ goto fsck_err;
+}
+
/*
* At startup time, initialize the in memory accounting from the btree (and
* journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
}
keys->gap = keys->nr = dst - keys->data;
- percpu_down_read(&c->mark_lock);
- for (unsigned i = 0; i < acc->k.nr; i++) {
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+ percpu_down_write(&c->mark_lock);
+ unsigned i = 0;
+ while (i < acc->k.nr) {
+ unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
- if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
- continue;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
- struct bch_replicas_padded r;
- if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
- continue;
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
/*
- * If the replicas entry is invalid it'll get cleaned up by
- * check_allocations:
+ * If the entry counters are zeroed, it should be treated as
+ * nonexistent - it might point to an invalid device.
+ *
+ * Remove it, so that if it's re-added it gets re-marked in the
+ * superblock:
*/
- if (bch2_replicas_entry_validate(&r.e, c, &buf))
+ ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+ ? -BCH_ERR_remove_disk_accounting_entry
+ : bch2_disk_accounting_validate_late(trans, acc_k,
+ v, acc->k.data[idx].nr_counters);
+
+ if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+ free_percpu(acc->k.data[idx].v[0]);
+ free_percpu(acc->k.data[idx].v[1]);
+ darray_remove_item(&acc->k, &acc->k.data[idx]);
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ ret = 0;
continue;
-
- struct disk_accounting_pos k;
- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
- if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &k),
- buf.buf))) {
- /*
- * We're not RW yet and still single threaded, dropping
- * and retaking lock is ok:
- */
- percpu_up_read(&c->mark_lock);
- ret = bch2_mark_replicas(c, &r.e);
- if (ret)
- goto fsck_err;
- percpu_down_read(&c->mark_lock);
}
+
+ if (ret)
+ goto fsck_err;
+ i++;
}
preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
}
preempt_enable();
fsck_err:
- percpu_up_read(&c->mark_lock);
+ percpu_up_write(&c->mark_lock);
err:
printbuf_exit(&buf);
bch2_trans_put(trans);
@@ -777,8 +856,10 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
};
u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
- int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc));
+ int ret = bch2_trans_do(c, ({
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
+ }));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 1587c6e1866a..a0aa5bb467d9 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
"incorrect value size (%zu < %u)",
bkey_val_u64s(k.k), stripe_val_u64s(s));
+ bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+ c, stripe_csum_granularity_bad,
+ "invalid csum granularity (%u >= 64)",
+ s->csum_granularity_bits);
+
ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
return ret;
@@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
nr_data,
s.nr_redundant);
bch2_prt_csum_type(out, s.csum_type);
- prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+ prt_str(out, " gran ");
+ if (s.csum_granularity_bits < 64)
+ prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+ else
+ prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
if (s.disk_label) {
prt_str(out, " label");
@@ -257,12 +266,12 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
if (!deleting) {
a->stripe = s.k->p.offset;
a->stripe_redundancy = s.v->nr_redundant;
+ alloc_data_type_set(a, data_type);
} else {
a->stripe = 0;
a->stripe_redundancy = 0;
+ alloc_data_type_set(a, BCH_DATA_user);
}
-
- alloc_data_type_set(a, data_type);
err:
printbuf_exit(&buf);
return ret;
@@ -1177,7 +1186,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
bch_err_fn(c, ret);
if (ret)
@@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
/* stripe creation: */
static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *new,
- bool create)
+ struct bkey_i_stripe *old,
+ struct bkey_i_stripe *new)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ bool create = !old;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
- bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type]);
+ if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+ c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type])) {
ret = -EINVAL;
goto err;
}
if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
- unsigned i;
+ const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
- if (old->nr_blocks != new->v.nr_blocks) {
- bch_err(c, "error updating stripe: nr_blocks does not match");
- ret = -EINVAL;
- goto err;
- }
+ BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+ BUG_ON(old->v.nr_blocks != v->nr_blocks);
- for (i = 0; i < new->v.nr_blocks; i++) {
- unsigned v = stripe_blockcount_get(old, i);
+ for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+ unsigned sectors = stripe_blockcount_get(v, i);
- BUG_ON(v &&
- (old->ptrs[i].dev != new->v.ptrs[i].dev ||
- old->ptrs[i].gen != new->v.ptrs[i].gen ||
- old->ptrs[i].offset != new->v.ptrs[i].offset));
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+ struct printbuf buf = PRINTBUF;
- stripe_blockcount_set(&new->v, i, v);
+ prt_printf(&buf, "stripe changed nonempty block %u", i);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If the stripe ptr changed underneath us, it must have
+ * been dev_remove_stripes() -> invalidate_stripe_to_dev()
+ */
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+ BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+ if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+ new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+ }
+
+ stripe_blockcount_set(&new->v, i, sectors);
}
}
@@ -1495,12 +1519,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- bkey_i_to_stripe(&s->new_stripe.key),
- !s->have_existing_stripe));
+ ret = bch2_trans_commit_do(c, &s->res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
+ s->have_existing_stripe
+ ? bkey_i_to_stripe(&s->existing_stripe.key)
+ : NULL,
+ bkey_i_to_stripe(&s->new_stripe.key)));
bch_err_msg(c, ret, "creating stripe key");
if (ret) {
goto err;
@@ -1876,7 +1902,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
- __clear_bit(v->ptrs[i].dev, devs.d);
+ /*
+ * Note: we don't yet repair invalid blocks (failed/removed
+ * devices) when reusing stripes - we still need a codepath to
+ * walk backpointers and update all extents that point to that
+ * block when updating the stripe
+ */
+ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+ __clear_bit(v->ptrs[i].dev, devs.d);
+
if (i < h->s->nr_data)
nr_have_data++;
else
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 60b7875adada..649263516ab1 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -268,7 +268,8 @@
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, invalid_snapshot_node) \
- x(0, option_needs_open_fs)
+ x(0, option_needs_open_fs) \
+ x(0, remove_disk_accounting_entry)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 3a16b535b6c3..b679def8fb98 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -251,7 +251,10 @@ int __bch2_fsck_err(struct bch_fs *c,
* delete the key)
* - and we don't need to warn if we're not prompting
*/
- WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
+ WARN_ON((flags & FSCK_CAN_FIX) &&
+ !(flags & FSCK_AUTOFIX) &&
+ !trans &&
+ bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
@@ -393,6 +396,14 @@ int __bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
+ bool exiting =
+ test_bit(BCH_FS_fsck_running, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore);
+
+ if (exiting)
+ print = true;
+
if (print) {
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s\n", out->buf);
@@ -400,9 +411,7 @@ int __bch2_fsck_err(struct bch_fs *c,
bch2_print_string_as_lines(KERN_ERR, out->buf);
}
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore))
+ if (exiting)
bch_err(c, "Unable to continue, halting");
else if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
@@ -430,10 +439,17 @@ err:
int __bch2_bkey_fsck_err(struct bch_fs *c,
struct bkey_s_c k,
- enum bch_fsck_flags flags,
+ enum bch_validate_flags validate_flags,
enum bch_sb_error_id err,
const char *fmt, ...)
{
+ if (validate_flags & BCH_VALIDATE_silent)
+ return -BCH_ERR_fsck_delete_bkey;
+
+ unsigned fsck_flags = 0;
+ if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)))
+ fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
+
struct printbuf buf = PRINTBUF;
va_list args;
@@ -445,7 +461,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
va_end(args);
prt_str(&buf, ": delete?");
- int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf);
+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 21ee7211b03e..6551ada926b6 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -167,10 +167,11 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+enum bch_validate_flags;
__printf(5, 6)
int __bch2_bkey_fsck_err(struct bch_fs *,
struct bkey_s_c,
- enum bch_fsck_flags,
+ enum bch_validate_flags,
enum bch_sb_error_id,
const char *, ...);
@@ -180,11 +181,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *,
*/
#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
do { \
- if ((flags & BCH_VALIDATE_silent)) { \
- ret = -BCH_ERR_fsck_delete_bkey; \
- goto fsck_err; \
- } \
- int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
+ int _ret = __bch2_bkey_fsck_err(c, k, flags, \
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ed5001dd662e..923a5f1849a8 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags);
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
+ struct bch_extent_ptr ptr2)
+{
+ return (ptr1.cached == ptr2.cached &&
+ ptr1.unwritten == ptr2.unwritten &&
+ ptr1.offset == ptr2.offset &&
+ ptr1.dev == ptr2.dev &&
+ ptr1.gen == ptr2.gen);
+}
+
void bch2_ptr_swab(struct bkey_s);
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 48a1ab9a649b..95972809e76d 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -856,6 +856,12 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
folios_trunc(&fs, fi);
end = min(end, folio_end_pos(darray_last(fs)));
} else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
folios_trunc(&fs, fi + 1);
end = f_pos + f_reserved;
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index ee1c0325f313..6d3a05ae5da8 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
+ struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct bch_inode_info *inode = dio->inode;
bool sync = dio->sync;
@@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
- bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index af3a24546aa3..1d4910ea0f1d 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -399,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c,
bch2_quota_reservation_put(c, inode, &res->quota);
}
-int bch2_folio_reservation_get(struct bch_fs *c,
+static int __bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- size_t offset, size_t len)
+ size_t offset, size_t len,
+ bool partial)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
+ struct disk_reservation disk_res = {};
+ size_t reserved = len;
int ret;
if (!s)
@@ -422,48 +425,65 @@ int bch2_folio_reservation_get(struct bch_fs *c,
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
+ partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
if (unlikely(ret))
return ret;
+
+ if (unlikely(disk_res.sectors != disk_sectors)) {
+ disk_sectors = quota_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ if (disk_sectors > disk_res.sectors) {
+ /*
+ * Make sure to get a reservation that's
+ * aligned to the filesystem blocksize:
+ */
+ unsigned reserved_offset = round_down(i << 9, block_bytes(c));
+ reserved = clamp(reserved_offset, offset, offset + len) - offset;
+
+ if (!reserved) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return -BCH_ERR_ENOSPC_disk_reservation;
+ }
+ break;
+ }
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+ }
}
if (quota_sectors) {
ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = { .sectors = disk_sectors };
-
- bch2_disk_reservation_put(c, &tmp);
- res->disk.sectors -= disk_sectors;
+ bch2_disk_reservation_put(c, &disk_res);
return ret;
}
}
- return 0;
+ res->disk.sectors += disk_res.sectors;
+ return partial ? reserved : 0;
}
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+int bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
size_t offset, size_t len)
{
- size_t l, reserved = 0;
- int ret;
-
- while ((l = len - reserved)) {
- while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) {
- if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c))
- return reserved ?: ret;
-
- len = reserved + l;
- l /= 2;
- }
-
- offset += l;
- reserved += l;
- }
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
+}
- return reserved;
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
}
static void bch2_clear_folio_bits(struct folio *folio)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 71d0fa387509..15d3f073b824 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -182,7 +182,7 @@ static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_unpacked u;
int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 4a1bb07a2574..a41d0d8a2f7b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
return a.subvol == b.subvol && a.inum == b.inum;
}
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const subvol_inum *inum = data;
+
+ return jhash(&inum->inum, sizeof(inum->inum), seed);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const struct bch_inode_info *inode = data;
+
+ return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -170,26 +184,111 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
.head_offset = offsetof(struct bch_inode_info, hash),
.key_offset = offsetof(struct bch_inode_info, ei_inum),
.key_len = sizeof(subvol_inum),
+ .hashfn = bch2_vfs_inode_hash_fn,
+ .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
.automatic_shrinking = true,
};
-static void __wait_on_freeing_inode(struct inode *inode)
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
- wait_queue_head_t *wq;
- DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
- wq = bit_waitqueue(&inode->i_state, __I_NEW);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->i_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
+ struct bch_fs *c = trans->c;
+ struct rhashtable *ht = &c->vfs_inodes_table;
+ subvol_inum inum = (subvol_inum) { .inum = p.offset };
+ DARRAY(u32) subvols;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return false;
+
+ darray_init(&subvols);
+restart_from_top:
+
+ /*
+ * Tweaked version of __rhashtable_lookup(); we need to get a list of
+ * subvolumes in which the given inode number is open.
+ *
+ * For this to work, we don't include the subvolume ID in the key that
+ * we hash - all inodes with the same inode number regardless of
+ * subvolume will hash to the same slot.
+ *
+ * This will be less than ideal if the same file is ever open
+ * simultaneously in many different snapshots:
+ */
+ rcu_read_lock();
+ struct rhash_lock_head __rcu *const *bkt;
+ struct rhash_head *he;
+ unsigned int hash;
+ struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+ hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+ bkt = rht_bucket(tbl, hash);
+ do {
+ struct bch_inode_info *inode;
+
+ rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+ if (inode->ei_inum.inum == inum.inum) {
+ ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+ GFP_NOWAIT|__GFP_NOWARN);
+ if (ret) {
+ rcu_read_unlock();
+ ret = darray_make_room(&subvols, 1);
+ if (ret)
+ goto err;
+ subvols.nr = 0;
+ goto restart_from_top;
+ }
+ }
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (unlikely(tbl))
+ goto restart;
+ rcu_read_unlock();
+
+ darray_for_each(subvols, i) {
+ u32 snap;
+ ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+ if (ret)
+ break;
+ }
+err:
+ darray_exit(&subvols);
+ return ret;
}
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}
+static void __wait_on_freeing_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ subvol_inum inum)
+{
+ wait_queue_head_t *wq;
+ struct wait_bit_queue_entry wait;
+
+ wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->v.i_lock);
+
+ if (__bch2_inode_hash_find(c, inum) == inode)
+ schedule_timeout(HZ * 10);
+ finish_wait(wq, &wait.wq_entry);
+}
+
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
subvol_inum inum)
{
@@ -204,10 +303,10 @@ repeat:
}
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
if (!trans) {
- __wait_on_freeing_inode(&inode->v);
+ __wait_on_freeing_inode(c, inode, inum);
} else {
bch2_trans_unlock(trans);
- __wait_on_freeing_inode(&inode->v);
+ __wait_on_freeing_inode(c, inode, inum);
int ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
@@ -232,6 +331,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
inode->v.i_hash.pprev = NULL;
+ /*
+ * This pairs with the bch2_inode_hash_find() ->
+ * __wait_on_freeing_inode() path
+ */
+ inode_wake_up_bit(&inode->v, __I_NEW);
}
}
@@ -243,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
- if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+ if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+ &inode->ei_inum,
&inode->hash,
bch2_vfs_inodes_params))) {
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
@@ -291,10 +396,10 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
BUG();
}
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
- bch2_inode_cache, GFP_NOFS);
+ bch2_inode_cache, gfp);
if (!inode)
return NULL;
@@ -306,7 +411,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
kmem_cache_free(bch2_inode_cache, inode);
return NULL;
}
@@ -319,12 +424,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_inode_info *inode =
- memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- __bch2_new_inode(trans->c));
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
@@ -398,7 +501,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = __bch2_new_inode(c);
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
@@ -553,7 +656,7 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct bch_inode_info *inode;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
&hash, &dentry->d_name)));
if (IS_ERR(inode))
@@ -766,7 +869,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
- goto err;
+ goto err_tx_restart;
if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
ret = bch2_fs_quota_transfer(c, src_inode,
@@ -1163,7 +1266,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start), 0);
- while (true) {
+ while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1171,14 +1274,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
if (ret)
- goto err;
+ continue;
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek_upto(&iter, end);
ret = bkey_err(k);
if (ret)
- goto err;
+ continue;
if (!k.k)
break;
@@ -1198,7 +1301,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
- break;
+ continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
@@ -1226,10 +1329,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -1937,7 +2036,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
printbuf_nul_terminate(&buf);
- seq_puts(seq, buf.buf);
+ seq_printf(seq, ",%s", buf.buf);
int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index da74ecc236e7..59f9f7ae728d 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
return inode->ei_inum;
}
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
-
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
@@ -148,6 +146,8 @@ struct bch_inode_info *
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
-static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
- return NULL;
-}
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0d8b782b63fb..75c8a97a6954 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -28,8 +28,8 @@ static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
inode->bi_dir_offset == d.k->p.offset;
}
-static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
{
if (d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
@@ -137,16 +137,15 @@ found:
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, *snapshot), 0);
+ SPOS(0, inode_nr, snapshot), 0);
ret = bkey_err(k);
if (ret)
goto err;
@@ -154,8 +153,6 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
ret = bkey_is_inode(k.k)
? bch2_inode_unpack(k, inode)
: -BCH_ERR_ENOENT_inode;
- if (!ret)
- *snapshot = iter.pos.snapshot;
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -250,8 +247,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
- u32 root_inode_snapshot = snapshot;
- ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
+ ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
@@ -277,17 +273,23 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, snapshot, lostfound);
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
return ret;
create_lostfound:
/*
+ * we always create lost+found in the root snapshot; we don't want
+ * different branches of the snapshot tree to have different lost+found
+ */
+ snapshot = le32_to_cpu(st.root_snapshot);
+ /*
* XXX: we could have a nicer log message here if we had a nice way to
* walk backpointers to print a path
*/
- bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+ bch_notice(c, "creating lost+found in subvol %llu snapshot %u",
+ root_inum.subvol, le32_to_cpu(st.root_snapshot));
u64 now = bch2_current_time(c);
struct btree_iter lostfound_iter = { NULL };
@@ -296,6 +298,7 @@ create_lostfound:
bch2_inode_init_early(c, lostfound);
bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
lostfound->bi_dir = root_inode.bi_inum;
+ lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
root_inode.bi_nlink++;
@@ -323,19 +326,54 @@ err:
return ret;
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 inode_snapshot)
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+ if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+ return false;
+
+ return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+ SPOS(d_pos.inode, d_pos.offset, snapshot),
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(k.k->p, d_pos)) {
+ /*
+ * delete_at() doesn't work because the update path doesn't
+ * internally use BTREE_ITER_with_updates yet
+ */
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&k->k);
+ k->k.type = KEY_TYPE_whiteout;
+ k->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
struct bch_fs *c = trans->c;
- struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
- struct qstr name;
- u64 dir_offset = 0;
- u32 dirent_snapshot = inode_snapshot;
int ret;
+ u32 dirent_snapshot = inode->bi_snapshot;
if (inode->bi_subvol) {
inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
@@ -354,17 +392,22 @@ static int reattach_inode(struct btree_trans *trans,
if (ret)
return ret;
- if (S_ISDIR(inode->bi_mode)) {
- lostfound.bi_nlink++;
+ lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
- ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
- if (ret)
- return ret;
+ /* ensure lost+found inode is also present in inode snapshot */
+ if (!inode->bi_subvol) {
+ BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
+ lostfound.bi_snapshot = inode->bi_snapshot;
}
- dir_hash = bch2_hash_info_init(c, &lostfound);
+ ret = __bch2_fsck_write_inode(trans, &lostfound);
+ if (ret)
+ return ret;
+
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+ struct qstr name = (struct qstr) QSTR(name_buf);
- name = (struct qstr) QSTR(name_buf);
+ inode->bi_dir = lostfound.bi_inum;
ret = bch2_dirent_create_snapshot(trans,
inode->bi_parent_subvol, lostfound.bi_inum,
@@ -373,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans,
inode_d_type(inode),
&name,
inode->bi_subvol ?: inode->bi_inum,
- &dir_offset,
+ &inode->bi_dir_offset,
STR_HASH_must_create);
if (ret) {
bch_err_msg(c, ret, "error creating dirent");
return ret;
}
- inode->bi_dir = lostfound.bi_inum;
- inode->bi_dir_offset = dir_offset;
+ ret = __bch2_fsck_write_inode(trans, inode);
+ if (ret)
+ return ret;
+
+ /*
+ * Fix up inodes in child snapshots: if they should also be reattached
+ * update the backpointer field, if they should not be we need to emit
+ * whiteouts for the dirent we just created.
+ */
+ if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+ snapshot_id_list whiteouts_done;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ darray_init(&whiteouts_done);
+
+ for_each_btree_key_reverse_norestart(trans, iter,
+ BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bkey_is_inode(k.k) ||
+ !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+ snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+ continue;
+
+ struct bch_inode_unpacked child_inode;
+ bch2_inode_unpack(k, &child_inode);
+
+ if (!inode_should_reattach(&child_inode)) {
+ ret = maybe_delete_dirent(trans,
+ SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+ dirent_snapshot),
+ k.k->p.snapshot);
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+ if (ret)
+ break;
+ } else {
+ iter.snapshot = k.k->p.snapshot;
+ child_inode.bi_dir = inode->bi_dir;
+ child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+ ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ break;
+ }
+ }
+ darray_exit(&whiteouts_done);
+ bch2_trans_iter_exit(trans, &iter);
+ }
- return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
+ return ret;
}
static int remove_backpointer(struct btree_trans *trans,
@@ -422,7 +518,7 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
if (ret)
return ret;
- ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+ ret = reattach_inode(trans, &inode);
bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
return ret;
}
@@ -540,8 +636,9 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
new_inode.bi_size = i_size;
new_inode.bi_inum = inum;
+ new_inode.bi_snapshot = snapshot;
- return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+ return __bch2_fsck_write_inode(trans, &new_inode);
}
struct snapshots_seen {
@@ -832,35 +929,138 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int hash_redo_key(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k)
+static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
{
- struct bkey_i *delete;
- struct bkey_i *tmp;
+ if (d.v->d_type == DT_SUBVOL) {
+ u32 snap;
+ u64 inum;
+ int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+ }
+}
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- if (IS_ERR(delete))
- return PTR_ERR(delete);
+/*
+ * Prefer to delete the first one, since that will be the one at the wrong
+ * offset:
+ * return value: 0 -> delete k1, 1 -> delete k2, 2 -> both dirents have valid targets (caller renames k1), < 0 -> error
+ */
+static int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
+{
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
- tmp = bch2_bkey_make_mut_noupdate(trans, k);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
- bkey_init(&delete->k);
- delete->k.p = k_iter->pos;
- return bch2_btree_iter_traverse(k_iter) ?:
- bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_in_snapshot(trans, desc, hash_info,
- (subvol_inum) { 0, k.k->p.inode },
- k.k->p.snapshot, tmp,
- STR_HASH_must_create|
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
+{
+ if (new->k.type != KEY_TYPE_dirent)
+ return 0;
+
+ struct bkey_i_dirent *d = bkey_i_to_dirent(new);
+ struct inode_walker target = inode_walker_init();
+ int ret = 0;
+
+ if (d->v.d_type == DT_SUBVOL) {
+ BUG();
+ } else {
+ ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
+ if (ret)
+ goto err;
+
+ darray_for_each(target.inodes, i) {
+ i->inode.bi_dir_offset = d->k.p.offset;
+ ret = __bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+ inode_walker_exit(&target);
+ return ret;
+}
+
+static int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
+
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+
+ if (u64s > U8_MAX)
+ return -EINVAL;
+
+ new->k.u64s = u64s;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
}
static int hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
@@ -889,16 +1089,9 @@ static int hash_check_key(struct btree_trans *trans,
if (bkey_eq(k.k->p, hash_k.k->p))
break;
- if (fsck_err_on(k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k),
- trans, hash_table_key_duplicate,
- "duplicate hash table keys:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- buf.buf))) {
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
- break;
- }
+ if (k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k))
+ goto duplicate_entries;
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
@@ -911,18 +1104,66 @@ out:
return ret;
bad_hash:
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- bch_err_fn(c, ret);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
if (ret)
- return ret;
- ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
fsck_err:
goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
@@ -988,7 +1229,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
*/
inode->bi_dir = 0;
inode->bi_dir_offset = 0;
- inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
*write_inode = true;
}
@@ -1000,30 +1240,40 @@ fsck_err:
return ret;
}
-static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
+static int get_snapshot_root_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *root,
+ u64 inum)
{
- subvol_inum inum = {
- .subvol = snapshot_t(c, p.snapshot)->subvol,
- .inum = p.offset,
- };
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
- /* snapshot tree corruption, can't safely delete */
- if (!inum.subvol) {
- bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
- return true;
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found_root;
}
-
- return __bch2_inode_hash_find(c, inum) != NULL;
+ if (ret)
+ goto err;
+ BUG();
+found_root:
+ BUG_ON(bch2_inode_unpack(k, root));
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_inode_unpacked *prev,
- struct snapshots_seen *s,
- bool full)
+ struct bch_inode_unpacked *snapshot_root,
+ struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
struct bch_inode_unpacked u;
bool do_update = false;
int ret;
@@ -1043,45 +1293,75 @@ static int check_inode(struct btree_trans *trans,
BUG_ON(bch2_inode_unpack(k, &u));
- if (!full &&
- !(u.bi_flags & (BCH_INODE_i_size_dirty|
- BCH_INODE_i_sectors_dirty|
- BCH_INODE_unlinked)))
- return 0;
-
- if (prev->bi_inum != u.bi_inum)
- *prev = u;
+ if (snapshot_root->bi_inum != u.bi_inum) {
+ ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
+ if (ret)
+ goto err;
+ }
- if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u),
+ if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
trans, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
- bch_err(c, "repair not implemented yet");
- return -BCH_ERR_fsck_repair_unimplemented;
+ u.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
+ do_update = true;
}
- if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
- bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
- struct bpos new_min_pos;
-
- ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
+ }
- u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
+ if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
+ }
- ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+ if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
+ /* Check for this early so that check_unreachable_inode() will reattach it */
- bch_err_msg(c, ret, "in fsck updating inode");
- if (ret)
- return ret;
+ ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
+ if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
+ goto err;
- if (!bpos_eq(new_min_pos, POS_MIN))
- bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
- return 0;
+ fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
+ "dir unlinked but not empty\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf));
+ u.bi_flags &= ~BCH_INODE_unlinked;
+ do_update = true;
+ ret = 0;
+ }
+
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+ trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be %u)\n%s",
+ ret,
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
+ if (ret)
+ u.bi_flags |= BCH_INODE_has_child_snapshot;
+ else
+ u.bi_flags &= ~BCH_INODE_has_child_snapshot;
+ do_update = true;
}
+ ret = 0;
- if (u.bi_flags & BCH_INODE_unlinked) {
+ if ((u.bi_flags & BCH_INODE_unlinked) &&
+ !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
if (!test_bit(BCH_FS_started, &c->flags)) {
/*
* If we're not in online fsck, don't delete unlinked
@@ -1095,7 +1375,7 @@ static int check_inode(struct btree_trans *trans,
*/
ret = check_inode_deleted_list(trans, k.k->p);
if (ret < 0)
- return ret;
+ goto err_noprint;
fsck_err_on(!ret,
trans, unlinked_inode_not_on_deleted_list,
@@ -1106,83 +1386,22 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
} else {
- if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
+ ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret,
trans, inode_unlinked_and_not_open,
"inode %llu%u unlinked and not open",
u.bi_inum, u.bi_snapshot)) {
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode");
- return ret;
+ goto err_noprint;
}
+ ret = 0;
}
}
- /* i_size_dirty is vestigal, since we now have logged ops for truncate * */
- if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
- fsck_err(trans, inode_i_size_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_size dirty",
- u.bi_inum))) {
- bch_verbose(c, "truncating inode %llu", u.bi_inum);
-
- /*
- * XXX: need to truncate partial blocks too here - or ideally
- * just switch units to bytes and that issue goes away
- */
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
- iter->pos.snapshot),
- POS(u.bi_inum, U64_MAX),
- 0, NULL);
- bch_err_msg(c, ret, "in fsck truncating inode");
- if (ret)
- return ret;
-
- /*
- * We truncated without our normal sector accounting hook, just
- * make sure we recalculate it:
- */
- u.bi_flags |= BCH_INODE_i_sectors_dirty;
-
- u.bi_flags &= ~BCH_INODE_i_size_dirty;
- do_update = true;
- }
-
- /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */
- if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
- fsck_err(trans, inode_i_sectors_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_sectors dirty",
- u.bi_inum))) {
- s64 sectors;
-
- bch_verbose(c, "recounting sectors for inode %llu",
- u.bi_inum);
-
- sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
- if (sectors < 0) {
- bch_err_msg(c, sectors, "in fsck recounting inode sectors");
- return sectors;
- }
-
- u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
- do_update = true;
- }
-
- if (u.bi_flags & BCH_INODE_backptr_untrusted) {
- u.bi_dir = 0;
- u.bi_dir_offset = 0;
- u.bi_flags &= ~BCH_INODE_backptr_untrusted;
- do_update = true;
- }
-
- if (u.bi_dir || u.bi_dir_offset) {
- ret = check_inode_dirent_inode(trans, &u, &do_update);
- if (ret)
- goto err;
- }
-
if (fsck_err_on(u.bi_parent_subvol &&
(u.bi_subvol == 0 ||
u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
@@ -1224,21 +1443,22 @@ static int check_inode(struct btree_trans *trans,
}
do_update:
if (do_update) {
- ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
- return ret;
+ goto err_noprint;
}
err:
fsck_err:
bch_err_fn(c, ret);
+err_noprint:
+ printbuf_exit(&buf);
return ret;
}
int bch2_check_inodes(struct bch_fs *c)
{
- bool full = c->opts.fsck;
- struct bch_inode_unpacked prev = { 0 };
+ struct bch_inode_unpacked snapshot_root = {};
struct snapshots_seen s;
snapshots_seen_init(&s);
@@ -1248,13 +1468,104 @@ int bch2_check_inodes(struct bch_fs *c)
POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &prev, &s, full)));
+ check_inode(trans, &iter, k, &snapshot_root, &s)));
snapshots_seen_exit(&s);
bch_err_fn(c, ret);
return ret;
}
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ /*
+ * We look for inodes to reattach in natural key order, leaves first,
+ * but we should do the reattach at the oldest version that needs to be
+ * reattached:
+ */
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+ continue;
+
+ if (!bkey_is_inode(k.k))
+ break;
+
+ struct bch_inode_unpacked parent_inode;
+ bch2_inode_unpack(k, &parent_inode);
+
+ if (!inode_should_reattach(&parent_inode))
+ break;
+
+ *inode = parent_inode;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_unreachable_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ if (!inode_should_reattach(&inode))
+ return 0;
+
+ ret = find_oldest_inode_needs_reattach(trans, &inode);
+ if (ret)
+ return ret;
+
+ if (fsck_err(trans, inode_unreachable,
+ "unreachable inode:\n%s",
+ (bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we know that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_unreachable_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
switch (btree) {
@@ -1347,7 +1658,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
@@ -1657,8 +1968,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
- k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
@@ -1789,7 +2099,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
break;
}
@@ -1810,8 +2120,7 @@ noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+ struct bch_inode_unpacked *target)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
@@ -1821,6 +2130,32 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (inode_points_to_dirent(target, d))
return 0;
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ return __bch2_fsck_write_inode(trans, target);
+ }
+
if (bch2_inode_should_have_bp(target) &&
!fsck_err(trans, inode_wrong_backpointer,
"dirent points to inode that does not point back:\n %s",
@@ -1830,15 +2165,8 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
buf.buf)))
goto err;
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target, target_snapshot);
- }
-
struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
ret = bkey_err(bp_dirent);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1851,14 +2179,14 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
- target->bi_inum, target_snapshot,
+ target->bi_inum, target->bi_snapshot,
target->bi_dir,
target->bi_dir_offset,
d.k->p.inode,
d.k->p.offset)) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ ret = __bch2_fsck_write_inode(trans, target);
goto out;
}
@@ -1873,7 +2201,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
trans, inode_dir_multiple_links,
"%s %llu:%u with multiple links\n%s",
S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target_snapshot, buf.buf)) {
+ target->bi_inum, target->bi_snapshot, buf.buf)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
@@ -1886,10 +2214,10 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
trans, inode_multiple_links_but_nlink_0,
"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ ret = __bch2_fsck_write_inode(trans, target);
if (ret)
goto err;
}
@@ -1906,15 +2234,14 @@ noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+ struct bch_inode_unpacked *target)
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
int ret = 0;
- ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+ ret = check_dirent_inode_dirent(trans, iter, d, target);
if (ret)
goto err;
@@ -2073,7 +2400,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
u64 target_inum = le64_to_cpu(s.v->inode);
u32 target_snapshot = le32_to_cpu(s.v->snapshot);
- ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+ ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -2089,13 +2416,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
target_inum,
subvol_root.bi_parent_subvol, parent_subvol)) {
subvol_root.bi_parent_subvol = parent_subvol;
- ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+ subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &subvol_root);
if (ret)
goto err;
}
- ret = check_dirent_target(trans, iter, d, &subvol_root,
- target_snapshot);
+ ret = check_dirent_target(trans, iter, d, &subvol_root);
if (ret)
goto err;
out:
@@ -2153,7 +2480,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -2188,8 +2515,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
}
darray_for_each(target->inodes, i) {
- ret = check_dirent_target(trans, iter, d,
- &i->inode, i->snapshot);
+ ret = check_dirent_target(trans, iter, d, &i->inode);
if (ret)
goto err;
}
@@ -2268,7 +2594,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -2330,7 +2656,7 @@ static int check_root_trans(struct btree_trans *trans)
goto err;
}
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -2343,8 +2669,9 @@ static int check_root_trans(struct btree_trans *trans)
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode.bi_inum = inum;
+ root_inode.bi_snapshot = snapshot;
- ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
+ ret = __bch2_fsck_write_inode(trans, &root_inode);
bch_err_msg(c, ret, "writing root inode");
}
err:
@@ -2355,7 +2682,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2396,22 +2723,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
break;
- /*
- * We've checked that inode backpointers point to valid dirents;
- * here, it's sufficient to check that the subvolume root has a
- * dirent:
- */
- if (fsck_err_on(!subvol_root.bi_dir,
- trans, subvol_unreachable,
- "unreachable subvolume %s",
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
- prt_newline(&buf),
- bch2_inode_unpacked_to_text(&buf, &subvol_root),
- buf.buf))) {
- ret = reattach_subvol(trans, s);
- break;
- }
-
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
@@ -2472,12 +2783,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-/*
- * Check that a given inode is reachable from its subvolume root - we already
- * verified subvolume connectivity:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
@@ -2491,6 +2796,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
BUG_ON(bch2_inode_unpack(inode_k, &inode));
+ if (!S_ISDIR(inode.bi_mode))
+ return 0;
+
while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
@@ -2505,21 +2813,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
bch2_trans_iter_exit(trans, &dirent_iter);
if (bch2_err_matches(ret, ENOENT)) {
- ret = 0;
- if (fsck_err(trans, inode_unreachable,
- "unreachable inode\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, inode_k),
- buf.buf)))
- ret = reattach_inode(trans, &inode, snapshot);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, inode_k);
+ bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+ bch2_err_str(ret), buf.buf);
goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode.bi_mode))
- break;
-
ret = darray_push(p, ((struct pathbuf_entry) {
.inum = inode.bi_inum,
.snapshot = snapshot,
@@ -2557,7 +2859,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (ret)
break;
- ret = reattach_inode(trans, &inode, snapshot);
+ ret = reattach_inode(trans, &inode);
bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
}
break;
@@ -2572,9 +2874,8 @@ fsck_err:
}
/*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
@@ -2702,6 +3003,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
if (S_ISDIR(u.bi_mode))
continue;
+ /*
+ * Previous passes ensured that bi_nlink is nonzero if
+ * it had multiple hardlinks:
+ */
if (!u.bi_nlink)
continue;
@@ -2787,7 +3092,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u);
}
fsck_err:
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index a4ef94271784..1cca31011530 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 753c208896c3..039cb7a22244 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "extent_update.h"
+#include "fs.h"
#include "inode.h"
#include "str_hash.h"
#include "snapshot.h"
@@ -20,7 +21,7 @@
#include <linux/random.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define x(name, ...) #name,
const char * const bch2_inode_opts[] = {
@@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = {
};
#undef x
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
@@ -160,8 +163,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
unsigned fieldnr = 0, field_bits;
int ret;
-#define x(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+#define x(_name, _bits) \
+ if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
memset((void *) unpacked + offset, 0, \
sizeof(*unpacked) - offset); \
@@ -280,6 +283,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -290,10 +295,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
- if (INODE_NEW_VARINT(inode.v)) {
+ if (INODEv1_NEW_VARINT(inode.v)) {
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
bkey_val_end(inode),
- INODE_NR_FIELDS(inode.v));
+ INODEv1_NR_FIELDS(inode.v));
} else {
return bch2_inode_unpack_v1(inode, unpacked);
}
@@ -327,22 +332,20 @@ int bch2_inode_unpack(struct bkey_s_c k,
: bch2_inode_unpack_slowpath(k, unpacked);
}
-int bch2_inode_peek_nowarn(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
+int __bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags,
+ bool warn)
{
- struct bkey_s_c k;
u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
if (ret)
return ret;
- k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_cached);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_cached);
ret = bkey_err(k);
if (ret)
return ret;
@@ -357,20 +360,12 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans,
return 0;
err:
+ if (warn)
+ bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
bch2_trans_iter_exit(trans, iter);
return ret;
}
-int bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
- return ret;
-}
-
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
@@ -387,9 +382,7 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
-int __bch2_fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
struct bkey_inode_buf *inode_p =
bch2_trans_kmalloc(trans, sizeof(*inode_p));
@@ -398,19 +391,17 @@ int __bch2_fsck_write_inode(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = snapshot;
+ inode_p->inode.k.p.snapshot = inode->bi_snapshot;
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
BTREE_UPDATE_internal_snapshot_node);
}
-int bch2_fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fsck_write_inode(trans, inode, snapshot));
+ __bch2_fsck_write_inode(trans, inode));
bch_err_fn(trans->c, ret);
return ret;
}
@@ -482,10 +473,10 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
- bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
ret = __bch2_inode_validate(c, k, flags);
fsck_err:
@@ -544,6 +535,10 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "(%x)\n", inode->bi_flags);
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
+ prt_printf(out, "hash_type=");
+ bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
+ prt_newline(out);
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
prt_printf(out, "bi_version=%llu\n", inode->bi_version);
@@ -589,9 +584,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
}
}
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+/*
+ * Store @f into the bi_flags field of inode key @k.
+ *
+ * The value is converted with cpu_to_le32() for KEY_TYPE_inode and with
+ * cpu_to_le64() for KEY_TYPE_inode_v2 / KEY_TYPE_inode_v3. Any other key
+ * type hits BUG().
+ */
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_inode:
+		bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+		break;
+	case KEY_TYPE_inode_v2:
+		bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+		break;
+	case KEY_TYPE_inode_v3:
+		bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Returns true if inode key @k has BCH_INODE_unlinked set and does not have
+ * BCH_INODE_has_child_snapshot set. bch2_trigger_inode() uses the change in
+ * this predicate to add/remove the inode in the deleted_inodes btree.
+ *
+ * The flags word must be tested unmasked: masking it with BCH_INODE_unlinked
+ * first would leave the has_child_snapshot bit always clear, so the second
+ * test could never fail.
+ */
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
+{
+	u64 f = bkey_inode_flags(k);
+
+	return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+/*
+ * Walk the keys of @btree that share @pos's inode:offset, starting just after
+ * @pos and ending at snapshot U32_MAX, across all snapshots, and return the
+ * first key for which bch2_snapshot_is_ancestor(c, pos.snapshot, key snapshot)
+ * holds.
+ *
+ * On a hit the key is returned with @iter still initialized; the caller is
+ * responsible for bch2_trans_iter_exit(). Otherwise @iter is exited here and
+ * the result is bkey_s_c_null, or an error key if the iteration set an error.
+ */
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos pos,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_upto_norestart(trans, *iter, btree,
+ bpos_successor(pos),
+ SPOS(pos.inode, pos.offset, U32_MAX),
+ flags|BTREE_ITER_all_snapshots, k, ret)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+ return k;
+
+ /* no match (or error): the iterator is not handed back to the caller */
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+/*
+ * Like bch2_bkey_get_iter_snapshot_parent() on BTREE_ID_inodes, but keeps
+ * searching past keys that are not inodes (per bkey_is_inode()).
+ *
+ * Returns the first inode key found (with @iter still initialized), a null
+ * key if there is none, or an error key.
+ */
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+				    struct bpos pos, unsigned flags)
+{
+	struct bkey_s_c k;
+
+	for (;;) {
+		k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+		if (!k.k ||
+		    bkey_err(k) ||
+		    bkey_is_inode(k.k))
+			return k;
+
+		/*
+		 * Not an inode: resume the search from this key. Copy the
+		 * position out before releasing the iterator the key was
+		 * obtained through, as delete_ancestor_snapshot_inodes() does.
+		 */
+		pos = k.k->p;
+		bch2_trans_iter_exit(trans, iter);
+	}
+}
+
+/*
+ * Scan the inodes btree for keys with the same offset as @pos, from
+ * POS(0, pos.offset) up to the position just before @pos, across all
+ * snapshots and including pending updates in this transaction.
+ *
+ * Returns 1 if any of them is an inode key whose snapshot satisfies
+ * bch2_snapshot_is_ancestor(c, key snapshot, pos.snapshot), 0 if none does,
+ * or whatever error the iteration left in ret.
+ */
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_upto_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_with_updates, k, ret)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+ bkey_is_inode(k.k)) {
+ /* one match is enough */
+ ret = 1;
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Make BCH_INODE_has_child_snapshot in inode key @k agree with @have_child,
+ * modifying @k in place.
+ *
+ * When asked to clear the flag (@have_child == false) the btree is consulted
+ * first: if bch2_inode_has_child_snapshots() reports a child, the flag is left
+ * alone and 0 is returned; if it reports an error, that error is returned.
+ */
+static int update_inode_has_children(struct btree_trans *trans,
+				     struct bkey_s k,
+				     bool have_child)
+{
+	if (!have_child) {
+		int has = bch2_inode_has_child_snapshots(trans, k.k->p);
+
+		if (has < 0)
+			return has;
+		if (has)
+			return 0;
+	}
+
+	u64 flags = bkey_inode_flags(k.s_c);
+	bool flag_set = (flags & BCH_INODE_has_child_snapshot) != 0;
+
+	if (flag_set != have_child)
+		bkey_inode_flags_set(k, flags ^ BCH_INODE_has_child_snapshot);
+
+	return 0;
+}
+
+/*
+ * Find the nearest inode for @pos in an ancestor snapshot (looking through
+ * pending updates as well) and make its BCH_INODE_has_child_snapshot flag
+ * agree with @have_child.
+ *
+ * Returns 0 if there is no such inode or nothing needed changing, or a
+ * negative error. When clearing, the flag is left set if
+ * bch2_inode_has_child_snapshots() still reports a child for that inode.
+ */
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+ bool have_child)
{
- return bkey_inode_flags(k) & BCH_INODE_unlinked;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+ &iter, pos, BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ if (!have_child) {
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret) {
+ /* positive means "still has a child": not an error, nothing to do */
+ ret = ret < 0 ? ret : 0;
+ goto err;
+ }
+ }
+
+ u64 f = bkey_inode_flags(k);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+ struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
int bch2_trigger_inode(struct btree_trans *trans,
@@ -600,6 +723,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -613,13 +738,41 @@ int bch2_trigger_inode(struct btree_trans *trans,
return ret;
}
- int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) -
- (int) bkey_is_deleted_inode(old);
- if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, deleted_delta > 0);
- if (ret)
- return ret;
+ if (flags & BTREE_TRIGGER_transactional) {
+ int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
+ (int) bkey_is_unlinked_inode(old);
+ if (unlinked_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, unlinked_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If we're creating or deleting an inode at this snapshot ID,
+ * and there might be an inode in a parent snapshot ID, we might
+ * need to set or clear the has_child_snapshot flag on the
+ * parent.
+ */
+ int deleted_delta = (int) bkey_is_inode(new.k) -
+ (int) bkey_is_inode(old.k);
+ if (deleted_delta &&
+ bch2_snapshot_parent(c, new.k->p.snapshot)) {
+ int ret = update_parent_inode_has_children(trans, new.k->p,
+ deleted_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * When an inode is first updated in a new snapshot, we may need
+ * to clear has_child_snapshot
+ */
+ if (deleted_delta > 0) {
+ int ret = update_inode_has_children(trans, new, false);
+ if (ret)
+ return ret;
+ }
}
return 0;
@@ -653,10 +806,8 @@ void bch2_inode_init_early(struct bch_fs *c,
memset(inode_u, 0, sizeof(*inode_u));
- /* ick */
- inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
- get_random_bytes(&inode_u->bi_hash_seed,
- sizeof(inode_u->bi_hash_seed));
+ SET_INODE_STR_HASH(inode_u, str_hash);
+ get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
@@ -902,6 +1053,11 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (ret)
+ goto err2;
+
+ ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
bch2_trans_put(trans);
return ret;
}
@@ -935,8 +1091,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(trans, inum, inode));
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
@@ -1006,7 +1161,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
return 0;
}
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
@@ -1069,6 +1224,45 @@ err:
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ *
+ * Starting from @pos, repeatedly look up the next inode in an ancestor
+ * snapshot and remove it with __bch2_inode_rm_snapshot(). The walk stops, with
+ * 0, at the first ancestor that is missing, is not unlinked (per
+ * bkey_is_unlinked_inode()), or for which bch2_inode_or_descendents_is_open()
+ * returns a positive value. Negative errors from any step are returned.
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+next_parent:
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+ if (ret || !k.k)
+ return ret;
+
+ /* take what we need from k before the iterator is released */
+ bool unlinked = bkey_is_unlinked_inode(k);
+ pos = k.k->p;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!unlinked)
+ return 0;
+
+ ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+ if (ret)
+ return ret;
+ /* pos now names the ancestor just removed; continue from there */
+ goto next_parent;
+}
+
+/*
+ * Remove inode @inum in @snapshot via __bch2_inode_rm_snapshot() and, if that
+ * returned 0, continue with delete_ancestor_snapshot_inodes() from the same
+ * position. Returns the first nonzero result, else 0.
+ */
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+	int ret = __bch2_inode_rm_snapshot(trans, inum, snapshot);
+
+	if (ret)
+		return ret;
+
+	return delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos pos,
@@ -1078,6 +1272,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
int ret;
k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
@@ -1113,6 +1308,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot))
goto delete;
+ if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ trans, deleted_inode_has_child_snapshots,
+ "inode with child snapshots %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto out;
+
+ if (ret) {
+ if (fsck_err(trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be set)\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf))) {
+ inode.bi_flags |= BCH_INODE_has_child_snapshot;
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto out;
+ }
+ goto delete;
+
+ }
+
if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
@@ -1121,33 +1341,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
- struct bpos new_min_pos;
-
- ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
- if (ret)
- goto out;
-
- inode.bi_flags &= ~BCH_INODE_unlinked;
-
- ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_internal_snapshot_node);
- bch_err_msg(c, ret, "clearing inode unlinked flag");
- if (ret)
- goto out;
-
- /*
- * We'll need another write buffer flush to pick up the new
- * unlinked inodes in the snapshot leaves:
- */
- *need_another_pass = true;
- goto out;
- }
-
ret = 1;
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
return ret;
delete:
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 695abd707cb6..eab82b5eb897 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -5,6 +5,7 @@
#include "bkey.h"
#include "bkey_methods.h"
#include "opts.h"
+#include "snapshot.h"
enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+/*
+ * Fast-path wrapper around __bch2_inode_has_child_snapshots(): returns 0
+ * without touching the btree when bch2_snapshot_is_leaf() gives a positive
+ * result for @pos.snapshot; any result <= 0 falls through to the full scan.
+ */
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+	if (bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
+		return 0;
+
+	return __bch2_inode_has_child_snapshots(trans, pos);
+}
+
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
@@ -82,6 +92,7 @@ struct bch_inode_unpacked {
BCH_INODE_FIELDS_v3()
#undef x
};
+BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
struct bkey_inode_buf {
struct bkey_i_inode_v3 inode;
@@ -97,10 +108,26 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned);
+int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
+
+/* __bch2_inode_peek() with its final (warn) argument fixed to false. */
+static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 struct bch_inode_unpacked *inode,
+					 subvol_inum inum, unsigned flags)
+{
+	const bool warn = false;
+
+	return __bch2_inode_peek(trans, iter, inode, inum, flags, warn);
+}
+
+/*
+ * __bch2_inode_peek() with its final (warn) argument fixed to true; see
+ * bch2_inode_peek_nowarn() for the variant that passes false.
+ */
+static inline int bch2_inode_peek(struct btree_trans *trans,
+				  struct btree_iter *iter,
+				  struct bch_inode_unpacked *inode,
+				  subvol_inum inum, unsigned flags)
+{
+	return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
+}
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
@@ -112,8 +139,8 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 83d107331edf..7928d0c6954f 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -133,7 +133,8 @@ enum inode_opt_id {
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
- x(backptr_untrusted, 8)
+ x(backptr_untrusted, 8) \
+ x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */
@@ -149,9 +150,9 @@ enum __bch_inode_flags {
#undef x
};
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 177ed331c00b..f283051758d6 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -224,13 +224,14 @@ void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, str
static int truncate_set_isize(struct btree_trans *trans,
subvol_inum inum,
- u64 new_i_size)
+ u64 new_i_size,
+ bool warn)
{
struct btree_iter iter = { NULL };
struct bch_inode_unpacked inode_u;
int ret;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?:
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
(inode_u.bi_size = new_i_size, 0) ?:
bch2_inode_write(trans, &iter, &inode_u);
@@ -247,10 +248,11 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+ bool warn_errors = i_sectors_delta != NULL;
int ret;
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- truncate_set_isize(trans, inum, new_i_size));
+ truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
if (ret)
goto err;
@@ -263,8 +265,8 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
err:
- bch2_logged_op_finish(trans, op_k);
- bch_err_fn(c, ret);
+ if (warn_errors)
+ bch_err_fn(c, ret);
return ret;
}
@@ -288,9 +290,14 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec
* resume only proceeding in one of the snapshots
*/
down_read(&c->snapshot_create_lock);
- int ret = bch2_trans_run(c,
- bch2_logged_op_start(trans, &op.k_i) ?:
- __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
up_read(&c->snapshot_create_lock);
return ret;
@@ -308,7 +315,8 @@ void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, stru
prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
}
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
+ u64 offset, s64 len, bool warn)
{
struct btree_iter iter;
struct bch_inode_unpacked inode_u;
@@ -317,7 +325,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset
offset <<= 9;
len <<= 9;
- ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
+ ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
if (ret)
return ret;
@@ -357,12 +365,22 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
u64 len = abs(shift);
u64 pos = le64_to_cpu(op->v.pos);
bool insert = shift > 0;
+ u32 snapshot;
+ bool warn_errors = i_sectors_delta != NULL;
int ret = 0;
ret = bch2_inum_opts_get(trans, inum, &opts);
if (ret)
return ret;
+ /*
+ * check for missing subvolume before fpunch, as in resume we don't want
+ * it to be a fatal error
+ */
+ ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
+ if (ret)
+ return ret;
+
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_intent);
@@ -373,7 +391,7 @@ case LOGGED_OP_FINSERT_start:
if (insert) {
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, len) ?:
+ adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
goto err;
@@ -396,11 +414,11 @@ case LOGGED_OP_FINSERT_shift_extents:
struct bkey_i delete, *copy;
struct bkey_s_c k;
struct bpos src_pos = POS(inum.inum, src_offset);
- u32 snapshot;
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
+ warn_errors);
if (ret)
goto btree_err;
@@ -463,12 +481,12 @@ btree_err:
if (!insert) {
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, shift) ?:
+ adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, 0, 0) ?:
+ adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
bch2_logged_op_update(trans, &op->k_i));
}
@@ -477,9 +495,9 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
- bch_err_fn(c, ret);
- bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
+ if (warn_errors)
+ bch_err_fn(c, ret);
return ret;
}
@@ -508,9 +526,14 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
* resume only proceeding in one of the snapshots
*/
down_read(&c->snapshot_create_lock);
- int ret = bch2_trans_run(c,
- bch2_logged_op_start(trans, &op.k_i) ?:
- __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = bch2_logged_op_start(trans, &op.k_i);
+ if (ret)
+ goto out;
+ ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
+ ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+ bch2_trans_put(trans);
up_read(&c->snapshot_create_lock);
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index e4fc17c548fd..fc246f342820 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -409,8 +409,8 @@ retry:
bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
goto err;
bch2_bkey_buf_reassemble(&sk, c, k);
@@ -557,8 +557,8 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
}
/* Inner part that may run in process context */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index b5fe9e0dc155..8609e25e450f 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1437,7 +1437,7 @@ again:
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
@@ -1447,7 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- &op->cl, &wp));
+ &op->cl, &wp)));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index f5f7db50ca31..2dc0d60c1745 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
{
int ret;
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ HZ * 10))
+ return ret;
+
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct printbuf buf = PRINTBUF;
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+ buf.buf);
+ printbuf_exit(&buf);
+
closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK));
@@ -745,7 +758,7 @@ out:
return ret;
}
-int bch2_journal_flush_seq(struct journal *j, u64 seq)
+int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
{
u64 start_time = local_clock();
int ret, ret2;
@@ -756,7 +769,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
if (seq <= j->flushed_seq_ondisk)
return 0;
- ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+ ret = wait_event_state(j->wait,
+ (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
+ task_state);
if (!ret)
bch2_time_stats_update(j->flush_seq_time, start_time);
@@ -775,7 +790,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
int bch2_journal_flush(struct journal *j)
{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
}
/*
@@ -838,7 +853,7 @@ int bch2_journal_meta(struct journal *j)
bch2_journal_res_put(j, &res);
- return bch2_journal_flush_seq(j, res.seq);
+ return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
}
/* block/unlock the journal: */
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 377a3750406e..2762be6f9814 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -401,7 +401,7 @@ void bch2_journal_entry_res_resize(struct journal *,
int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 6f4a4e1083c9..60e00702d1a4 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -34,8 +34,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
- struct bkey_buf sk;
u32 restart_count = trans->restart_count;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -46,13 +44,15 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf));
- if (!fn)
- return 0;
-
+ struct bkey_buf sk;
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- fn->resume(trans, sk.k);
+ const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
+ if (fn)
+ fn->resume(trans, sk.k);
+
+ ret = bch2_logged_op_finish(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
fsck_err:
@@ -93,7 +93,7 @@ int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
__bch2_logged_op_start(trans, k));
}
-void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
@@ -113,4 +113,6 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
buf.buf, bch2_err_str(ret));
printbuf_exit(&buf);
}
+
+ return ret;
}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
index 4d1e786a27a8..30ae9ef737dd 100644
--- a/fs/bcachefs/logged_ops.h
+++ b/fs/bcachefs/logged_ops.h
@@ -15,6 +15,6 @@ static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i
int bch2_resume_logged_ops(struct bch_fs *);
int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 96f2f4f8c397..10857eccdeaf 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
+#include "bkey_buf.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -118,7 +119,7 @@ fsck_err:
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
- struct bpos *last_flushed_pos)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -132,11 +133,13 @@ static int bch2_check_lru_key(struct btree_trans *trans,
u64 idx;
int ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos),
+ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
+
+ if (fsck_err_on(!ca,
trans, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
- return bch2_btree_delete_at(trans, lru_iter, 0);
+ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
ret = bkey_err(k);
@@ -150,18 +153,15 @@ static int bch2_check_lru_key(struct btree_trans *trans,
idx = alloc_lru_idx_read(*a);
break;
case BCH_LRU_fragmentation:
- idx = a->fragmentation_lru;
+ idx = alloc_lru_idx_fragmentation(*a, ca);
break;
}
if (lru_k.k->type != KEY_TYPE_set ||
lru_pos_time(lru_k.k->p) != idx) {
- if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
- *last_flushed_pos = lru_k.k->p;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
+ if (ret)
+ goto err;
if (fsck_err(trans, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
@@ -171,12 +171,12 @@ static int bch2_check_lru_key(struct btree_trans *trans,
lru_pos_time(lru_k.k->p),
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
- ret = bch2_btree_delete_at(trans, lru_iter, 0);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
}
-out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
+ bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
@@ -184,12 +184,18 @@ fsck_err:
int bch2_check_lrus(struct bch_fs *c)
{
- struct bpos last_flushed_pos = POS_MIN;
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
- bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+ bch2_check_lru_key(trans, &iter, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 7d3920e03742..8c456d8b8b99 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -692,7 +692,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
a = bch2_alloc_to_v4(k, &a_convert);
dirty_sectors = bch2_bucket_sectors_dirty(*a);
bucket_size = ca->mi.bucket_size;
- fragmentation = a->fragmentation_lru;
+ fragmentation = alloc_lru_idx_fragmentation(*a, ca);
ret = bch2_btree_write_buffer_tryflush(trans);
bch_err_msg(c, ret, "flushing btree write buffer");
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index d86565bf07c8..d658be90f737 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -73,6 +73,7 @@ move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 _a;
@@ -90,14 +91,19 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (ret)
return ret;
+ struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+ if (!ca)
+ goto out;
+
a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- ret = data_type_movable(a->data_type) &&
- a->fragmentation_lru &&
- a->fragmentation_lru <= time;
+ ret = lru_idx && lru_idx <= time;
+ bch2_dev_put(ca);
+out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 232be8a44051..6673cbd8bdb9 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -63,7 +63,7 @@ const char * const bch2_compression_opts[] = {
NULL
};
-const char * const bch2_str_hash_types[] = {
+const char * const __bch2_str_hash_types[] = {
BCH_STR_HASH_TYPES()
NULL
};
@@ -115,6 +115,7 @@ PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
@@ -427,7 +428,9 @@ void bch2_opt_to_text(struct printbuf *out,
prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
- if (flags & OPT_SHOW_FULL_LIST)
+ if (v < opt->min || v >= opt->max - 1)
+ prt_printf(out, "(invalid option %lli)", v);
+ else if (flags & OPT_SHOW_FULL_LIST)
prt_string_option(out, opt->choices, v);
else
prt_str(out, opt->choices[v]);
@@ -594,6 +597,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ if (!*opt)
+ continue;
+
name = strsep(&opt, "=");
val = opt;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index cb2e244a2429..23dda014e331 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
-extern const char * const bch2_str_hash_types[];
+extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
@@ -29,6 +29,7 @@ void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index c32a05e252e2..74f45a8162ad 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -869,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 2d299a37cf07..cd6647374353 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -70,7 +70,9 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 6db72d3bad7d..454b5a32dd7f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -287,7 +287,8 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_skip_accounting_apply|
- BCH_TRANS_COMMIT_no_journal_res,
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_WATERMARK_reclaim,
bch2_journal_replay_accounting_key(trans, k));
if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
goto err;
@@ -1090,7 +1091,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_init_early(c, &lostfound_inode);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_create_trans(trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 50406ce0e4ef..9d96c06e365c 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -46,6 +46,7 @@
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index bcb3276747e0..477ef0997949 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
-static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
@@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_fs *c,
struct printbuf *err)
{
- mutex_lock(&c->sb_lock);
- int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
- mutex_unlock(&c->sb_lock);
- return ret;
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate_locked(e, sb, err);
+ int ret = bch2_replicas_entry_sb_validate(e, sb, err);
if (ret)
return ret;
@@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
rcu_read_lock();
for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
+
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 5102059a0f1d..ae715ff658e8 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -78,7 +78,10 @@
BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch)
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(inode_has_child_snapshots, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index ed5dca5e1161..937275d061fe 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -115,8 +115,8 @@ enum bch_fsck_flags {
x(alloc_key_data_type_inconsistency, 101, 0) \
x(alloc_key_to_missing_dev_bucket, 102, 0) \
x(alloc_key_cached_inconsistency, 103, 0) \
- x(alloc_key_cached_but_read_time_zero, 104, 0) \
- x(alloc_key_to_missing_lru_entry, 105, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
+ x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
@@ -129,20 +129,20 @@ enum bch_fsck_flags {
x(freespace_key_wrong, 115, 0) \
x(freespace_hole_missing, 116, 0) \
x(bucket_gens_val_size_bad, 117, 0) \
- x(bucket_gens_key_wrong, 118, 0) \
- x(bucket_gens_hole_wrong, 119, 0) \
- x(bucket_gens_to_invalid_dev, 120, 0) \
- x(bucket_gens_to_invalid_buckets, 121, 0) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \
+ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
+ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
+ x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
x(need_discard_freespace_key_bad, 124, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
x(backpointer_to_missing_device, 126, 0) \
x(backpointer_to_missing_alloc, 127, 0) \
x(backpointer_to_missing_ptr, 128, 0) \
- x(lru_entry_at_time_0, 129, 0) \
- x(lru_entry_to_invalid_bucket, 130, 0) \
- x(lru_entry_bad, 131, 0) \
+ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
+ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
+ x(lru_entry_bad, 131, FSCK_AUTOFIX) \
x(btree_ptr_val_too_big, 132, 0) \
x(btree_ptr_v2_val_too_big, 133, 0) \
x(btree_ptr_has_non_ptr, 134, 0) \
@@ -158,9 +158,9 @@ enum bch_fsck_flags {
x(ptr_after_last_bucket, 144, 0) \
x(ptr_before_first_bucket, 145, 0) \
x(ptr_spans_multiple_buckets, 146, 0) \
- x(ptr_to_missing_backpointer, 147, 0) \
- x(ptr_to_missing_alloc_key, 148, 0) \
- x(ptr_to_missing_replicas_entry, 149, 0) \
+ x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
+ x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
+ x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
x(ptr_to_missing_stripe, 150, 0) \
x(ptr_to_incorrect_stripe, 151, 0) \
x(ptr_gen_newer_than_bucket_gen, 152, 0) \
@@ -180,6 +180,7 @@ enum bch_fsck_flags {
x(reflink_p_to_missing_reflink_v, 166, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
+ x(stripe_csum_granularity_bad, 290, 0) \
x(stripe_sector_count_wrong, 169, 0) \
x(snapshot_tree_pos_bad, 170, 0) \
x(snapshot_tree_to_missing_snapshot, 171, 0) \
@@ -194,7 +195,7 @@ enum bch_fsck_flags {
x(snapshot_skiplist_not_normalized, 180, 0) \
x(snapshot_skiplist_bad, 181, 0) \
x(snapshot_should_not_have_subvol, 182, 0) \
- x(snapshot_to_bad_snapshot_tree, 183, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
x(snapshot_bad_depth, 184, 0) \
x(snapshot_bad_skiplist, 185, 0) \
x(subvol_pos_bad, 186, 0) \
@@ -211,6 +212,7 @@ enum bch_fsck_flags {
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
+ x(inode_unlinked_but_has_dirent, 285, 0) \
x(inode_checksum_type_invalid, 199, 0) \
x(inode_compression_type_invalid, 200, 0) \
x(inode_subvol_root_but_not_dir, 201, 0) \
@@ -219,14 +221,18 @@ enum bch_fsck_flags {
x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
+ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_has_child_snapshots_wrong, 287, 0) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
+ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \
@@ -261,8 +267,8 @@ enum bch_fsck_flags {
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, 0) \
- x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
+ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
x(inode_bi_parent_nonzero, 251, 0) \
x(dirent_to_missing_parent_subvol, 252, 0) \
x(dirent_not_visible_in_parent_subvol, 253, 0) \
@@ -286,6 +292,7 @@ enum bch_fsck_flags {
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
+ x(accounting_to_invalid_device, 289, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
@@ -295,7 +302,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(MAX, 284, 0)
+ x(MAX, 291, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 02bcde3c1b02..fb08dd680dac 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (m.btree_bitmap_shift >= 64) {
+ prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out,
prt_newline(out);
prt_printf(out, "Btree allocated bitmap blocksize:\t");
- prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ if (m.btree_bitmap_shift < 64)
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ else
+ prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
prt_newline(out);
prt_printf(out, "Btree allocated bitmap:\t");
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
index dc1a27cc31cd..a1cc44e66c7e 100644
--- a/fs/bcachefs/siphash.c
+++ b/fs/bcachefs/siphash.c
@@ -45,7 +45,7 @@
*/
#include <asm/byteorder.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/bitops.h>
#include <linux/string.h>
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 1809442b00ee..ae57638506c3 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
if (bch2_snapshot_equiv(c, id))
return 0;
- /* 0 is an invalid tree ID */
+ /* Do we need to reconstruct the snapshot_tree entry as well? */
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
u32 tree_id = 0;
- int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ tree_id = k.k->p.offset;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
if (ret)
return ret;
+ if (!tree_id) {
+ ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+ }
+
struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
ret = PTR_ERR_OR_ZERO(snapshot);
if (ret)
@@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
snapshot->v.tree = cpu_to_le32(tree_id);
snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
+ SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
@@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return ret;
}
-static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
-
- return s->children[1] ?: s->children[0];
-}
-
-static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
-{
- u32 child;
-
- while ((child = bch2_snapshot_smallest_child(c, id)))
- id = child;
- return id;
-}
-
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c interior_k,
- u32 leaf_id, struct bpos *new_min_pos)
-{
- struct btree_iter iter;
- struct bpos pos = interior_k.k->p;
- struct bkey_s_c k;
- struct bkey_i *new;
- int ret;
-
- pos.snapshot = leaf_id;
-
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- /* key already overwritten in this snapshot? */
- if (k.k->p.snapshot != interior_k.k->p.snapshot)
- goto out;
-
- if (bpos_eq(*new_min_pos, POS_MIN)) {
- *new_min_pos = k.k->p;
- new_min_pos->snapshot = leaf_id;
- }
-
- new = bch2_bkey_make_mut_noupdate(trans, interior_k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- new->k.p.snapshot = leaf_id;
- ret = bch2_trans_update(trans, &iter, new, 0);
-out:
- bch2_set_btree_iter_dontneed(&iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c k,
- struct bpos *new_min_pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf sk;
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- *new_min_pos = POS_MIN;
-
- for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
- id < k.k->p.snapshot;
- id++) {
- if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
- !bch2_snapshot_is_leaf(c, id))
- continue;
-again:
- ret = btree_trans_too_many_iters(trans) ?:
- bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_begin(trans);
- goto again;
- }
-
- if (ret)
- break;
- }
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index eb5ef64221d6..29c94716293e 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
- struct bkey_s_c, struct bpos *);
-
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 215eed4cce6d..ec2b1feea520 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -46,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
- .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS),
+ .type = INODE_STR_HASH(bi),
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -253,19 +252,20 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
- struct btree_iter iter, slot = { NULL };
+ struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
@@ -280,7 +280,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, &iter);
+ bch2_trans_copy_iter(&slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
@@ -290,29 +290,50 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
found:
found = true;
not_found:
-
- if (!found && (flags & STR_HASH_must_replace)) {
+ if (found && (flags & STR_HASH_must_create)) {
+ bch2_trans_iter_exit(trans, &slot);
+ return k;
+ } else if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & STR_HASH_must_create)) {
- ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
- swap(iter, slot);
+ swap(*iter, slot);
- insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, flags);
+ insert->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, insert, flags);
}
goto out;
}
static __always_inline
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
+ snapshot, insert, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (k.k) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_EEXIST_str_hash_set;
+ }
+
+ return 0;
+}
+
+static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@@ -363,8 +384,11 @@ int bch2_hash_delete(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_intent);
- int ret = bkey_err(k) ?:
- bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 6845dde1b339..80e5efaff524 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -102,7 +102,8 @@ static int check_subvol(struct btree_trans *trans,
inode.bi_inum, inode.bi_snapshot,
inode.bi_subvol, subvol.k->p.offset)) {
inode.bi_subvol = subvol.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ ret = __bch2_fsck_write_inode(trans, &inode);
if (ret)
goto err;
}
@@ -318,8 +319,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_subvol_is_ro_trans(trans, subvol));
+ return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
}
int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
@@ -331,8 +331,8 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
}
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid)
+int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid, bool warn)
{
struct btree_iter iter;
struct bkey_s_c_subvolume subvol;
@@ -343,7 +343,8 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
BTREE_ITER_cached|BTREE_ITER_with_updates,
subvolume);
ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+
+ bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
"missing subvolume %u", subvolid);
if (likely(!ret))
@@ -352,6 +353,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
return ret;
}
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid)
+{
+ return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
+}
+
static int bch2_subvolume_reparent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -668,8 +675,8 @@ err:
/* set bi_subvol on root inode */
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index e62f876541fe..f897d106e142 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -26,6 +26,8 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
+int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
+ u32 *, bool);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 873e4be7e1dc..657fd3759e7b 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock);
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
@@ -620,9 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
up_write(&c->state_lock);
for_each_member_device(c, ca)
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -1187,9 +1186,7 @@ static void bch2_dev_free(struct bch_dev *ca)
{
cancel_work_sync(&ca->io_error_work);
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
@@ -1226,10 +1223,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->io_ref_completion);
- if (ca->kobj.state_in_sysfs) {
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
+ bch2_dev_unlink(ca);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
@@ -1251,6 +1245,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
complete(&ca->io_ref_completion);
}
+static void bch2_dev_unlink(struct bch_dev *ca)
+{
+ struct kobject *b;
+
+ /*
+ * This is racy w.r.t. the underlying block device being hot-removed,
+ * which removes it from sysfs.
+ *
+ * It'd be lovely if we had a way to handle this race, but the sysfs
+ * code doesn't appear to provide a good method and block/holder.c is
+ * susceptible as well:
+ */
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev &&
+ (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
+ sysfs_remove_link(b, "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+}
+
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
int ret;
@@ -1958,7 +1972,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
};
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
- ret = bch2_trans_do(ca->fs, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b2f209743afe..315038a0a92d 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -450,7 +450,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
k.k_i.k.p.snapshot = snapid;
k.k_i.k.size = len;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
@@ -510,7 +510,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
if (ret)
return ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_snapshot_node_create(trans, U32_MAX,
snapids,
snapid_subvols,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 42f565c76181..e0a876cbaa6b 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -222,7 +222,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[])
break;
}
- ret |= 1 << flag;
+ ret |= BIT_ULL(flag);
}
kfree(d);
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index a9ebcd82c602..6a78553d9b0c 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -3,7 +3,7 @@
#include <linux/bitops.h>
#include <linux/math.h>
#include <linux/string.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#ifdef CONFIG_VALGRIND
#include <valgrind/memcheck.h>
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 56c8d3fe55a4..952aca400faf 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -330,7 +330,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c, NULL, NULL, 0,
+ int ret = bch2_trans_do(c,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
if (ret < 0 && bch2_err_matches(ret, ENOENT))