path: root/fs/bcachefs/journal_seq_blacklist.c
Diffstat (limited to 'fs/bcachefs/journal_seq_blacklist.c')
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c  360
1 file changed, 360 insertions, 0 deletions
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..c26f36d58633
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) the btree code continues to ignore that bset
+ * until that btree node is rewritten.
+ *
+ * Blacklisted journal sequence numbers are themselves recorded as entries in
+ * the journal.
+ */
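+
+/*
+ * The flow through this file, roughly:
+ *
+ *   bch2_journal_seq_should_ignore()   - the btree read path asks whether a
+ *                                        bset's journal seq is in the future
+ *   bch2_journal_seq_blacklist_write() - emits blacklist entries into the
+ *                                        next journal entry to be written
+ *   bch2_journal_seq_blacklist_read()  - on startup, recreates the in-memory
+ *                                        entries from the journal
+ *   journal_seq_blacklist_flush()      - rewrites the affected btree nodes so
+ *                                        the journal pin can be dropped
+ */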
+
+/*
+ * Called when the journal needs to evict a blacklist entry to reclaim space:
+ * find any btree nodes that refer to the blacklisted journal sequence numbers,
+ * and rewrite them:
+ */
+static void journal_seq_blacklist_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c =
+ container_of(j, struct bch_fs, journal);
+ struct journal_seq_blacklist *bl =
+ container_of(pin, struct journal_seq_blacklist, pin);
+ struct blacklisted_node n;
+ struct closure cl;
+ unsigned i;
+ int ret;
+
+ closure_init_stack(&cl);
+
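+ /*
+ * First pass: rewrite any btree nodes that still carry a blacklisted
+ * journal seq. Each entry is copied out under blacklist_lock, which is
+ * dropped before taking btree locks:
+ */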
+ for (i = 0;; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+
+ __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
+ 0, 0, BTREE_ITER_NODES);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ /* The node might have already been rewritten: */
+
+ if (b->data->keys.seq == n.seq) {
+ ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
+ if (ret) {
+ bch2_btree_iter_unlock(&iter);
+ bch2_fs_fatal_error(c,
+ "error %i rewriting btree node with blacklisted journal seq",
+ ret);
+ bch2_journal_halt(j);
+ return;
+ }
+ }
+
+ bch2_btree_iter_unlock(&iter);
+ }
+
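+ /*
+ * Second pass: the rewrites above freed the old nodes via interior node
+ * updates - wait for any still-pending frees of those nodes to complete:
+ */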
+ for (i = 0;; i++) {
+ struct btree_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&j->blacklist_lock);
+ if (i >= bl->nr_entries) {
+ mutex_unlock(&j->blacklist_lock);
+ break;
+ }
+ n = bl->entries[i];
+ mutex_unlock(&j->blacklist_lock);
+redo_wait:
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Is the node on the list of pending interior node updates -
+ * being freed? If so, wait for that to finish:
+ */
+ for_each_pending_btree_node_free(c, as, d)
+ if (n.seq == d->seq &&
+ n.btree_id == d->btree_id &&
+ !d->level &&
+ !bkey_cmp(n.pos, d->key.k.p)) {
+ closure_wait(&as->wait, &cl);
+ mutex_unlock(&c->btree_interior_update_lock);
+ closure_sync(&cl);
+ goto redo_wait;
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+ }
+
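+ /*
+ * Every node that referenced a blacklisted seq has been rewritten: drop
+ * the journal pin so the journal can reclaim the blacklist entry, and
+ * free the in-memory state:
+ */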
+ mutex_lock(&j->blacklist_lock);
+
+ bch2_journal_pin_drop(j, &bl->pin);
+ list_del(&bl->list);
+ kfree(bl->entries);
+ kfree(bl);
+
+ mutex_unlock(&j->blacklist_lock);
+}
+
+/*
+ * Determine if a particular sequence number is blacklisted - if so, return the
+ * blacklist entry:
+ */
+struct journal_seq_blacklist *
+bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ list_for_each_entry(bl, &j->seq_blacklist, list)
+ if (seq >= bl->start && seq <= bl->end)
+ return bl;
+
+ return NULL;
+}
+
+/*
+ * Allocate a new, in-memory blacklist entry:
+ */
+static struct journal_seq_blacklist *
+bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
+{
+ struct journal_seq_blacklist *bl;
+
+ lockdep_assert_held(&j->blacklist_lock);
+
+ /*
+ * When we start the journal, bch2_journal_start() will skip over the
+ * blacklisted range [@start, @end]:
+ */
+
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return NULL;
+
+ bl->start = start;
+ bl->end = end;
+
+ list_add_tail(&bl->list, &j->seq_blacklist);
+ return bl;
+}
+
+/*
+ * Returns 1 if the data corresponding to @seq should be ignored: either @seq
+ * is newer than the most recent journal entry that was written, or @seq was
+ * blacklisted on a previous mount - also marks @seq as blacklisted so that on
+ * future restarts the corresponding data will still be ignored. Returns 0
+ * otherwise, or -ENOMEM on allocation failure:
+ */
+int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+{
+ struct journal *j = &c->journal;
+ struct journal_seq_blacklist *bl = NULL;
+ struct blacklisted_node *n;
+ u64 journal_seq;
+ int ret = 0;
+
+ if (!seq)
+ return 0;
+
+ spin_lock(&j->lock);
+ journal_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ /* Interior node updates aren't journalled: */
+ BUG_ON(b->level);
+ BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
+
+ /*
+ * Decrease this back to j->seq + 2 when we next rev the on-disk format;
+ * it was increased temporarily to work around a bug in old kernels.
+ */
+ fsck_err_on(seq > journal_seq + 4, c,
+ "bset journal seq too far in the future: %llu > %llu",
+ seq, journal_seq);
+
+ if (seq <= journal_seq &&
+ list_empty_careful(&j->seq_blacklist))
+ return 0;
+
+ mutex_lock(&j->blacklist_lock);
+
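+ /*
+ * A seq at or before the newest journal entry is only ignored if it was
+ * blacklisted on a previous mount; a seq in the future is added to the
+ * blacklist that will go out with the next journal write:
+ */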
+ if (seq <= journal_seq) {
+ bl = bch2_journal_seq_blacklist_find(j, seq);
+ if (!bl)
+ goto out;
+ } else {
+ bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
+ b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
+
+ if (!j->new_blacklist) {
+ j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
+ journal_seq + 1,
+ journal_seq + 1);
+ if (!j->new_blacklist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ bl = j->new_blacklist;
+ bl->end = max(bl->end, seq);
+ }
+
+ for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
+ if (b->data->keys.seq == n->seq &&
+ b->btree_id == n->btree_id &&
+ !bkey_cmp(b->key.k.p, n->pos))
+ goto found_entry;
+
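+ /*
+ * Grow entries[] when nr_entries hits a power of two: capacity doubles,
+ * with a minimum of 8 slots:
+ */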
+ if (!bl->nr_entries ||
+ is_power_of_2(bl->nr_entries)) {
+ n = krealloc(bl->entries,
+ max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
+ GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ bl->entries = n;
+ }
+
+ bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
+ .seq = b->data->keys.seq,
+ .btree_id = b->btree_id,
+ .pos = b->key.k.p,
+ };
+found_entry:
+ ret = 1;
+out:
+fsck_err:
+ mutex_unlock(&j->blacklist_lock);
+ return ret;
+}
+
+static int __bch2_journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_seq_blacklist *bl;
+
+ bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
+ start, end);
+
+ bl = bch2_journal_seq_blacklisted_new(j, start, end);
+ if (!bl)
+ return -ENOMEM;
+
+ bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
+ journal_seq_blacklist_flush);
+ return 0;
+}
+
+/*
+ * After reading the journal, find existing journal seq blacklist entries and
+ * read them into memory:
+ */
+int bch2_journal_seq_blacklist_read(struct journal *j,
+ struct journal_replay *i)
+{
+ struct jset_entry *entry;
+ int ret = 0;
+
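+ /*
+ * A v1 blacklist entry records a single blacklisted seq; v2 entries
+ * record an inclusive [start, end] range:
+ */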
+ vstruct_for_each(&i->j, entry) {
+ switch (entry->type) {
+ case BCH_JSET_ENTRY_blacklist: {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ ret = __bch2_journal_seq_blacklist_read(j, i,
+ le64_to_cpu(bl_entry->seq),
+ le64_to_cpu(bl_entry->seq));
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist_v2: {
+ struct jset_entry_blacklist_v2 *bl_entry =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ ret = __bch2_journal_seq_blacklist_read(j, i,
+ le64_to_cpu(bl_entry->start),
+ le64_to_cpu(bl_entry->end));
+ break;
+ }
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * After reading the journal and walking the btree, we might have new journal
+ * sequence numbers to blacklist - add entries to the next journal entry to be
+ * written:
+ */
+void bch2_journal_seq_blacklist_write(struct journal *j)
+{
+ struct journal_seq_blacklist *bl = j->new_blacklist;
+ struct jset_entry_blacklist_v2 *bl_entry;
+ struct jset_entry *entry;
+
+ if (!bl)
+ return;
+
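+ /*
+ * The size argument is the entry payload size in u64s, not counting the
+ * jset_entry header:
+ */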
+ entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
+ (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
+
+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+ bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
+ bl_entry->start = cpu_to_le64(bl->start);
+ bl_entry->end = cpu_to_le64(bl->end);
+
+ bch2_journal_pin_add(j,
+ journal_cur_seq(j),
+ &bl->pin,
+ journal_seq_blacklist_flush);
+
+ j->new_blacklist = NULL;
+}