123 files changed, 4632 insertions, 4324 deletions
diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst
new file mode 100644
index 000000000000..0c45829a4899
--- /dev/null
+++ b/Documentation/filesystems/bcachefs/CodingStyle.rst
@@ -0,0 +1,186 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+bcachefs coding style
+=====================
+
+Good development is like gardening, and codebases are our gardens. Tend to them
+every day; look for little things that are out of place or in need of tidying.
+A little weeding here and there goes a long way; don't wait until things have
+spiraled out of control.
+
+Things don't always have to be perfect - nitpicking often does more harm than
+good. But appreciate beauty when you see it - and let people know.
+
+The code that you are afraid to touch is the code most in need of refactoring.
+
+A little organizing here and there goes a long way.
+
+Put real thought into how you organize things.
+
+Good code is readable code, where the structure is simple and leaves nowhere
+for bugs to hide.
+
+Assertions are one of our most important tools for writing reliable code. If in
+the course of writing a patchset you encounter a condition that shouldn't
+happen (and will have unpredictable or undefined behaviour if it does), or
+you're not sure if it can happen and not sure how to handle it yet - make it a
+BUG_ON(). Don't leave undefined or unspecified behavior lurking in the codebase.
+
+By the time you finish the patchset, you should understand better which
+assertions need to be handled and turned into checks with error paths, and
+which should be logically impossible. Leave the BUG_ON()s in for the ones which
+are logically impossible. (Or, make them debug mode assertions if they're
+expensive - but don't turn everything into a debug mode assertion, so that
+we're not stuck debugging undefined behaviour should it turn out that you were
+wrong.)
+
+Assertions are documentation that can't go out of date. Good assertions are
+wonderful.
+
+Good assertions drastically and dramatically reduce the amount of testing
+required to shake out bugs.
+
+Good assertions are based on state, not logic. To write good assertions, you
+have to think about what the invariants on your state are.
+
+Good invariants and assertions will hold everywhere in your codebase. This
+means that you can run them in only a few places in the checked-in version, but
+should you need to debug something that caused the assertion to fail, you can
+quickly shotgun them everywhere to find the codepath that broke the invariant.
+
+A good assertion checks something that the compiler could check for us and
+elide - if we were working in a language with embedded correctness proofs that
+the compiler could check. This is something that exists today, but it'll likely
+still be a few decades before it comes to systems programming languages. But we
+can still incorporate that kind of thinking into our code and document the
+invariants with runtime checks - much like the way people working in
+dynamically typed languages may add type annotations, gradually making their
+code statically typed.
+
+Looking for ways to make your assertions simpler - and higher level - will
+often nudge you towards making the entire system simpler and more robust.
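+
+For illustration - a hypothetical sketch, not code from this patch or the
+bcachefs tree - a state-based invariant check might look like::
+
+  #include <linux/bug.h>
+  #include <linux/list.h>
+
+  struct cache_set {
+          size_t           nr_items;
+          size_t           max_items; /* invariant: nr_items <= max_items */
+          struct list_head items;
+  };
+
+  /*
+   * Based on state, not logic: this can be called from anywhere, so when
+   * it fires it can be shotgunned across the codebase to find whichever
+   * codepath broke the invariant.
+   */
+  static void cache_set_check_invariants(const struct cache_set *s)
+  {
+          BUG_ON(s->nr_items > s->max_items);
+          BUG_ON(!s->nr_items != list_empty(&s->items));
+  }
+
+Checks like this are cheap enough to sprinkle through every entry point while
+chasing a bug, then pare back to a few key places once it's found.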
+
+Good code is code where you can poke around and see what it's doing -
+introspection. We can't debug anything if we can't see what's going on.
+
+Whenever we're debugging and the solution isn't immediately obvious - if the
+problem is that we can't see where the issue is or what's going on - fix that
+first.
+
+We have the tools to make anything visible at runtime, efficiently - RCU and
+percpu data structures among them. Don't let things stay hidden.
+
+The most important tool for introspection is the humble pretty printer - in
+bcachefs, this means `*_to_text()` functions, which output to printbufs.
+
+Pretty printers are wonderful, because they compose and you can use them
+everywhere. Having functions to print whatever object you're working with will
+make your error messages much easier to write (therefore they will actually
+exist) and much more informative. And they can be used from sysfs/debugfs, as
+well as tracepoints.
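+
+A minimal sketch of the pattern - `struct foo` and its fields are made up for
+illustration, but `struct printbuf` and `prt_printf()` are the real
+interfaces::
+
+  struct foo {
+          u64 journal_seq;
+          u32 dirty_sectors;
+  };
+
+  /*
+   * One printer for the object, composable with other *_to_text()
+   * functions and equally usable from error messages, sysfs/debugfs
+   * and tracepoints:
+   */
+  static void foo_to_text(struct printbuf *out, const struct foo *f)
+  {
+          prt_printf(out, "journal_seq\t%llu\n", f->journal_seq);
+          prt_printf(out, "dirty_sectors\t%u\n", f->dirty_sectors);
+  }
+
+From there, an error message or a debugfs file that dumps the object is a
+one-liner.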
+
+Runtime info and debugging tools should come with clear descriptions and
+labels, and good structure - we don't want files with a list of bare integers,
+like in procfs. Part of the job of the debugging tools is to educate users and
+new developers as to how the system works.
+
+Error messages should, whenever possible, tell you everything you need to debug
+the issue. It's worth putting effort into them.
+
+Tracepoints shouldn't be the first thing you reach for. They're an important
+tool, but always look for more immediate ways to make things visible. When we
+have to rely on tracing, we have to know which tracepoints we're looking for,
+and then we have to run the troublesome workload, and then we have to sift
+through logs. This is a lot of steps to go through when a user is hitting
+something, and if it's intermittent it may not even be possible.
+
+The humble counter is an incredibly useful tool. Counters are cheap and simple
+to use, and many complicated internal operations with lots of things that can
+behave weirdly (anything involving memory reclaim, for example) become
+shockingly easy to debug once you have counters on every distinct codepath.
+
+Persistent counters are even better.
+
+When debugging, try to get the most out of every bug you come across; don't
+rush to fix the initial issue. Look for things that will make related bugs
+easier the next time around - introspection, new assertions, better error
+messages, new debug tools - and do those first. Look for ways to make the
+system better behaved; often one bug will uncover several other bugs through
+downstream effects.
+
+Fix all that first, and then the original bug last - even if that means keeping
+a user waiting. They'll thank you in the long run, and when they understand
+what you're doing you'll be amazed at how patient they're happy to be. Users
+like to help - otherwise they wouldn't be reporting the bug in the first place.
+
+Talk to your users. Don't isolate yourself.
+
+Users notice all sorts of interesting things, and by just talking to them and
+interacting with them you can benefit from their experience.
+
+Spend time doing support and helpdesk stuff. Don't just write code - code isn't
+finished until it's being used trouble-free.
+
+This will also motivate you to make your debugging tools as good as possible,
+and perhaps even your documentation, too. Like anything else in life, the more
+time you spend at it the better you'll get, and you the developer are the
+person most able to improve the tools to make debugging quick and easy.
+
+Be wary of how you take on and commit to big projects. Don't let development
+become product-manager focused. Oftentimes an idea is a good one but needs to
+wait for its proper time - but you won't know if it's the proper time for an
+idea until you start writing code.
+
+Expect to throw a lot of things away, or leave them half-finished for later.
+Nobody writes only perfect code that all gets shipped, and you'll be much more
+productive in the long run if you notice this early and shift to something
+else. The experience gained and lessons learned will be valuable for all the
+other work you do.
+
+But don't be afraid to tackle projects that require significant rework of
+existing code. Sometimes these can be the best projects, because they can lead
+us to make existing code more general, more flexible, more multipurpose and
+perhaps more robust. Just don't hesitate to abandon the idea if it looks like
+it's going to make a mess of things.
+
+Complicated features can often be done as a series of refactorings, with the
+final change that actually implements the feature as a quite small patch at the
+end. It's wonderful when this happens, especially when those refactorings are
+things that improve the codebase in their own right. When that happens there's
+much less risk of wasted effort if the feature you were going for doesn't work
+out.
+
+Always strive to work incrementally. Always strive to turn the big projects
+into little bite-sized projects that can prove their own merits.
+
+Instead of always tackling those big projects, look for little things that
+will be useful, and make the big projects easier.
+
+The question of what's likely to be useful is where junior developers most
+often go astray - doing something because it seems like it'll be useful often
+leads to overengineering. Knowing what's useful comes from many years of
+experience, or talking with people who have that experience - or from simply
+reading lots of code and looking for common patterns and issues. Don't be
+afraid to throw things away and do something simpler.
+
+Talk about your ideas with your fellow developers; oftentimes the best things
+come from relaxed conversations where people aren't afraid to say "what if?".
+
+Don't neglect your tools.
+
+The most important tools (besides the compiler and our text editor) are the
+tools we use for testing. The shortest possible edit/test/debug cycle is
+essential for working productively. We learn, gain experience, and discover the
+errors in our thinking by running our code and seeing what happens. If your
+time is being wasted because your tools are bad or too slow - don't accept it,
+fix it.
+
+Put effort into your documentation, commit messages, and code comments - but
+don't go overboard. A good commit message is wonderful - but if the information
+was important enough to go in a commit message, ask yourself if it would be
+even better as a code comment.
+
+A good code comment is wonderful, but even better is the comment that didn't
+need to exist because the code was so straightforward as to be obvious;
+organized into small, clean and tidy modules, with clear and descriptive names
+for functions and variables, where every line of code has a clear purpose.
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index e2bd61ccd96f..95fc4b90739e 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -8,4 +8,5 @@ bcachefs Documentation :maxdepth: 2 :numbered: + CodingStyle errorcodes diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 5c180fdc3efb..250d6c6d3a3a 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -282,18 +282,12 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; retry: bch2_trans_begin(trans); - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) goto err; @@ -366,7 +360,7 @@ retry: ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -414,39 +408,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_i_xattr *new; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (IS_ERR_OR_NULL(acl)) + if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, - __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; - new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - if (IS_ERR(new)) { - ret = PTR_ERR(new); + struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + ret = PTR_ERR_OR_ZERO(new); + if (ret) goto err; - } new->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &new->k_i, 0); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 534ba2b02bd6..346cd91f91f9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -195,7 +195,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -211,7 +211,7 @@ fsck_err: } int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -225,7 +225,7 @@ fsck_err: } int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -239,7 +239,7 @@ fsck_err: } int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; @@ -263,7 +263,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; @@ -330,27 +330,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu", a->journal_seq); - prt_newline(out); - prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_newline(out); - prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_newline(out); - prt_printf(out, "dirty_sectors %u", a->dirty_sectors); - prt_newline(out); - prt_printf(out, "cached_sectors %u", a->cached_sectors); - prt_newline(out); - prt_printf(out, "stripe %u", a->stripe); - prt_newline(out); - prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); - prt_newline(out); - prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); - prt_newline(out); - prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); - prt_newline(out); - prt_printf(out, "fragmentation %llu", a->fragmentation_lru); - prt_newline(out); - prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_printf(out, "journal_seq %llu\n", 
a->journal_seq); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru); + prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); } @@ -439,22 +429,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) +bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - int ret; - - k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); - a = bch2_alloc_to_v4_mut_inlined(trans, k); + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) goto err; @@ -464,6 +450,20 @@ err: return ERR_PTR(ret); } +__flatten +struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ERR_PTR(ret); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return unlikely(ret) ? 
ERR_PTR(ret) : a; +} + static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -487,7 +487,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -520,7 +520,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -567,29 +567,31 @@ iter_err: int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); + struct bch_dev *ca = NULL; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_exists2(c, k.k->p.inode)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); @@ -599,15 +601,16 @@ int bch2_alloc_read(struct bch_fs *c) })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_bucket_exists(c, k.k->p)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; - - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + } struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; @@ -615,6 +618,7 @@ int bch2_alloc_read(struct bch_fs *c) })); } + bch2_dev_put(ca); bch2_trans_put(trans); up_read(&c->gc_lock); @@ -625,12 +629,12 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; @@ -667,7 +671,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(old); if (ret) return ret; @@ -711,8 +715,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) return ret; @@ -734,26 +738,24 @@ static noinline 
int bch2_bucket_gen_update(struct btree_trans *trans, int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; int ret = 0; - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) + struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); + if (!ca) return -EIO; - struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { + if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -770,10 +772,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, new.s_c, new_a, true); + ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: + bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if (ret) - return ret; + goto err; } if (new_a->data_type == BCH_DATA_cached && @@ -787,24 +789,23 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) - return ret; + goto err; } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new.k->p.inode)); + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca); if (old_a->fragmentation_lru != new_a->fragmentation_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), old_a->fragmentation_lru, new_a->fragmentation_lru); if (ret) - return ret; + goto err; } if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) - return ret; + goto err; } /* @@ -812,21 +813,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * not: */ - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, -((s64) old_a->cached_sectors)); if (ret) - return ret; + goto err; } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; - if ((flags & BTREE_TRIGGER_INSERT) && + if ((flags & BTREE_TRIGGER_insert) && data_type_is_empty(old_a->data_type) != data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { @@ -854,7 +855,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (ret) { bch2_fs_fatal_error(c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); - return ret; + goto err; } } @@ -884,11 +885,11 @@ int 
bch2_trigger_alloc(struct btree_trans *trans, bch2_do_invalidates(c); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); } - if ((flags & BTREE_TRIGGER_GC) && - (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + if ((flags & BTREE_TRIGGER_gc) && + (flags & BTREE_TRIGGER_bucket_invalidate)) { struct bch_alloc_v4 new_a_convert; const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); @@ -908,12 +909,13 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_unlock(g); percpu_up_read(&c->mark_lock); } - - return 0; +err: + bch2_dev_put(ca); + return ret; } /* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) @@ -958,35 +960,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos } } -static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) { - struct bch_dev *ca; + if (*ca) { + if (bucket->offset < (*ca)->mi.first_bucket) + bucket->offset = (*ca)->mi.first_bucket; - if (bch2_dev_bucket_exists(c, *bucket)) - return true; - - if (bch2_dev_exists2(c, bucket->inode)) { - ca = bch_dev_bkey_exists(c, bucket->inode); - - if (bucket->offset < ca->mi.first_bucket) { - bucket->offset = ca->mi.first_bucket; + if (bucket->offset < (*ca)->mi.nbuckets) return true; - } + bch2_dev_put(*ca); + *ca = NULL; bucket->inode++; bucket->offset = 0; } rcu_read_lock(); - ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (ca) - *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); + if (*ca) { + *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); + bch2_dev_get(*ca); + } rcu_read_unlock(); - return ca != NULL; + return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; @@ -995,22 +996,21 @@ again: if (bkey_err(k)) return k; + *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); + if (!k.k->type) { - struct bpos bucket = bkey_start_pos(k.k); + struct bpos hole_start = bkey_start_pos(k.k); - if (!bch2_dev_bucket_exists(c, bucket)) { - if (!next_bucket(c, &bucket)) + if (!*ca || !bucket_valid(*ca, hole_start.offset)) { + if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bucket); + bch2_btree_iter_set_pos(iter, hole_start); goto again; } - if (!bch2_dev_bucket_exists(c, k.k->p)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - - bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); - } + if (k.k->p.offset > (*ca)->mi.nbuckets) + bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); } return k; @@ -1025,24 +1025,25 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; + int ret = 0; - if 
(fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, - alloc_key_to_missing_dev_bucket, + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); + if (fsck_err_on(!ca, + c, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) - return bch2_btree_delete_at(trans, alloc_iter, 0); + ret = bch2_btree_delete_at(trans, alloc_iter, 0); + if (!ca) + return ret; - ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); if (!ca->mi.freespace_initialized) - return 0; + goto out; a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -1141,25 +1142,26 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } +out: err: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bch_dev *ca, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ca = bch_dev_bkey_exists(c, start.inode); if (!ca->mi.freespace_initialized) return 0; @@ -1313,7 +1315,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran goto delete; out: fsck_err: - set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1337,30 +1339,25 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; - struct bch_dev *ca; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false, dev_exists; + bool need_update = false; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - /* if no bch_dev, skip out whether we repair or not */ - dev_exists = bch2_dev_exists2(c, k.k->p.inode); - if (!dev_exists) { - if (fsck_err_on(!dev_exists, c, - bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); + if (!ca) { + if (fsck_err(c, bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); - } goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, bucket_gens_to_invalid_buckets, @@ -1398,6 +1395,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } out: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } @@ -1406,25 +1404,26 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH); + 
BTREE_ITER_prefetch); while (1) { struct bpos next; bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1445,7 +1444,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(trans, + ret = bch2_check_alloc_hole_freespace(trans, ca, bkey_start_pos(k.k), &next, &freespace_iter) ?: @@ -1473,19 +1472,21 @@ bkey_err: bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); + bch2_dev_put(ca); + ca = NULL; if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -1515,7 +1516,7 @@ bkey_err: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: @@ -1562,7 +1563,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); + &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) goto err; @@ -1601,7 +1602,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); bch_err_fn(c, ret); @@ -1657,9 +1658,7 @@ static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_st bch2_journal_flush_async(&c->journal, NULL); if (s->ca) - percpu_ref_put(&s->ca->ref); - if (ca) - percpu_ref_get(&ca->ref); + percpu_ref_put(&s->ca->io_ref); s->ca = ca; s->need_journal_commit_this_dev = 0; } @@ -1673,15 +1672,15 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; - struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; - ca = bch_dev_bkey_exists(c, pos.inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode + ? 
s->ca + : bch2_dev_get_ioref(c, pos.inode, WRITE); + if (!ca) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } @@ -1703,7 +1702,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto out; @@ -1713,7 +1712,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (a->v.dirty_sectors) { + if (bch2_bucket_sectors_total(a->v)) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "attempting to discard bucket with dirty data\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1771,7 +1770,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); + alloc_data_type_set(&a->v, a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, @@ -1787,7 +1786,6 @@ out: discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); - percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } @@ -1827,7 +1825,7 @@ void bch2_do_discards(struct bch_fs *c) static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) { struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) @@ -1840,7 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); + alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0); err: @@ -1862,9 +1860,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (i->snapshot) continue; - ca = bch_dev_bkey_exists(c, i->inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { + ca = bch2_dev_get_ioref(c, i->inode, WRITE); + if (!ca) { darray_remove_item(&c->discard_buckets_in_flight, i); continue; } @@ -1903,9 +1900,12 @@ static void bch2_do_discards_fast_work(struct work_struct *work) static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode); + bool dead = !ca || percpu_ref_is_dying(&ca->io_ref); + rcu_read_unlock(); - if (!percpu_ref_is_dying(&ca->io_ref) && + if (!dead && !discard_in_flight_add(c, bucket) && bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && !queue_work(c->write_ref_wq, &c->discard_fast_work)) @@ -1918,7 +1918,6 @@ static int invalidate_one_bucket(struct btree_trans *trans, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); @@ -1936,7 +1935,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + a = bch2_trans_start_alloc_update(trans, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ 
-1961,18 +1960,15 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; err: @@ -2014,11 +2010,11 @@ static void bch2_do_invalidates_work(struct work_struct *work) ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -2051,7 +2047,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: @@ -2083,7 +2079,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(trans, k, a, true) ?: + ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ -2155,7 +2151,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } @@ -2182,7 +2178,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, u64 now; int ret = 0; - a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 2790e516383d..ae31a94be6f9 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,21 +8,18 @@ #include "debug.h" #include "super.h" -enum bkey_invalid_flags; +enum bch_validate_flags; /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, pos.inode)) - return false; - - ca = bch_dev_bkey_exists(c, pos.inode); - return pos.offset >= ca->mi.first_bucket && - pos.offset < ca->mi.nbuckets; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, pos.inode); + bool ret = ca && bucket_valid(ca, pos.offset); + rcu_read_unlock(); + return ret; } static inline u64 bucket_to_u64(struct bpos bucket) @@ -40,38 +37,50 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) return a.gen - a.oldest_gen; } -static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, - u32 cached_sectors, - u32 stripe, - struct bch_alloc_v4 a, - 
enum bch_data_type data_type) +static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { - if (stripe) - return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; - if (dirty_sectors) - return data_type; - if (cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; } -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) { - return __alloc_data_type(a.dirty_sectors, a.cached_sectors, - a.stripe, a, data_type); + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; +} + +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) +{ + struct bch_alloc_v4 ret = {}; + __bucket_m_to_alloc(&ret, b); + return ret; } static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) { - return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; + switch (data_type) { + case BCH_DATA_cached: + case BCH_DATA_stripe: + return BCH_DATA_user; + default: + return data_type; + } +} + +static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, + enum bch_data_type ptr) +{ + return !data_type_is_empty(bucket) && + bucket_data_type(bucket) != bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.dirty_sectors + a.cached_sectors; } @@ -89,6 +98,27 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, return d ? max(0, ca->mi.bucket_size - d) : 0; } +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (a.stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (a.dirty_sectors) + return data_type; + if (a.cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) +{ + a->data_type = alloc_data_type(*a, data_type); +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? 
a.io_time[READ] : 0; @@ -147,7 +177,9 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); +bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -173,13 +205,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -213,7 +245,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ @@ -233,7 +265,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a1fc30adf912..927a5f300b30 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c) { rcu_read_lock(); for_each_member_device_rcu(c, ca, NULL) - ca->alloc_cursor = 0; + memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); rcu_read_unlock(); } @@ -100,7 +100,7 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (ob->ec) { ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); @@ -300,7 +300,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) { ob = ERR_PTR(ret); @@ -342,9 +342,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; - ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, &bp_pos, &bp, - 
BTREE_ITER_NOPRESERVE); + BTREE_ITER_nopreserve); if (ret) { ob = ERR_PTR(ret); goto err; @@ -363,10 +363,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: if (iter.path) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; @@ -389,7 +389,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -404,9 +405,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; + BTREE_ITER_slots, k, ret) { + u64 bucket = k.k->p.offset; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; @@ -415,12 +415,29 @@ again: is_superblock_bucket(ca, k.k->p.offset)) continue; - a = bch2_alloc_to_v4(k, &a_convert); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + break; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + s->buckets_seen++; + s->skipped_mi_btree_bitmap++; + continue; + } + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); if (a->data_type != BCH_DATA_free) continue; /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); ret = bkey_err(ck); if (ret) break; @@ -433,7 +450,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -441,7 +458,6 @@ next: bch2_trans_iter_exit(trans, &iter); alloc_cursor = iter.pos.offset; - ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); @@ -451,6 +467,8 @@ next: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } @@ -463,7 +481,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -485,10 +504,30 @@ again: s->buckets_seen++; + u64 bucket = alloc_cursor & ~(~0ULL << 56); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), 
ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + goto fail; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + u64 genbits = alloc_cursor >> 56; + alloc_cursor = bucket | (genbits << 56); + + if (alloc_cursor > k.k->p.offset) + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + s->skipped_mi_btree_bitmap++; + continue; + } + ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); break; } } @@ -496,10 +535,9 @@ again: if (ob || ret) break; } +fail: bch2_trans_iter_exit(trans, &iter); - ca->alloc_cursor = alloc_cursor; - if (!ob && ret) ob = ERR_PTR(ret); @@ -508,14 +546,56 @@ again: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } +static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + enum bch_data_type data_type, + struct closure *cl, + struct bch_dev_usage *usage, + struct bucket_alloc_state *s, + struct open_bucket *ob) +{ + struct printbuf buf = PRINTBUF; + + printbuf_tabstop_push(&buf, 24); + + prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "blocking\t%u\n", cl != NULL); + prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); + prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); + prt_printf(&buf, "open\t%llu\n", s->skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + + if (!IS_ERR(ob)) { + prt_printf(&buf, "allocated\t%llu\n", ob->bucket); + trace_bucket_alloc(c, buf.buf); + } else { + prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); + trace_bucket_alloc_fail(c, buf.buf); + } + + printbuf_exit(&buf); +} + /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object * @ca: device to allocate from * @watermark: how important is this allocation? + * @data_type: BCH_DATA_journal, btree, user... 
* @cl: if not NULL, closure to be used to wait if buckets not available * @usage: for secondarily also returning the current device usage * @@ -524,6 +604,7 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl, struct bch_dev_usage *usage) { @@ -531,7 +612,9 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { 0 }; + struct bucket_alloc_state s = { + .btree_bitmap = data_type == BCH_DATA_btree, + }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); @@ -541,7 +624,7 @@ again: bch2_do_discards(c); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) bch2_do_invalidates(c); @@ -569,6 +652,11 @@ alloc: if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); + if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { + s.btree_bitmap = BTREE_BITMAP_ANY; + goto alloc; + } + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; @@ -578,33 +666,24 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, - bch2_watermarks[watermark], - ob->bucket, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - ""); + ob->data_type = data_type; + + if (!IS_ERR(ob)) + count_event(c, bucket_alloc); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - trace_and_count(c, bucket_alloc_fail, ca, - bch2_watermarks[watermark], - 0, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); + count_event(c, bucket_alloc_fail); + + if (!IS_ERR(ob) + ? 
trace_bucket_alloc_enabled() + : trace_bucket_alloc_fail_enabled()) + trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); return ob; } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl) { struct bch_dev_usage usage; @@ -612,7 +691,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - cl, &usage))); + data_type, cl, &usage))); return ob; } @@ -678,8 +757,7 @@ static int add_new_bucket(struct bch_fs *c, unsigned flags, struct open_bucket *ob) { - unsigned durability = - bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned durability = ob_dev(c, ob)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); @@ -711,37 +789,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); - unsigned dev; - struct bch_dev *ca; int ret = -BCH_ERR_insufficient_devices; - unsigned i; BUG_ON(*nr_effective >= nr_replicas); - for (i = 0; i < devs_sorted.nr; i++) { + for (unsigned i = 0; i < devs_sorted.nr; i++) { struct bch_dev_usage usage; struct open_bucket *ob; - dev = devs_sorted.devs[i]; - - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + unsigned dev = devs_sorted.devs[i]; + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (!ca) continue; if (!ca->mi.durability && *have_cache) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -750,8 +819,6 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob->data_type = data_type; - if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, flags, ob)) { @@ -836,7 +903,7 @@ static bool want_bucket(struct bch_fs *c, bool *have_cache, bool ec, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (!test_bit(ob->dev, devs_may_alloc->d)) return false; @@ -906,7 +973,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; @@ -1291,7 +1358,7 @@ deallocate_extra_replicas(struct bch_fs *c, unsigned i; open_bucket_for_each(c, ptrs, ob, i) { - unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; @@ -1342,6 +1409,10 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); + ret = bch2_trans_relock(trans); + if (ret) + goto err; + /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; @@ -1444,7 +1515,7 @@ err: struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); return 
(struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, @@ -1520,7 +1591,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ @@ -1622,3 +1693,104 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) prt_str(out, "Btree write point\n"); bch2_write_point_to_text(out, c, &c->btree_write_point); } + +void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 24); + + percpu_down_read(&c->mark_lock); + prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); + prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); + prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data)); + prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached)); + prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes)); + percpu_up_read(&c->mark_lock); + + prt_newline(out); + prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); + prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); + prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); + prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); +} + +void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + bch2_dev_usage_to_text(out, &stats); + + prt_newline(out); + + prt_printf(out, "reserves:\n"); + for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) + prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + + prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); + prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); +} + +void bch2_print_allocator_stuck(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "Allocator stuck? 
Waited for 10 seconds\n"); + + prt_printf(&buf, "Allocator debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_fs_alloc_debug_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + for_each_online_member(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + prt_printf(&buf, "Copygc debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_copygc_wait_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + prt_printf(&buf, "Journal debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_journal_debug_to_text(&buf, &c->journal); + printbuf_indent_sub(&buf, 2); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 7aaeec44c746..a42c9730d32a 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -30,8 +30,14 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); +static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) +{ + return bch2_dev_have_ref(c, ob->dev); +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, struct closure *); + enum bch_watermark, enum bch_data_type, + struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -184,7 +190,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ptr.cached = cached || @@ -221,4 +227,9 @@ void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); +void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); + +void bch2_print_allocator_stuck(struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index c2226e947c41..9bbb28e90b93 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -9,11 +9,18 @@ #include "fifo.h" struct bucket_alloc_state { + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; }; #define BCH_WATERMARKS() \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index af7a71de1bdf..692b1c7d5018 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -23,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket2; struct bch_backpointer bp2; @@ -30,31 +31,43 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, 
bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) + !memcmp(&bp, &bp2, sizeof(bp))) { + rcu_read_unlock(); return true; + } } + rcu_read_unlock(); return false; } int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - /* these will be caught by fsck */ - if (!bch2_dev_exists2(c, bp.k->p.inode)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode); + if (!ca) { + /* these will be caught by fsck */ + rcu_read_unlock(); return 0; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); - struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); + struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); + rcu_read_unlock(); int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)), + !bpos_eq(bp.k->p, bp_pos), c, err, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); @@ -75,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (bch2_dev_exists2(c, k.k->p.inode)) { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode); + if (ca) { + struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + rcu_read_unlock(); prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + bch2_bpos_to_text(out, bucket); prt_str(out, " "); + } else { + rcu_read_unlock(); } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); @@ -117,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - prt_printf(&buf, "backpointer not found when deleting"); - prt_newline(&buf); + prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); @@ -145,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, @@ -161,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -171,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) goto err; @@ -197,13 +216,13 @@ err: * Find the next backpointer >= *bp_offset: */ int bch2_get_next_backpointer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, int gen, struct bpos *bp_pos, struct bch_backpointer *bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct bpos bp_end_pos = bucket_pos_to_bp(ca, 
bpos_nosnap_successor(bucket), 0); struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; int ret = 0; @@ -213,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, if (gen >= 0) { k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); + bucket, BTREE_ITER_cached|iter_flags); ret = bkey_err(k); if (ret) goto out; @@ -223,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, goto done; } - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, *bp_pos, iter_flags, k, ret) { @@ -249,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); /* * If we're using the btree write buffer, the backpointer we were @@ -259,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return; + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); @@ -288,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, { if (likely(!bp.level)) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, 0, iter_flags); - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -325,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bch_backpointer bp) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct btree *b; BUG_ON(!bp.level); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return ERR_PTR(-EIO); + bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, bp.level - 1, 0); - b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -367,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + if (fsck_err(c, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -460,8 +486,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + if (!ca) 
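
The hunks above are part of a tree-wide conversion: bch_dev_bkey_exists(), which simply asserted that a member device was present, gives way to lookups that can fail and that pin the device (bch2_dev_rcu(), bch2_dev_tryget_noerror(), bch2_dev_get_ioref()), so every caller grows an explicit NULL path. A minimal userspace sketch of the tryget/put shape follows; the types and helper names are invented for illustration, and a plain CAS loop stands in for the kernel's RCU plus percpu refs:

#include <stdatomic.h>
#include <stddef.h>

struct dev {
	atomic_long refcount;		/* 0 means the slot is being torn down */
};

struct fs {
	struct dev *devs[16];		/* NULL for missing member devices */
	unsigned nr_devs;
};

/* Return the device with a reference held, or NULL if it doesn't exist;
 * callers must handle NULL instead of assuming the member is present: */
static struct dev *dev_tryget(struct fs *fs, unsigned idx)
{
	if (idx >= fs->nr_devs || !fs->devs[idx])
		return NULL;

	struct dev *d = fs->devs[idx];
	long old = atomic_load(&d->refcount);

	do {
		if (!old)		/* dying: refuse to resurrect */
			return NULL;
	} while (!atomic_compare_exchange_weak(&d->refcount, &old, old + 1));

	return d;
}

static void dev_put(struct dev *d)
{
	atomic_fetch_sub(&d->refcount, 1);
}

Callers then follow the same shape as the patch: d = dev_tryget(fs, idx); if (!d) return false; ... dev_put(d);
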
return false; data_buf = kvmalloc(bytes, GFP_KERNEL); @@ -511,25 +537,27 @@ static int check_bp_exists(struct btree_trans *trans, struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; - int ret; + int ret = 0; bch2_bkey_buf_init(&tmp); - if (!bch2_dev_bucket_exists(c, bucket)) { + struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); bch2_bpos_to_text(&buf, bucket); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - return -BCH_ERR_fsck_repair_unimplemented; + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; } if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) - return 0; + goto out; bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), + bucket_pos_to_bp(ca, bucket, bp.bucket_offset), 0); ret = bkey_err(bp_k); if (ret) @@ -562,6 +590,7 @@ fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -637,13 +666,13 @@ missing: struct bkey_i_backpointer n_bp_k; bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); n_bp_k.v = bp; prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; } @@ -667,7 +696,14 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (ca) + bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_unlock(); + + if (!ca) + continue; ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) @@ -760,7 +796,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, __for_each_btree_node(trans, iter, btree, btree == start.btree ? 
start.pos : POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ret) { mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = @@ -794,31 +830,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, while (level >= depth) { struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - level, - BTREE_ITER_PREFETCH); - while (1) { - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) - break; - if (bpos_eq(iter.pos, SPOS_MAX)) - break; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, + BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); if (ret) return ret; @@ -936,7 +954,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index c1b274eadda1..6021de1c5e98 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "super.h" static inline u64 swab40(u64 x) @@ -18,7 +19,7 @@ static inline u64 swab40(u64 x) } int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); @@ -36,15 +37,29 @@ void bch2_backpointer_swab(struct bkey_s); * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: */ -static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, - struct bpos bp_pos) +static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); + rcu_read_unlock(); + return ca != NULL; +} + +static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), + c, "backpointer for missing device %llu", bp_pos.inode); +} + static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev 
*ca, struct bpos bucket, u64 bucket_offset) @@ -57,32 +72,32 @@ static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ -static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, +static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, - struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, + struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); + return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); struct bkey_i_backpointer bp_k; bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k.v = bp; if (!insert) { @@ -120,7 +135,7 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, @@ -130,7 +145,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, s64 sectors = level ? 
btree_sectors(c) : k.k->size; u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); *bp = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -142,7 +157,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } -int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, +int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_backpointer, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91c3c1fef233..bc0ea2c4efef 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -359,6 +359,8 @@ do { \ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_node_merging_disabled, \ + "Disables merging of btree nodes") \ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ "Causes mark and sweep to compact and rewrite every " \ "btree node it traverses") \ @@ -468,6 +470,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "sb-members_types.h" #include "subvolume_types.h" #include "super_types.h" #include "thread_with_file_types.h" @@ -516,8 +519,8 @@ enum gc_phase { struct gc_pos { enum gc_phase phase; + u16 level; struct bpos pos; - unsigned level; }; struct reflink_gc { @@ -534,7 +537,13 @@ struct io_count { struct bch_dev { struct kobject kobj; +#ifdef CONFIG_BCACHEFS_DEBUG + atomic_long_t ref; + bool dying; + unsigned long last_put; +#else struct percpu_ref ref; +#endif struct completion ref_completion; struct percpu_ref io_ref; struct completion io_ref_completion; @@ -560,14 +569,11 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for writing multiple replicas */ - struct bio_set replica_set; - /* * Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for ptr_stale(): + * Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; @@ -581,7 +587,7 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - u64 alloc_cursor; + u64 alloc_cursor[3]; unsigned nr_open_buckets; unsigned nr_btree_reserve; @@ -627,12 +633,12 @@ struct bch_dev { x(clean_shutdown) \ x(fsck_running) \ x(initial_gc_unfixed) \ - x(need_another_gc) \ x(need_delete_dead_snapshots) \ x(error) \ x(topology_error) \ x(errors_fixed) \ - x(errors_not_fixed) + x(errors_not_fixed) \ + x(no_invalid_checks) enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -715,6 +721,7 @@ struct btree_trans_buf { x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ + x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) @@ -926,7 +933,6 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; @@ -957,8 +963,7 @@ struct bch_fs { struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ - struct task_struct *gc_thread; - atomic_t kick_gc; + struct work_struct gc_gens_work; unsigned long gc_count; enum btree_id gc_gens_btree; @@ -988,6 +993,7 @@ struct bch_fs { 
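
Worth pausing on the CONFIG_BCACHEFS_DEBUG branch added to struct bch_dev above: in debug builds the device ref becomes a plain atomic_long_t plus a dying flag and a recorded last_put instead of a percpu_ref. That is slower, but a leaked reference then shows up as a concrete nonzero count with a breadcrumb. A hedged userspace model of the idea, with all names invented for the sketch:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

struct dev_ref {
	atomic_long	ref;
	atomic_bool	dying;
	atomic_ulong	last_put;	/* e.g. caller address or timestamp */
};

static void dev_ref_get(struct dev_ref *r)
{
	/* taking a new ref on a dying device is a bug we want to catch: */
	assert(!atomic_load(&r->dying));
	atomic_fetch_add(&r->ref, 1);
}

static void dev_ref_put(struct dev_ref *r, unsigned long where)
{
	atomic_store(&r->last_put, where);
	long v = atomic_fetch_sub(&r->ref, 1);	/* returns pre-decrement value */
	assert(v > 0);				/* v <= 0 means a double put */
}

static bool dev_ref_kill(struct dev_ref *r)
{
	atomic_store(&r->dying, true);
	return atomic_load(&r->ref) == 0;	/* safe to free now? */
}
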
struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; + struct bio_set replica_set; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; struct bucket_nocow_lock_table @@ -1115,7 +1121,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; @@ -1250,11 +1255,6 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } -static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) -{ - return dev < c->sb.nr_devices && c->devs[dev]; -} - static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { struct stdio_redirect *stdio = c->stdio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2e8b1a489c20..1bebba881d89 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include <asm/byteorder.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include <uapi/linux/magic.h> #include "vstructs.h" #ifdef __KERNEL__ @@ -589,6 +590,13 @@ struct bch_member { __le64 errors_reset_time; __le64 seq; __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; }; /* @@ -1283,7 +1291,7 @@ enum bch_compression_opts { UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -#define BCACHEFS_STATFS_MAGIC 0xca451a4e +#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC #define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) #define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 76e79a15ba08..f46978e5cb7c 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -640,7 +640,7 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) int bch2_bkey_format_invalid(struct bch_fs *c, struct bkey_format *f, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; @@ -656,20 +656,17 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * unpacked format: */ for (i = 0; i < f->nr_fields; i++) { - if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && + bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 packed_max = f->bits_per_field[i] ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) { - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, field_offset, unpacked_max); - return -BCH_ERR_invalid; - } + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); + return -BCH_ERR_invalid; } bits += f->bits_per_field[i]; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 3a45d128f608..fcd43915df07 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,10 +9,10 @@ #include "util.h" #include "vstructs.h" -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), +enum bch_validate_flags { + BCH_VALIDATE_write = (1U << 0), + BCH_VALIDATE_commit = (1U << 1), + BCH_VALIDATE_journal = (1U << 2), }; #if 0 @@ -574,8 +574,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); + +static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) +{ + unsigned f_bits = f->bits_per_field[i]; + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f_bits > unpacked_bits) + return true; + + if ((f_bits == unpacked_bits) && field_offset) + return true; + + u64 f_mask = f_bits + ? ~((~0ULL << (f_bits - 1)) << 1) + : 0; + + if (((field_offset + f_mask) & unpacked_mask) < field_offset) + return true; + return false; +} + int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index a275a9e8e341..c2c3dae52186 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,7 +27,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -41,7 +41,7 @@ static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -58,7 +58,7 @@ fsck_err: }) static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -82,7 +82,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -123,9 +123,12 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct 
printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; @@ -159,9 +162,12 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + int ret = 0; bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, @@ -172,7 +178,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) && + (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -224,7 +230,7 @@ fsck_err: int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return __bch2_bkey_invalid(c, k, type, flags, err) ?: diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 03efe8ee565a..726ef7483763 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,14 +22,15 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err); + enum bch_validate_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -48,11 +49,11 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) } int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, struct printbuf *); @@ -76,56 +77,10 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, - __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, - __BTREE_TRIGGER_TRANSACTIONAL, - __BTREE_TRIGGER_ATOMIC, - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_BUCKET_INVALIDATE, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define 
BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -/* Don't run triggers at all */ -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -/* - * If set, we're running transactional triggers as part of a transaction commit: - * triggers may generate new updates - * - * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, - * we're running atomic triggers during a transaction commit: we have our - * journal reservation, we're holding btree node write locks, and we know the - * transaction is going to commit (returning an error here is a fatal error, - * causing us to go emergency read-only) - */ -#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) -#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) - -/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - -/* @new is entering the btree */ -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) - -/* @old is leaving the btree */ -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -/* signal from bucket invalidate path to alloc trigger */ -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) - static inline int bch2_key_trigger(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); @@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans, } static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans, deleted.k.p = old.k->p; return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_overwrite|flags); } static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s new, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans, deleted.k.p = new.k->p; return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + BTREE_TRIGGER_insert|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index bcca9e76a0b4..4536eb50fc40 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -6,9 +6,9 @@ #include "bset.h" #include "extents.h" -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); +typedef int (*sort_cmp_fn)(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); static inline bool sort_iter_end(struct sort_iter *iter) { @@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, /* * If keys compare equal, compare by pointer order: */ -static inline int key_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int 
key_sort_fix_overlapping_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); @@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; + (long) l - (long) r; } -unsigned bch2_sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) +#include "btree_update_interior.h" + +/* + * For sorting in the btree node write path: whiteouts not in the unwritten + * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are + * dropped if overwritten by real keys: + */ +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) { - const struct bkey_format *f = &iter->b->format; struct bkey_packed *in, *next, *out = dst; - sort_iter_sort(iter, sort_keys_cmp); + sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - bool needs_whiteout = false; + while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { + if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) + continue; - if (bkey_deleted(in) && - (filter_whiteouts || !in->needs_whiteout)) + if ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) continue; - while ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - needs_whiteout |= in->needs_whiteout; - in = sort_iter_next(iter, sort_keys_cmp); - } + bkey_p_copy(out, in); + out = bkey_p_next(out); + } - if (bkey_deleted(in)) { - memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_p_copy(out, in); - } - out->needs_whiteout |= needs_whiteout; + return (u64 *) out - (u64 *) dst; +} + +/* + * Main sort routine for compacting a btree node in memory: we always drop + * whiteouts because any whiteouts that need to be written are in the unwritten + * whiteouts area: + */ +unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); + + while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { + if (bkey_deleted(in)) + continue; + + bkey_p_copy(out, in); out = bkey_p_next(out); } diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 7c0f0b160f18..9be969d46890 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -48,7 +48,7 @@ bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -unsigned bch2_sort_keys(struct bkey_packed *, - struct sort_iter *, bool); +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); +unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3bb477840eab..575e1d0b6eeb 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree 
*b, void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - console_lock(); for_each_bset(b, t) bch2_dump_bset(c, b, bset(b, t), t - b->set); @@ -136,7 +134,6 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { - struct bset_tree *t; struct bkey_packed *k; struct btree_nr_keys nr = {}; @@ -198,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; - struct bset_tree *t; if (bch2_btree_node_iter_end(iter)) return; @@ -213,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { for_each_bset(b, t) - if (set->end == t->end_offset) + if (set->end == t->end_offset) { + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); goto found; + } BUG(); found: - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); + do {} while (0); } /* Verify iterator is sorted: */ @@ -377,11 +375,9 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(const struct btree *b) +static void bset_aux_tree_verify(struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct bset_tree *t; - for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) continue; @@ -685,20 +681,20 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -1374,8 +1370,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, struct btree *b) { - struct bset_tree *t; - memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) @@ -1481,7 +1475,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, { struct bkey_packed *k, *prev = NULL; struct btree_node_iter_set *set; - struct bset_tree *t; unsigned end = 0; if (bch2_expensive_debug_checks) @@ -1550,9 +1543,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - const struct bset_tree *t; - - for_each_bset(b, t) { + for_each_bset_c(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); size_t j; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 120a79fd456b..5c6c7a14fa0f 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) } #define for_each_bset(_b, _t) \ - for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + for (struct bset_tree *_t = (_b)->set; _t < 
(_b)->set + (_b)->nsets; _t++) + +#define for_each_bset_c(_b, _t) \ + for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ @@ -294,7 +297,6 @@ static inline struct bset_tree * bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; for_each_bset(b, t) if (offset <= t->end_offset) { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 02c70e813fac..9e4ed75d3675 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -16,6 +16,12 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed_##counter++; \ +} while (0) + const char * const bch2_btree_node_flags[] = { #define x(f) #f, BTREE_FLAGS() @@ -162,6 +168,9 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + if (b->c.btree_id < BTREE_ID_NR) + --bc->used_by_btree[b->c.btree_id]; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -169,8 +178,11 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) BUG_ON(b->hash_val); b->hash_val = btree_ptr_hash_val(&b->key); - return rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); + int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + if (!ret && b->c.btree_id < BTREE_ID_NR) + bc->used_by_btree[b->c.btree_id]++; + return ret; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, @@ -190,6 +202,35 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, return ret; } +void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) @@ -203,7 +244,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -225,38 +266,64 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); 
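
The BTREE_CACHE_NOT_FREED_INCREMENT() calls threaded through __btree_node_reclaim() give the shrinker a per-reason failure tally, which bch2_btree_cache_to_text() later prints. The pattern in miniature, as a userspace sketch with invented type and field names:

#include <stdbool.h>

struct cache_counters {
	unsigned long freed;
	unsigned long not_freed_dirty;
	unsigned long not_freed_lock_intent;
	unsigned long not_freed_io_in_flight;
};

/* Each early return records *why* the node couldn't be freed, so a
 * stuck shrinker reports "10000 dirty, 3 lock failures" rather than
 * just "nothing freed": */
static bool try_free_node(struct cache_counters *c,
			  bool dirty, bool locked, bool io_in_flight)
{
	if (dirty) {
		c->not_freed_dirty++;
		return false;
	}
	if (locked) {
		c->not_freed_lock_intent++;
		return false;
	}
	if (io_in_flight) {
		c->not_freed_io_in_flight++;
		return false;
	}
	c->freed++;
	return true;
}
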
return -BCH_ERR_ENOMEM_btree_node_reclaim; + } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } - if (!six_trylock_write(&b->c.lock)) + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); goto out_unlock_intent; + } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); goto out_unlock; + } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b) || - btree_node_write_blocked(b) || - btree_node_will_make_reachable(b)) + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + goto out_unlock; + } if (btree_node_dirty(b)) { - if (!flush) + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); goto out_unlock; + } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -286,14 +353,14 @@ out_unlock_intent: goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) { - return __btree_node_reclaim(c, b, false); + return __btree_node_reclaim(c, b, false, shrinker_counter); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true); + return __btree_node_reclaim(c, b, true, false); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -341,11 +408,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, true)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->freed++; } } restart: @@ -354,9 +422,11 @@ restart: if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - } else if (!btree_node_reclaim(c, b)) { + bc->not_freed_access_bit++; + } else if (!btree_node_reclaim(c, b, true)) { freed++; btree_node_data_free(c, b); + bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); @@ -564,7 +634,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) + if (!btree_node_reclaim(c, b, false)) return b; while (1) { @@ -600,7 +670,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, false)) { list_del_init(&b->list); goto got_node; } @@ -626,7 +696,7 @@ got_node: * the list. 
Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { + if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); btree_node_to_freedlist(bc, b2); @@ -846,7 +916,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; bool need_relock = false; int ret; @@ -966,7 +1035,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * { struct bch_fs *c = trans->c; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1043,7 +1111,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1240,9 +1307,39 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, + const char *label, unsigned nr) { - prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "%s\t", label); + prt_human_readable_u64(out, nr * c->opts.btree_node_size); + prt_printf(out, " (%u)\n", nr); +} + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_btree_cache_line(out, c, "total:", bc->used); + prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++) + prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]); + + prt_newline(out); + prt_printf(out, "freed:\t%u\n", bc->freed); + prt_printf(out, "not freed:\n"); + prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty); + prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight); + prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight); + prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent); + prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write); + prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit); + prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict); + prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked); + prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6d33885fdbde..fed35de3e4de 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,6 +17,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *); + void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct 
closure *); @@ -131,6 +134,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) const char *bch2_btree_id_str(enum btree_id); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); +void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 791470b0c654..8035c8b797ab 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -52,12 +52,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) }}}; } -static bool should_restart_for_topology_repair(struct bch_fs *c) -{ - return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); -} - static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -69,7 +63,7 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); __gc_pos_set(c, new_pos); } @@ -97,35 +91,6 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -546,9 +511,9 @@ reconstruct_root: if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { - bch2_btree_root_alloc_fake(c, i, 1); + bch2_btree_root_alloc_fake_trans(trans, i, 1); bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) @@ -576,7 +541,7 @@ reconstruct_root: goto reconstruct_root; bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } @@ -586,495 +551,123 @@ fsck_err: return ret; } -static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k) +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, struct btree **prev, + struct btree_iter *iter, struct bkey_s_c k, + bool initial) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); - const union 
bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * XXX - * use check_bucket_ref here - */ - bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); - - if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - continue; - - if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != - bucket_data_type(data_type), c, - ptr_bucket_data_type_mismatch, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->data_type = data_type; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - } - } - - if (do_update) { - if (is_root) { - bch_err(c, "cannot update 
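Several of the checks above hinge on wrapping 8-bit generation comparisons: gen_cmp() is the signed-difference idiom, so a pointer generation can test as "newer" than the bucket generation even after the counter wraps. A compilable illustration (the example gens are invented):

    #include <stdio.h>
    #include <stdint.h>

    /* signed difference of wrapping 8-bit gens: > 0 means a is newer than b */
    static int gen_cmp(uint8_t a, uint8_t b)
    {
        return (int8_t) (a - b);
    }

    int main(void)
    {
        /* "ptr gen in the future": gen 2 vs bucket gen 250, counter wrapped */
        printf("gen_cmp(2, 250) = %d\n", gen_cmp(2, 250));      /* 8 */

        /* "stale dirty ptr": ptr gen 240 is behind bucket gen 250 */
        printf("gen_cmp(240, 250) = %d\n", gen_cmp(240, 250));  /* -10 */
        return 0;
    }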
btree roots yet"); - ret = -EINVAL; - goto err; - } - - struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!new) { - ret = -BCH_ERR_ENOMEM_gc_repair_key; - bch_err_msg(c, ret, "allocating new key"); - goto err; - } - - bkey_reassemble(new, *k); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || - (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - goto restart_drop_ptrs; - } - } -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - if (level) - bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + if (iter) { + struct btree_path *path = btree_iter_path(trans, iter); + struct btree *b = path_l(path)->b; - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, *k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; + if (*prev != b) { + int ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; } - - *k = bkey_i_to_s_c(new); + *prev = b; } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k, - bool initial) -{ - struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; struct printbuf buf = PRINTBUF; int ret = 0; - deleted.p = k->k->p; + deleted.p = k.k->p; if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > atomic64_read(&c->journal.seq)); + k.k->version.lo > atomic64_read(&c->journal.seq)); - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key 
version number higher than recorded: %llu > %llu", - k->k->version.lo, + k.k->version.lo, atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k->k->version.lo); + atomic64_set(&c->key_version, k.k->version.lo); } - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, *k), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, *k); + bch2_dev_btree_bitmap_mark(c, k); bch2_write_super(c); mutex_unlock(&c->sb_lock); } - ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, - unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); -fsck_err: -err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) -{ - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; - int ret = 0; + /* + * We require a commit before key_trigger() because + * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the + * wrong result if we run it multiple times. + */ + unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0; - ret = bch2_btree_node_check_topology(trans, b); + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_check_repair|flags); if (ret) - return ret; - - if (!btree_node_type_needs_gc(btree_node_type(b))) - return 0; - - bch2_btree_node_iter_init_from_start(&iter, b); - - while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, initial); - if (ret) - return ret; + goto out; - bch2_btree_node_iter_advance(&iter, b); + if (trans->nr_updates) { + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } - return 0; + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_gc|flags); +out: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, - bool initial, bool metadata_only) +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - unsigned depth = metadata_only ? 1 : 0; + int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 
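The new comment is the crux of the restructuring: the gc trigger accumulates into in-memory GC state, so replaying it after a transaction restart would double-count. That is why check_repair runs first, any queued updates are committed with a forced nested restart, and only then does the gc trigger run. A toy demonstration of the replay problem (struct bucket and the sector counts here are stand-ins):

    #include <stdio.h>

    struct bucket { unsigned dirty_sectors; };

    /* an accumulating trigger: running it twice double-counts */
    static void gc_trigger(struct bucket *g, unsigned sectors)
    {
        g->dirty_sectors += sectors;
    }

    int main(void)
    {
        struct bucket g = { 0 };

        gc_trigger(&g, 8);    /* first run */
        gc_trigger(&g, 8);    /* a replay after a restart would do this */
        printf("dirty_sectors = %u (should be 8)\n", g.dirty_sectors);
        return 0;
    }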
0 : 1; int ret = 0; - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - - __for_each_btree_node(trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { - bch2_verify_btree_nr_keys(b); - - gc_pos_set(c, gc_pos_btree_node(b)); - - ret = btree_gc_mark_node(trans, b, initial); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; + /* We need to make sure every leaf node is readable before going RW */ + if (initial) + target_depth = 0; + /* root */ mutex_lock(&c->btree_root_lock); - b = bch2_btree_id_root(c, btree_id)->b; + struct btree *b = bch2_btree_id_root(c, btree)->b; if (!btree_node_fake(b)) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - true, &k, initial); + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); + ret = lockrestart_do(trans, + bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, + NULL, NULL, bkey_i_to_s_c(&b->key), initial)); + level = b->c.level; } - gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); - return ret; -} - -static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, - unsigned target_depth) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf cur; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_btree_node_check_topology(trans, b); if (ret) return ret; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&cur); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + for (; level >= target_depth; --level) { + struct btree *prev = NULL; + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, - false, &k, true); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); if (ret) - goto fsck_err; - - bch2_btree_and_journal_iter_advance(&iter); - } - - if (b->c.level > target_depth) { - bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - struct btree *child; - - bch2_bkey_buf_reassemble(&cur, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - child = bch2_btree_node_get_noiter(trans, cur.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(child); - - if (bch2_err_matches(ret, EIO)) { - bch2_topology_error(c); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_read_error, - "Unreadable btree node at btree %s level %u:\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto fsck_err; - } else { - /* Continue marking when opted to not - * fix the error: */ - ret = 0; - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - continue; - } - } else if (ret) { - bch_err_msg(c, ret, "getting btree node"); - break; - } - - ret = 
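bch2_gc_btree() now walks the tree iteratively: mark the root key, then visit every node level by level from just under the root down to target_depth, in place of the old recursive descent. The control flow reduced to a runnable skeleton (the per-level key counts are made up):

    #include <stdio.h>

    #define MAX_LEVEL 4

    static unsigned nr_keys[MAX_LEVEL] = { 1, 2, 4, 8 };  /* hypothetical */

    static void mark_key(int level, unsigned idx)
    {
        printf("mark l=%d key=%u\n", level, idx);
    }

    int main(void)
    {
        int root_level = MAX_LEVEL - 1, target_depth = 0;

        for (int level = root_level; level >= target_depth; --level)
            for (unsigned i = 0; i < nr_keys[level]; i++)
                mark_key(level, i);
        return 0;
    }

A top-down, one-level-at-a-time order is what lets gc_pos describe progress as a single monotonically advancing cursor.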
bch2_gc_btree_init_recurse(trans, child, - target_depth); - six_unlock_read(&child->c.lock); - - if (ret) - break; - } - } -fsck_err: - bch2_bkey_buf_exit(&cur, c); - bch2_btree_and_journal_iter_exit(&iter); - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_btree_init(struct btree_trans *trans, - enum btree_id btree_id, - bool metadata_only) -{ - struct bch_fs *c = trans->c; - struct btree *b; - unsigned target_depth = metadata_only ? 1 : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - b = bch2_btree_id_root(c, btree_id)->b; - - six_lock_read(&b->c.lock, NULL, NULL); - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, - btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, - btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(trans, b, target_depth); - - if (!ret) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, - &k, true); + break; } -fsck_err: - six_unlock_read(&b->c.lock); - bch_err_fn(c, ret); - printbuf_exit(&buf); return ret; } @@ -1084,7 +677,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; @@ -1095,98 +688,36 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < BTREE_ID_NR && !ret; i++) - ret = initial - ? bch2_gc_btree_init(trans, ids[i], metadata_only) - : bch2_gc_btree(trans, ids[i], initial, metadata_only); + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + unsigned btree = i < BTREE_ID_NR ? ids[i] : i; - for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { - if (!bch2_btree_id_root(c, i)->alive) + if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = initial - ? 
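The two loops over btree IDs collapse into one: the statically-known btrees are visited in GC-phase order via the sorted ids[] array, and any extra runtime roots past BTREE_ID_NR in plain index order. A runnable reduction (three known btrees, two runtime roots, all numbers invented):

    #include <stdio.h>

    #define BTREE_ID_NR 3

    int main(void)
    {
        unsigned ids[BTREE_ID_NR] = { 2, 0, 1 };  /* pre-sorted by GC phase */
        unsigned nr_alive = 5;                    /* two extra runtime roots */

        for (unsigned i = 0; i < nr_alive; i++) {
            unsigned btree = i < BTREE_ID_NR ? ids[i] : i;

            printf("gc btree %u\n", btree);
        }
        return 0;
    }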
bch2_gc_btree_init(trans, i, metadata_only) - : bch2_gc_btree(trans, i, initial, metadata_only); - } + ret = bch2_gc_btree(trans, btree, true); + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + c, btree_node_read_error, + "btree node read error for %s", + bch2_btree_id_str(btree))) + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } +fsck_err: bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } -static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - unsigned flags) -{ - u64 b = sector_to_bucket(ca, start); - - do { - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - bch2_mark_metadata_bucket(c, ca, b, type, sectors, - gc_phase(GC_PHASE_SB), flags); - b++; - start += sectors; - } while (start < end); -} - -static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - unsigned i; - u64 b; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset == BCH_SB_SECTOR) - mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, flags); - - mark_metadata_sectors(c, ca, offset, - offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, flags); - } - - for (i = 0; i < ca->journal.nr; i++) { - b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), flags); - } -} - -static void bch2_mark_superblocks(struct bch_fs *c) +static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_SB)); - for_each_online_member(c, ca) - bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); + return ret; } -#if 0 -/* Also see bch2_pending_btree_node_free_insert_done() */ -static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -{ - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); - - for_each_pending_btree_node_free(c, as, d) - if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); - - mutex_unlock(&c->btree_interior_update_lock); -} -#endif - static void bch2_gc_free(struct bch_fs *c) { genradix_free(&c->reflink_gc_table); @@ -1204,28 +735,23 @@ static void bch2_gc_free(struct bch_fs *c) c->usage_gc = NULL; } -static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) +static int bch2_gc_done(struct bch_fs *c) { struct bch_dev *ca = NULL; struct printbuf buf = PRINTBUF; - bool verify = !metadata_only && - !c->opts.reconstruct_alloc && - (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); -#define copy_field(_err, _f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, _err, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f))) \ +#define copy_field(_err, _f, _msg, ...) \ + if (fsck_err_on(dst->_f != src->_f, c, _err, \ + _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \ + dst->_f, src->_f)) \ dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ +#define copy_dev_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) 
\ +#define copy_fs_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) @@ -1258,31 +784,24 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(fs_usage_btree_wrong, b.btree, "btree"); - if (!metadata_only) { - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - } + copy_fs_field(fs_usage_data_wrong, + b.data, "data"); + copy_fs_field(fs_usage_cached_wrong, + b.cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + b.reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + b.nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, e); @@ -1296,10 +815,8 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: - if (ca) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); - percpu_up_write(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -1322,7 +839,7 @@ static int bch2_gc_start(struct bch_fs *c) ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return -BCH_ERR_ENOMEM_gc_start; } @@ -1333,19 +850,6 @@ static int bch2_gc_start(struct bch_fs *c) return 0; } -static int bch2_gc_reset(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; - - return bch2_gc_start(c); -} - /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1361,56 +865,41 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - bool metadata_only) + struct bch_dev *ca, + struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_convert, new; + struct bch_alloc_v4 old_gc, gc, old_convert, new; const struct bch_alloc_v4 *old; int ret; old = bch2_alloc_to_v4(k, &old_convert); - new = *old; + gc = new = *old; percpu_down_read(&c->mark_lock); - b = gc_bucket(ca, iter->pos.offset); - old_gc = *b; + __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); + + old_gc = gc; if ((old->data_type == BCH_DATA_sb || old->data_type == BCH_DATA_journal) && !bch2_dev_is_online(ca)) { - b->data_type = old->data_type; - b->dirty_sectors = old->dirty_sectors; + gc.data_type = old->data_type; + gc.dirty_sectors = old->dirty_sectors; } /* - * b->data_type doesn't yet include need_discard & need_gc_gen states - + * gc.data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - b->data_type 
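With the verify flag gone, copy_field() reports unconditionally through fsck_err_on(): compare the recomputed value against the stored one, log the discrepancy, overwrite. The same macro pattern in user space (struct usage and its fields are simplified stand-ins):

    #include <stdio.h>
    #include <stdint.h>

    struct usage { uint64_t btree, data, cached; };

    #define copy_field(_f, _msg)                                \
        if (dst->_f != src->_f) {                               \
            fprintf(stderr, "fs has wrong " _msg                \
                ": got %llu, should be %llu\n",                 \
                (unsigned long long) dst->_f,                   \
                (unsigned long long) src->_f);                  \
            dst->_f = src->_f;                                  \
        }

    static void gc_done(struct usage *dst, const struct usage *src)
    {
        copy_field(btree,  "btree");
        copy_field(data,   "data");
        copy_field(cached, "cached");
    }

    int main(void)
    {
        struct usage disk = { 100, 5000, 40 };  /* what's stored */
        struct usage gc   = { 100, 4992, 40 };  /* what GC recomputed */

        gc_done(&disk, &gc);
        return 0;
    }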
= __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - gc = *b; + alloc_data_type_set(&gc, gc.data_type); if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) - bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); percpu_up_read(&c->mark_lock); - if (metadata_only && - gc.data_type != BCH_DATA_sb && - gc.data_type != BCH_DATA_journal && - gc.data_type != BCH_DATA_btree) - return 0; - - if (gen_after(old->gen, gc.gen)) - return 0; - if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" @@ -1460,12 +949,12 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c) { int ret = 0; @@ -1474,11 +963,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + BTREE_ITER_slots|BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - bch2_alloc_write_key(trans, &iter, k, metadata_only))); + bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -1487,14 +976,14 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) return ret; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c) { for_each_member_device(c, ca) { struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err(c, "error allocating ca->buckets[gc]"); return -BCH_ERR_ENOMEM_gc_alloc_start; } @@ -1504,54 +993,29 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } + struct bch_dev *ca = NULL; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - struct bucket *g = gc_bucket(ca, k.k->p.offset); + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + struct bucket *g = gc_bucket(ca, k.k->p.offset); g->gen_valid = 1; g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } - 0; }))); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } -static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) -{ - for_each_member_device(c, ca) { - struct bucket_array *buckets = 
gc_bucket_array(ca); - struct bucket *g; - - for_each_bucket(g, buckets) { - if (metadata_only && - (g->data_type == BCH_DATA_user || - g->data_type == BCH_DATA_cached || - g->data_type == BCH_DATA_parity)) - continue; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - } -} - static int bch2_gc_write_reflink_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1601,35 +1065,27 @@ fsck_err: return ret; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_reflink_done(struct bch_fs *c) { size_t idx = 0; - if (metadata_only) - return 0; - int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_reflink_key(trans, &iter, k, &idx))); c->reflink_gc_nr = 0; return ret; } -static int bch2_gc_reflink_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_reflink_start(struct bch_fs *c) { - - if (metadata_only) - return 0; - c->reflink_gc_nr = 0; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ const __le64 *refcount = bkey_refcount_c(k); if (!refcount) @@ -1652,15 +1108,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) -{ - struct genradix_iter iter; - struct reflink_gc *r; - - genradix_for_each(&c->reflink_gc_table, iter, r) - r->refcount = 0; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1714,30 +1161,20 @@ fsck_err: return ret; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c) { - if (metadata_only) - return 0; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_stripes_key(trans, &iter, k))); } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) -{ - genradix_free(&c->gc_stripes); -} - /** - * bch2_gc - walk _all_ references to buckets, and recompute them: + * bch2_check_allocations - walk all references to buckets, and recompute them: * * @c: filesystem object - * @initial: are we in recovery? - * @metadata_only: are we just checking metadata references, or everything? 
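bch2_gc_reflink_start()/done() recompute reflink refcounts from scratch: record every stored refcount, recount actual references while walking extents, then write back whatever differs. A minimal model with fixed arrays in place of the genradix (all values invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned stored[3]  = { 2, 1, 4 };  /* from the reflink btree */
        unsigned counted[3] = { 2, 2, 4 };  /* recounted from extents */

        for (unsigned i = 0; i < 3; i++)
            if (stored[i] != counted[i]) {
                printf("reflink %u: got %u, should be %u\n",
                       i, stored[i], counted[i]);
                stored[i] = counted[i];
            }
        return 0;
    }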
* * Returns: 0 on success, or standard errcode on failure * @@ -1756,9 +1193,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +int bch2_check_allocations(struct bch_fs *c) { - unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1768,62 +1204,30 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); ret = bch2_gc_start(c) ?: - bch2_gc_alloc_start(c, metadata_only) ?: - bch2_gc_reflink_start(c, metadata_only); + bch2_gc_alloc_start(c) ?: + bch2_gc_reflink_start(c); if (ret) goto out; -again: - gc_pos_set(c, gc_phase(GC_PHASE_START)); - bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_START)); - ret = bch2_gc_btrees(c, initial, metadata_only); + ret = bch2_mark_superblocks(c); + BUG_ON(ret); + ret = bch2_gc_btrees(c); if (ret) goto out; -#if 0 - bch2_mark_pending_btree_node_frees(c); -#endif c->gc_count++; - if (test_bit(BCH_FS_need_another_gc, &c->flags) || - (!iter && bch2_test_restart_gc)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto out; - } - - /* - * XXX: make sure gens we fixed got saved - */ - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_need_another_gc, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - - bch2_gc_stripes_reset(c, metadata_only); - bch2_gc_alloc_reset(c, metadata_only); - bch2_gc_reflink_reset(c, metadata_only); - ret = bch2_gc_reset(c); - if (ret) - goto out; - - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - goto again; - } + bch2_journal_block(&c->journal); out: - if (!ret) { - bch2_journal_block(&c->journal); + ret = bch2_gc_alloc_done(c) ?: + bch2_gc_done(c) ?: + bch2_gc_stripes_done(c) ?: + bch2_gc_reflink_done(c); - ret = bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only); - - bch2_journal_unblock(&c->journal); - } + bch2_journal_unblock(&c->journal); percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ @@ -1852,23 +1256,33 @@ static int gc_btree_gens_key(struct btree_trans *trans, struct bkey_i *u; int ret; + if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) + return -EROFS; + percpu_down_read(&c->mark_lock); + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (ptr_stale(ca, ptr) > 16) { + if (dev_ptr_stale(ca, ptr) > 16) { + rcu_read_unlock(); percpu_up_read(&c->mark_lock); goto update; } } bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; if (gen_after(*gen, ptr->gen)) *gen = ptr->gen; } + rcu_read_unlock(); percpu_up_read(&c->mark_lock); return 0; update: @@ -1881,10 +1295,9 @@ update: return 0; } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, + struct btree_iter *iter, struct bkey_s_c k) { - struct bch_dev *ca = 
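gc_btree_gens_key() rewrites a key as soon as any pointer has fallen more than 16 generations behind its bucket, which drops stale cached pointers and bounds how far gens can drift. A sketch of just the check; dev_ptr_stale()'s real definition goes through the bucket array, this stand-in only compares gens:

    #include <stdio.h>
    #include <stdint.h>

    static int gen_cmp(uint8_t a, uint8_t b) { return (int8_t) (a - b); }

    /* how many gens the pointer is behind its bucket (0 if current) */
    static int ptr_stale(uint8_t bucket_gen, uint8_t ptr_gen)
    {
        int d = gen_cmp(bucket_gen, ptr_gen);

        return d > 0 ? d : 0;
    }

    int main(void)
    {
        uint8_t bucket_gen = 30;
        uint8_t ptr_gens[] = { 29, 10 };

        for (unsigned i = 0; i < 2; i++)
            if (ptr_stale(bucket_gen, ptr_gens[i]) > 16)
                printf("rewrite: ptr %u is %d gens stale\n",
                       i, ptr_stale(bucket_gen, ptr_gens[i]));
        return 0;
    }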
bch_dev_bkey_exists(trans->c, iter->pos.inode); struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; @@ -1899,7 +1312,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); + alloc_data_type_set(&a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } @@ -1927,7 +1340,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } @@ -1945,7 +1358,7 @@ int bch2_gc_gens(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -1954,14 +1367,23 @@ int bch2_gc_gens(struct bch_fs *c) goto err; } + struct bch_dev *ca = NULL; ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, + BTREE_ITER_prefetch, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_oldest_gen(trans, &iter, k))); + BCH_TRANS_COMMIT_no_enospc, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + bch2_alloc_write_oldest_gen(trans, ca, &iter, k); + }))); + bch2_dev_put(ca); + if (ret) goto err; @@ -1985,87 +1407,23 @@ err: return ret; } -static int bch2_gc_thread(void *arg) +static void bch2_gc_gens_work(struct work_struct *work) { - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic64_read(&clock->now); - unsigned last_kick = atomic_read(&c->kick_gc); - - set_freezable(); - - while (1) { - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - if (atomic_read(&c->kick_gc) != last_kick) - break; - - if (c->btree_gc_periodic) { - unsigned long next = last + c->capacity / 16; - - if (atomic64_read(&clock->now) >= next) - break; - - bch2_io_clock_schedule_timeout(clock, next); - } else { - schedule(); - } - - try_to_freeze(); - } - __set_current_state(TASK_RUNNING); - - last = atomic64_read(&clock->now); - last_kick = atomic_read(&c->kick_gc); - - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - ret = bch2_gc(c, false, false); -#else - bch2_gc_gens(c); -#endif - debug_check_no_locks_held(); - } - - return 0; + struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); + bch2_gc_gens(c); + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_gc_thread_stop(struct bch_fs *c) +void bch2_gc_gens_async(struct bch_fs *c) { - struct task_struct *p; - - p = c->gc_thread; - c->gc_thread = NULL; - - if (p) { - kthread_stop(p); - put_task_struct(p); - } + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + !queue_work(c->write_ref_wq, &c->gc_gens_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -int bch2_gc_thread_start(struct bch_fs *c) +void bch2_fs_gc_init(struct bch_fs *c) { - struct task_struct *p; + seqcount_init(&c->gc_pos_lock); - if (c->gc_thread) - return 0; - - p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) { - bch_err_fn(c, PTR_ERR(p)); - return PTR_ERR(p); - } - - 
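The oldest_gen pass is a running minimum over wrapping gens: seed each bucket's slot, then clamp it down for every pointer still referencing the bucket, exactly the gen_after() test in the hunk above. Reduced to a single bucket:

    #include <stdio.h>
    #include <stdint.h>

    static int gen_after(uint8_t a, uint8_t b) { return (int8_t) (a - b) > 0; }

    int main(void)
    {
        uint8_t oldest_gen = 30;               /* seeded from the bucket */
        uint8_t ptr_gens[] = { 28, 30, 25 };   /* live references, invented */

        for (unsigned i = 0; i < 3; i++)
            if (gen_after(oldest_gen, ptr_gens[i]))
                oldest_gen = ptr_gens[i];

        printf("oldest referenced gen: %u\n", oldest_gen);  /* 25 */
        return 0;
    }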
get_task_struct(p); - c->gc_thread = p; - wake_up_process(p); - return 0; + INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 607575f83a00..1b6489d8e0f4 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -6,10 +6,7 @@ #include "btree_types.h" int bch2_check_topology(struct bch_fs *); -int bch2_gc(struct bch_fs *, bool, bool); -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_thread_stop(struct bch_fs *); -int bch2_gc_thread_start(struct bch_fs *); +int bch2_check_allocations(struct bch_fs *); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -37,16 +34,16 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) { return (struct gc_pos) { .phase = phase, - .pos = POS_MIN, .level = 0, + .pos = POS_MIN, }; } static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - return cmp_int(l.phase, r.phase) ?: - bpos_cmp(l.pos, r.pos) ?: - cmp_int(l.level, r.level); + return cmp_int(l.phase, r.phase) ?: + -cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) @@ -60,13 +57,13 @@ static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) } } -static inline struct gc_pos gc_pos_btree(enum btree_id id, - struct bpos pos, unsigned level) +static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, + struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(id), - .pos = pos, + .phase = btree_id_to_gc_phase(btree), .level = level, + .pos = pos, }; } @@ -76,19 +73,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id id, */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -} - -/* - * GC position of the pointer to a btree root: we don't use - * gc_pos_pointer_to_btree_node() here to avoid a potential race with - * btree_split() increasing the tree depth - the new root will have level > the - * old root and thus have a greater gc position than the old root, but that - * would be incorrect since once gc has marked the root it's not coming back. 
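Note the comparator change in btree_gc.h: at equal phase, a higher level now sorts before a lower one (the negated cmp_int()), matching the new top-down walk so a parent counts as visited before its children. A compilable check of that ordering; cmp_int() here is a plain macro standing in for the kernel's:

    #include <assert.h>

    struct gc_pos { int phase; unsigned level; unsigned long long pos; };

    #define cmp_int(a, b) (((a) > (b)) - ((a) < (b)))

    static int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
    {
        int c;

        if ((c = cmp_int(l.phase, r.phase)))
            return c;
        if ((c = -cmp_int(l.level, r.level)))  /* level 2 before level 0 */
            return c;
        return cmp_int(l.pos, r.pos);
    }

    int main(void)
    {
        struct gc_pos interior = { 1, 2, 10 };
        struct gc_pos leaf     = { 1, 0, 10 };

        assert(gc_pos_cmp(interior, leaf) < 0);  /* interior visited first */
        return 0;
    }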
- */ -static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -{ - return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); + return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) @@ -104,11 +89,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } -static inline void bch2_do_gc_gens(struct bch_fs *c) -{ - atomic_inc(&c->kick_gc); - if (c->gc_thread) - wake_up_process(c->gc_thread); -} +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_gens_async(struct bch_fs *); +void bch2_fs_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index debb0edc3455..cbf8f5d90602 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -23,6 +23,18 @@ #include <linux/sched/mm.h> +static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) +{ + prt_printf(out, "btree=%s l=%u seq %llux\n", + bch2_btree_id_str(BTREE_NODE_ID(bn)), + (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + prt_str(out, "min: "); + bch2_bpos_to_text(out, bn->min_key); + prt_newline(out); + prt_str(out, "max: "); + bch2_bpos_to_text(out, bn->max_key); +} + void bch2_btree_node_io_unlock(struct btree *b) { EBUG_ON(!btree_node_write_in_flight(b)); @@ -217,7 +229,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { - struct bset_tree *t; bool ret = false; for_each_bset(b, t) { @@ -288,8 +299,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, static void btree_node_sort(struct bch_fs *c, struct btree *b, unsigned start_idx, - unsigned end_idx, - bool filter_whiteouts) + unsigned end_idx) { struct btree_node *out; struct sort_iter_stack sort_iter; @@ -320,7 +330,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); out->keys.u64s = cpu_to_le16(u64s); @@ -426,13 +436,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, - b->nsets, false); + btree_node_sort(c, b, unwritten_idx, b->nsets); ret = true; } if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx); ret = true; } @@ -441,8 +450,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) void bch2_btree_build_aux_trees(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, !bset_written(b, bset(b, t)) && @@ -524,7 +531,9 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u/%u", + printbuf_indent_add(out, 2); + + prt_printf(out, "\nnode offset %u/%u", b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -543,6 +552,7 @@ static int __btree_err(int ret, const char *fmt, ...) 
{ struct printbuf out = PRINTBUF; + bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; btree_err_msg(&out, c, ca, b, i, b->written, write); @@ -564,12 +574,14 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (ret != -BCH_ERR_btree_node_read_err_fixable) + if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) bch2_sb_error_count(c, err_type); switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + ret = !silent + ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) goto fsck_err; @@ -577,14 +589,17 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_want_retry: case -BCH_ERR_btree_node_read_err_must_retry: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); break; case -BCH_ERR_btree_node_read_err_bad_node: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; default: @@ -619,8 +634,6 @@ fsck_err: __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k; @@ -1021,18 +1034,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (want %llx got %llx)\n" - "got btree %s level %llu pos %s", - bp->seq, b->data->keys.seq, - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data), - buf.buf); + "got wrong btree node: got\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "bad btree header: seq 0"); + "bad btree header: seq 0\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } while (b->written < (ptr_written ?: btree_sectors(c))) { @@ -1095,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonce = btree_nonce(i, b->written << 9); struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); csum_bad = bch2_crc_cmp(bne->csum, csum); - if (csum_bad) + if (ca && csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); btree_err_on(csum_bad, @@ -1249,12 +1263,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); + rcu_read_lock(); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } + rcu_read_unlock(); if (!ptr_written) set_btree_node_need_rewrite(b); @@ -1279,8 +1295,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct 
bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; struct btree *b = rb->b; - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; struct printbuf buf = PRINTBUF; @@ -1292,8 +1308,8 @@ static void btree_node_read_work(struct work_struct *work) while (1) { retry = true; bch_info(c, "retrying read"); - ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + rb->have_ioref = ca != NULL; bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1307,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work) start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) @@ -1363,7 +1379,7 @@ static void btree_node_read_endio(struct bio *bio) struct bch_fs *c = rb->c; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1560,7 +1576,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct btree_node_read_all *ra = rb->ra; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1602,14 +1618,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; rb->b = b; rb->ra = ra; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->idx = i; rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; @@ -1679,7 +1695,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1691,7 +1707,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, rb->b = b; rb->ra = NULL; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; @@ -1846,7 +1862,6 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - struct bch_extent_ptr *ptr; int ret = 0; btree_bounce_free(c, @@ -1896,13 +1911,14 @@ static void btree_node_write_endio(struct bio *bio) struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref ? 
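A change that recurs through the rest of btree_io.c: bch_dev_bkey_exists(), which asserted that the device must exist, gives way to lookups that may return NULL (bch2_dev_rcu(), bch2_dev_get_ioref()), and every I/O path now copes with a device that has gone away. The pattern in miniature; the devs[] table is a stand-in and no real refcounting happens here:

    #include <stdio.h>

    struct bch_dev { int idx; };

    static struct bch_dev dev1 = { 1 };
    static struct bch_dev *devs[4] = { NULL, &dev1, NULL, NULL };

    /* may return NULL: the caller must handle a removed device */
    static struct bch_dev *dev_get(unsigned idx)
    {
        return idx < 4 ? devs[idx] : NULL;
    }

    int main(void)
    {
        struct bch_dev *ca = dev_get(2);

        if (!ca)
            puts("device gone: fail the read gracefully instead of oopsing");
        return 0;
    }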
bch2_dev_have_ref(c, wbio->dev) : NULL; unsigned long flags; if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + if (!ca || + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { @@ -1969,7 +1985,6 @@ static void btree_write_submit(struct work_struct *work) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; - struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; @@ -2095,11 +2110,11 @@ do_write: unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); - b->whiteout_u64s = 0; - - u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); + u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); le16_add_cpu(&i->u64s, u64s); + b->whiteout_u64s = 0; + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); set_needs_whiteout(i, false); @@ -2226,7 +2241,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) { bool invalidated_iter = false; struct btree_node_entry *bne; - struct bset_tree *t; if (!btree_node_just_written(b)) return false; @@ -2249,7 +2263,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets, true); + btree_node_sort(c, b, 0, b->nsets); invalidated_iter = true; } else { invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); @@ -2346,20 +2360,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 20); printbuf_tabstop_push(out, 10); - prt_tab(out); - prt_str(out, "nr"); - prt_tab(out); - prt_str(out, "size"); - prt_newline(out); + prt_printf(out, "\tnr\tsize\n"); for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { u64 nr = atomic64_read(&c->btree_write_stats[i].nr); u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - prt_printf(out, "%s:", bch2_btree_write_types[i]); - prt_tab(out); - prt_u64(out, nr); - prt_tab(out); + prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); prt_newline(out); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e251cb6b965f..2b8b564fc560 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b, static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) return bch2_compact_whiteouts(c, b, COMPACT_LAZY); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2a211a4bebd1..5bf98cb8b15d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l, static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_successor(p); } else { p = bpos_nosnap_successor(p); @@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? 
*/ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_predecessor(p); } else { p = bpos_nosnap_predecessor(p); @@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; @@ -253,13 +253,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + BUG_ON((iter->flags & BTREE_ITER_is_extents) && + (iter->flags & BTREE_ITER_all_snapshots)); - BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && + (iter->flags & BTREE_ITER_all_snapshots) && !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) @@ -269,10 +269,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || @@ -289,7 +289,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bch2_debug_check_iterators) return 0; - if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; if (bkey_err(k) || !k.k) @@ -300,8 +300,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_nopreserve| + BTREE_ITER_all_snapshots); prev = bch2_btree_iter_prev(©); if (!prev.k) goto out; @@ -897,7 +897,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); @@ -944,7 +944,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) @@ -999,6 +999,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); + trans->locked = true; if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1162,6 +1163,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); + unsigned max_level = path->level; EBUG_ON(btree_path_node(path, path->level) && !btree_node_locked(path, path->level)); @@ -1192,6 +1194,16 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } + + if (unlikely(max_level > path->level)) { + struct btree_path *linked; + unsigned iter; + + 
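The flag renames here are mechanical, but the successor logic they select is worth seeing: the snapshot ID is the least significant component of a key's position, so "next position" either steps through snapshots or skips past all of them. Modeled loosely after bpos_successor()/bpos_nosnap_successor(), with field widths simplified:

    #include <stdio.h>
    #include <stdint.h>

    struct bpos { uint64_t inode, offset; uint32_t snapshot; };

    /* step to the very next position, snapshot field included */
    static struct bpos bpos_successor(struct bpos p)
    {
        if (++p.snapshot)
            return p;
        if (++p.offset)
            return p;
        p.inode++;
        return p;
    }

    /* step to the next (inode, offset), ignoring snapshots */
    static struct bpos bpos_nosnap_successor(struct bpos p)
    {
        p.snapshot = 0;
        if (++p.offset)
            return p;
        p.inode++;
        return p;
    }

    int main(void)
    {
        struct bpos p = { 1, 10, 3 };
        struct bpos a = bpos_successor(p);
        struct bpos b = bpos_nosnap_successor(p);

        printf("all_snapshots: %llu:%llu:%u\n", (unsigned long long) a.inode,
               (unsigned long long) a.offset, (unsigned) a.snapshot);
        printf("nosnap:        %llu:%llu:%u\n", (unsigned long long) b.inode,
               (unsigned long long) b.offset, (unsigned) b.snapshot);
        return 0;
    }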
trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter) + for (unsigned j = path->level + 1; j < max_level; j++) + linked->l[j] = path->l[j]; + } + out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: @@ -1221,11 +1233,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent) + bool intent, unsigned long ip) { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); __btree_path_get(trans->paths + new, intent); +#ifdef TRACK_PATH_ALLOCATED + trans->paths[new].ip_allocated = ip; +#endif return new; } @@ -1234,7 +1249,7 @@ btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { __btree_path_put(trans->paths + path, intent); - path = btree_path_clone(trans, path, intent); + path = btree_path_clone(trans, path, intent, ip); trans->paths[path].preserve = false; return path; } @@ -1334,6 +1349,26 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t __clear_bit(path, trans->paths_allocated); } +static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) +{ + unsigned l = path->level; + + do { + if (!btree_path_node(path, l)) + break; + + if (!is_btree_node(path, l)) + return false; + + if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) + return false; + + l++; + } while (l < path->locks_want); + + return true; +} + void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { struct btree_path *path = trans->paths + path_idx, *dup; @@ -1348,10 +1383,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; - if (path->should_be_locked && - !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup))) - return; + if (path->should_be_locked && !trans->restarted) { + if (!dup) + return; + + if (!(trans->locked + ? 
bch2_btree_path_relock_norestart(trans, dup) + : bch2_btree_path_can_relock(trans, dup))) + return; + } if (dup) { dup->preserve |= path->preserve; @@ -1384,22 +1424,26 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) (void *) trans->last_restarted_ip); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +{ + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); +} + noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu", + prt_printf(buf, "transaction updates for %s journal seq %llu\n", trans->fn, trans->journal_res.seq); - prt_newline(buf); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS", + prt_printf(buf, "update: btree=%s cached=%u %pS\n", bch2_btree_id_str(i->btree_id), i->cached, (void *) i->ip_allocated); - prt_newline(buf); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1428,23 +1472,63 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', + path->cached ? 'C' : 'B', bch2_btree_id_str(path->btree_id), path->level); bch2_bpos_to_text(out, path->pos); - prt_printf(out, " locks %u", path->nodes_locked); #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif +} + +static const char *btree_node_locked_str(enum btree_node_locked_type t) +{ + switch (t) { + case BTREE_NODE_UNLOCKED: + return "unlocked"; + case BTREE_NODE_READ_LOCKED: + return "read"; + case BTREE_NODE_INTENT_LOCKED: + return "intent"; + case BTREE_NODE_WRITE_LOCKED: + return "write"; + default: + return NULL; + } +} + +void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +{ + bch2_btree_path_to_text_short(out, trans, path_idx); + + struct btree_path *path = trans->paths + path_idx; + + prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { + prt_printf(out, "l=%u locks %s seq %u node ", l, + btree_node_locked_str(btree_node_locked_type(path, l)), + path->l[l].lock_seq); + + int ret = PTR_ERR_OR_ZERO(path->l[l].b); + if (ret) + prt_str(out, bch2_err_str(ret)); + else + prt_printf(out, "%px", path->l[l].b); + prt_newline(out); + } + printbuf_indent_sub(out, 2); } static noinline __cold @@ -1456,8 +1540,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_idx_inorder(trans, iter) - bch2_btree_path_to_text(out, trans, iter.path_idx); + trans_for_each_path_idx_inorder(trans, iter) { + bch2_btree_path_to_text_short(out, trans, iter.path_idx); + prt_newline(out); + } } noinline __cold @@ -1608,11 +1694,12 @@ btree_path_idx_t bch2_path_get(struct 
btree_trans *trans, unsigned flags, unsigned long ip) { struct btree_path *path; - bool cached = flags & BTREE_ITER_CACHED; - bool intent = flags & BTREE_ITER_INTENT; + bool cached = flags & BTREE_ITER_cached; + bool intent = flags & BTREE_ITER_intent; struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; + bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); @@ -1657,7 +1744,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths_sorted = false; } - if (!(flags & BTREE_ITER_NOPRESERVE)) + if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; if (path->intent_ref) @@ -1678,6 +1765,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, return path_idx; } +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_nopreserve| + BTREE_ITER_intent, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path_idx; +} + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { @@ -1719,6 +1822,19 @@ hole: return (struct bkey_s_c) { u, NULL }; } + +void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; +} /* Btree iterators: */ int __must_check @@ -1733,9 +1849,11 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; + bch2_trans_verify_not_unlocked(trans); + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); @@ -1774,7 +1892,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: @@ -1835,13 +1953,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { + if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, path->level + 1); + /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ iter->path = bch2_btree_path_set_pos(trans, iter->path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); path = btree_iter_path(trans, iter); @@ -1859,7 +1980,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); 
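/*
 * (A hedged summary of this series' locking rules, not a comment from
 * the patch itself: once a path is marked should_be_locked,
 * bch2_trans_relock() must be able to re-take its node locks or the
 * transaction restarts, and bch2_path_put() - see the earlier hunk -
 * will only free such a path if a duplicate can take over its locks:
 * relocked directly when the transaction is locked, or vetted with
 * bch2_btree_path_can_relock() when it isn't.)
 */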
btree_path_set_should_be_locked(btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); @@ -1878,11 +1999,11 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? bpos_eq(pos, SPOS_MAX) : bkey_eq(pos, SPOS_MAX)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -1891,11 +2012,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? bpos_eq(pos, POS_MIN) : bkey_eq(pos, POS_MIN)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -2006,7 +2127,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); + + if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) return bkey_s_c_null; @@ -2015,17 +2139,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL, + iter->flags & BTREE_ITER_intent, 0, + iter->flags|BTREE_ITER_cached| + BTREE_ITER_cached_nofill, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_CACHED) ?: + iter->flags|BTREE_ITER_cached) ?: bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -2053,7 +2177,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct btree_path_level *l; iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2078,7 +2202,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; @@ -2089,10 +2213,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + if (unlikely(iter->flags & BTREE_ITER_with_journal)) k = btree_trans_peek_journal(trans, iter, k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_updates(trans, iter, &k); @@ -2144,11 +2268,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct 
btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + bch2_trans_verify_not_unlocked(trans); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2171,7 +2296,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * isn't monotonically increasing before FILTER_SNAPSHOTS, and * that's what we check against in extents mode: */ - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? bkey_gt(k.k->p, end) : k.k->p.inode > end.inode)) goto end; @@ -2179,13 +2304,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - (iter->flags & BTREE_ITER_INTENT) && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && + (iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && !iter->update_path) { struct bpos pos = k.k->p; @@ -2200,12 +2325,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); if (unlikely(ret)) { @@ -2218,7 +2343,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * We can never have a key in a leaf node at POS_MAX, so * we don't have to check these successor() calls: */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && !bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { @@ -2227,7 +2352,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e } if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_successor(iter, k.k->p); continue; } @@ -2237,12 +2362,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * equal to the key we just returned - except extents can * straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (!(iter->flags & BTREE_ITER_is_extents)) iter_pos = k.k->p; else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? 
bkey_gt(iter_pos, end) : bkey_ge(iter_pos, end))) goto end; @@ -2253,7 +2378,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); @@ -2266,7 +2391,7 @@ out_no_locked: btree_path_set_should_be_locked(trans->paths + iter->update_path); } - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; ret = bch2_btree_iter_verify_ret(iter, k); @@ -2316,21 +2441,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_path_idx_t saved_path = 0; int ret; + bch2_trans_verify_not_unlocked(trans); EBUG_ON(btree_iter_path(trans, iter)->cached || btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_WITH_JOURNAL) + if (iter->flags & BTREE_ITER_with_journal) return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2345,17 +2471,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || - ((iter->flags & BTREE_ITER_IS_EXTENTS) + ((iter->flags & BTREE_ITER_is_extents) ? 
bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_filter_snapshots) { if (k.k->p.snapshot == iter->snapshot) goto got_key; @@ -2366,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; iter->k = saved_k; @@ -2379,9 +2505,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k.k->p.snapshot)) { if (saved_path) bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent, + _THIS_IP_); path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; @@ -2392,9 +2519,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) } got_key: if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; continue; } @@ -2418,11 +2545,11 @@ got_key: if (bkey_lt(k.k->p, iter->pos)) iter->pos = k.k->p; - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2452,12 +2579,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; + bch2_trans_verify_not_unlocked(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; @@ -2467,7 +2595,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2476,22 +2604,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - if ((iter->flags & BTREE_ITER_CACHED) || - !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + if ((iter->flags & BTREE_ITER_cached) || + !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; - if (unlikely((iter->flags & 
BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) { bch2_btree_trans_peek_slot_updates(trans, iter, &k); if (k.k) goto out; } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + if (unlikely(iter->flags & BTREE_ITER_with_journal) && (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; @@ -2506,12 +2634,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos next; struct bpos end = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) end.offset = U64_MAX; EBUG_ON(btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_INTENT) { + if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); @@ -2542,7 +2670,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (iter->flags & BTREE_ITER_is_extents) { bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, (next.inode == iter->pos.inode @@ -2726,13 +2854,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; @@ -2757,9 +2885,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_not_extents; + flags |= BTREE_ITER_snapshot_field; + flags |= BTREE_ITER_all_snapshots; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, __bch2_btree_iter_flags(trans, btree_id, flags), @@ -2782,9 +2910,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->ip_allocated = _RET_IP_; #endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -2953,7 +3081,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { - drop_locks_do(trans, (cond_resched(), 0)); + bch2_trans_unlock(trans); + cond_resched(); now = local_clock(); } trans->last_begin_time = now; @@ -2963,11 +3092,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; + trans->locked = true; + if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } + bch2_trans_verify_not_unlocked(trans); return trans->restart_count; } @@ -3020,7 +3152,7 @@ 
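(Condensed sketch of the trans->locked lifecycle this commit threads through the hunks above and below - the function names are the patch's own, the bodies abbreviated to just the lines this commit touches:)

	u32 bch2_trans_begin(struct btree_trans *trans)
	{
		/* ... existing begin work: unlocking, srcu handling elided ... */
		trans->locked = true;		/* a transaction always begins locked */
		/* ... */
		bch2_trans_verify_not_unlocked(trans);
		return trans->restart_count;
	}

	void bch2_trans_unlock(struct btree_trans *trans)
	{
		__bch2_trans_unlock(trans);	/* drop every path's node locks */
		trans->locked = false;
		trans->last_unlock_ip = _RET_IP_;
	}

	static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
	{
		if (!trans->locked)
			bch2_trans_unlocked_error(trans);	/* panics, printing last_unlock_ip */
	}
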
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) */ BUG_ON(pos_task && pid == pos_task->pid && - bch2_trans_locked(pos)); + pos->locked); if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); @@ -3036,8 +3168,9 @@ got_trans: trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; trans->locking_wait.task = current; + trans->locked = true; trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); trans->nr_paths = ARRAY_SIZE(trans->_paths); trans->paths_allocated = trans->_paths_allocated; @@ -3166,13 +3299,11 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_tab(out); - prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', b->level, bch2_btree_id_str(b->btree_id)); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_tab(out); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_printf(out, "\t locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -3229,10 +3360,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_printf(out, " blocked for %lluus on", - div_u64(local_clock() - trans->locking_wait.start_time, - 1000)); - prt_newline(out); + prt_printf(out, " blocked for %lluus on\n", + div_u64(local_clock() - trans->locking_wait.start_time, 1000)); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1c70836dd7cc..eab2a25bdc7a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -216,9 +216,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); + static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { + bch2_trans_verify_not_unlocked(trans); + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -227,6 +231,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, + unsigned, struct bpos); + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); /* @@ -283,7 +290,6 @@ int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); void bch2_trans_unlock_long(struct btree_trans *); -bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { @@ -309,6 +315,14 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) bch2_trans_in_restart_error(trans); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) +{ + if (!trans->locked) + bch2_trans_unlocked_error(trans); +} + __always_inline static int btree_trans_restart_nounlock(struct btree_trans *trans, int 
err) { @@ -386,10 +400,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) new_pos.snapshot = iter->snapshot; __bch2_btree_iter_set_pos(iter, new_pos); @@ -397,7 +411,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { - BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); iter->pos = bkey_start_pos(&iter->k); } @@ -416,20 +430,20 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; + flags |= BTREE_ITER_is_extents; - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_snapshot_field) && !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + flags &= ~BTREE_ITER_all_snapshots; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_all_snapshots) && btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; + flags |= BTREE_ITER_filter_snapshots; if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; + flags |= BTREE_ITER_with_journal; return flags; } @@ -439,10 +453,10 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, unsigned flags) { if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; return __bch2_btree_iter_flags(trans, btree_id, flags); } @@ -494,18 +508,7 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, unsigned, unsigned, unsigned); void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline void set_btree_iter_dontneed(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - - if (!iter->path || trans->restarted) - return; - - struct btree_path *path = btree_iter_path(trans, iter); - path->preserve = false; - if (path->ref == 1) - path->should_be_locked = false; -} +void bch2_set_btree_iter_dontneed(struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -619,14 +622,14 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -634,7 +637,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * struct bpos end, unsigned flags) { - if (!(flags & BTREE_ITER_SLOTS)) + if (!(flags & BTREE_ITER_slots)) return bch2_btree_iter_peek_upto(iter, end); if (bkey_gt(iter->pos, end)) @@ -699,16 +702,12 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ +#define for_each_btree_key_upto_continue(_trans, _iter, \ + _end, _flags, _k, _do) \ ({ \ - struct btree_iter _iter; \ struct bkey_s_c _k; \ int _ret3 = 0; \ \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ do { \ _ret3 = lockrestart_do(_trans, ({ \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ @@ -724,6 +723,21 @@ transaction_restart: \ _ret3; \ }) +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ + for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + bch2_trans_begin(trans); \ + \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ +}) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ @@ -794,14 +808,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } -#define for_each_btree_key_old(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -861,6 +867,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, }) void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1e8cf49a6935..332dbf164929 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -623,3 +623,20 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, keys->data[dst++] = *i; keys->nr = keys->gap = dst; } + +void bch2_journal_keys_dump(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct printbuf buf = PRINTBUF; + + pr_info("%zu keys:", keys->nr); + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + } + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index af25046ebcaa..1ba4a79b0ef9 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ 
-70,4 +70,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, unsigned, unsigned, struct bpos, struct bpos); +void bch2_journal_keys_dump(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7dafa1accec2..75f5e6fe4634 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -383,9 +383,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); - iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -456,7 +456,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -515,23 +515,10 @@ retry: fill: path->uptodate = BTREE_ITER_UPTODATE; - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { - /* - * Using the underscore version because we haven't set - * path->uptodate yet: - */ - if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { - trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); - goto err; - } - - ret = btree_key_cache_fill(trans, path, ck); - if (ret) - goto err; - - ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { + ret = bch2_btree_path_upgrade(trans, path, 1) ?: + btree_key_cache_fill(trans, path, ck) ?: + bch2_btree_path_relock(trans, path, _THIS_IP_); if (ret) goto err; @@ -622,13 +609,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + BTREE_ITER_cached| + BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; ret = bch2_btree_iter_traverse(&c_iter); if (ret) @@ -661,14 +648,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) + !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_KEY_CACHE_RECLAIM| - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - BTREE_TRIGGER_NORUN) ?: + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| @@ -790,7 +777,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, * flushing. The flush callback will not proceed unless ->seq matches * the latest pin, so make sure it starts with a consistent value. 
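 *
 * (Rough paraphrase, not the exact flush code: the flush callback
 * effectively does "if (ck->seq != seq-of-this-pin) bail", so taking
 * ck->seq from trans->journal_res.seq whenever a new pin is added
 * keeps the cached key and its journal pin in agreement.)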
*/ - if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || !journal_pin_active(&ck->journal)) { ck->seq = trans->journal_res.seq; } @@ -835,6 +822,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, int srcu_idx; mutex_lock(&bc->lock); + bc->requested_to_free += sc->nr_to_scan; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -853,6 +842,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, atomic_long_dec(&bc->nr_freed); freed++; bc->nr_freed_nonpcpu--; + bc->freed++; } list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { @@ -866,6 +856,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, atomic_long_dec(&bc->nr_freed); freed++; bc->nr_freed_pcpu--; + bc->freed++; } rcu_read_lock(); @@ -884,13 +875,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ck = container_of(pos, struct bkey_cached, hash); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + bc->skipped_dirty++; goto next; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + bc->skipped_accessed++; goto next; } else if (bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); + bc->moved_to_freelist++; + } else { + bc->skipped_lock_fail++; } scanned++; @@ -1037,14 +1033,47 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) return 0; } -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); - prt_newline(out); - prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); - prt_newline(out); - prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); - prt_newline(out); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 12); + + unsigned flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); + prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); + prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); + prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); + + prt_printf(out, "\nshrinker:\n"); + prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); + prt_printf(out, "freed:\t%lu\r\n", bc->freed); + prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); + prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); + prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); + prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); + + prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); + + struct bkey_cached *ck; + unsigned iter = 0; + list_for_each_entry(ck, &bc->freed_nonpcpu, list) { + prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + + iter = 0; + list_for_each_entry(ck, &bc->freed_pcpu, list) { + prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + mutex_unlock(&bc->lock); + memalloc_flags_restore(flags); } void bch2_btree_key_cache_exit(void) diff --git 
a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 290e4e57df5b..237e8bb3ac40 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -24,6 +24,14 @@ struct btree_key_cache { atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; + + /* shrinker stats */ + unsigned long requested_to_free; + unsigned long freed; + unsigned long moved_to_freelist; + unsigned long skipped_dirty; + unsigned long skipped_accessed; + unsigned long skipped_lock_fail; }; struct bkey_cached_key { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index f2caf491957e..c3e9b0cc7bbd 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -83,8 +83,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; - prt_printf(out, "Found lock cycle (%u entries):", g->nr); - prt_newline(out); + prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); for (i = g->g; i < g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); @@ -224,8 +223,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) bch2_btree_trans_to_text(&buf, trans); - prt_printf(&buf, "backtrace:"); - prt_newline(&buf); + prt_printf(&buf, "backtrace:\n"); printbuf_indent_add(&buf, 2); bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); @@ -492,8 +490,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (path->uptodate == BTREE_ITER_NEED_RELOCK) path->uptodate = BTREE_ITER_UPTODATE; - bch2_trans_verify_locks(trans); - return path->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -609,7 +605,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa { struct get_locks_fail f; - return btree_path_get_locks(trans, path, false, &f); + bool ret = btree_path_get_locks(trans, path, false, &f); + bch2_trans_verify_locks(trans); + return ret; } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -632,7 +630,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true, f); + bool ret = btree_path_get_locks(trans, path, true, f); + bch2_trans_verify_locks(trans); + return ret; } bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -640,8 +640,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) - return true; + bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); + if (ret) + goto out; /* * XXX: this is ugly - we'd prefer to not be mucking with other @@ -675,8 +676,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, btree_path_get_locks(trans, linked, true, NULL); } } - - return false; +out: + bch2_trans_verify_locks(trans); + return ret; } void __bch2_btree_path_downgrade(struct btree_trans *trans, @@ -725,82 +727,100 @@ void bch2_trans_downgrade(struct btree_trans *trans) bch2_btree_path_downgrade(trans, path); } -int bch2_trans_relock(struct btree_trans *trans) +static inline void __bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; unsigned i; - if (unlikely(trans->restarted)) - return -((int) trans->restarted); + trans_for_each_path(trans, path, i) + __bch2_btree_path_unlock(trans, path); +} - trans_for_each_path(trans, path, 
i) { - struct get_locks_fail f; +static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, + struct get_locks_fail *f, bool trace) +{ + if (!trace) + goto out; - if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) { - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", - f.l, path->l[f.l].lock_seq); - if (IS_ERR_OR_NULL(f.b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); - } else { - prt_printf(&buf, "%u", f.b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f.b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + if (IS_ERR_OR_NULL(f->b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); + } else { + prt_printf(&buf, "%u", f->b->c.lock.seq); - count_event(trans->c, trans_restart_relock); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f->b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); } - return 0; + count_event(trans->c, trans_restart_relock); +out: + __bch2_trans_unlock(trans); + bch2_trans_verify_locks(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } -int bch2_trans_relock_notrace(struct btree_trans *trans) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) { - struct btree_path *path; - unsigned i; + bch2_trans_verify_locks(trans); if (unlikely(trans->restarted)) return -((int) trans->restarted); + if (unlikely(trans->locked)) + goto out; + + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; - trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path)) { - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - } + !btree_path_get_locks(trans, path, false, &f)) + return bch2_trans_relock_fail(trans, path, &f, trace); + } + + trans->locked = true; +out: + bch2_trans_verify_locks(trans); return 0; } +int bch2_trans_relock(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, true); +} + +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, false); +} + void bch2_trans_unlock_noassert(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; } void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; } void 
bch2_trans_unlock_long(struct btree_trans *trans) @@ -809,17 +829,6 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } -bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { @@ -836,15 +845,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, void bch2_btree_path_verify_locks(struct btree_path *path) { - unsigned l; + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level) && + !path->nodes_locked); - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); + if (!path->nodes_locked) return; - } - for (l = 0; l < BTREE_MAX_DEPTH; l++) { + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); int have = btree_node_locked_type(path, l); @@ -857,8 +870,24 @@ void bch2_btree_path_verify_locks(struct btree_path *path) } } +static bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + if (path->nodes_locked) + return true; + return false; +} + void bch2_trans_verify_locks(struct btree_trans *trans) { + if (!trans->locked) { + BUG_ON(bch2_trans_locked(trans)); + return; + } + struct btree_path *path; unsigned i; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4bd72c855da1..7f41545b9147 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -364,14 +364,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f; + struct get_locks_fail f = {}; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->uptodate == BTREE_ITER_UPTODATE) + : path->nodes_locked) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index bbec91e8e650..74e1ff225674 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -19,6 +20,26 @@ #include <linux/prefetch.h> +static const char * const trans_commit_flags_strs[] = { +#define x(n, ...) 
#n, + BCH_TRANS_COMMIT_FLAGS() +#undef x + NULL +}; + +void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) +{ + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + + prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); + + flags >>= BCH_WATERMARK_BITS; + if (flags) { + prt_char(out, ' '); + bch2_prt_bitflags(out, trans_commit_flags_strs, flags); + } +} + static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -315,8 +336,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + !(i->flags & BTREE_UPDATE_internal_snapshot_node) && + test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && i->k->k.p.snapshot && bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } @@ -443,13 +464,13 @@ static int run_one_mem_trigger(struct btree_trans *trans, verify_update_old_key(trans, i); - if (unlikely(flags & BTREE_TRIGGER_NORUN)) + if (unlikely(flags & BTREE_TRIGGER_norun)) return 0; if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); } else { ret = bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: @@ -472,11 +493,11 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; + unsigned flags = i->flags|BTREE_TRIGGER_transactional; verify_update_old_key(trans, i); - if ((i->flags & BTREE_TRIGGER_NORUN) || + if ((i->flags & BTREE_TRIGGER_norun) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) return 0; @@ -486,8 +507,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE|flags) ?: 1; + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; @@ -572,7 +593,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif @@ -590,7 +611,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } @@ -609,6 +630,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, 
unsigned flags, unsigned u64s = 0; int ret; + bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_in_restart(trans); + if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); @@ -686,7 +710,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; } @@ -705,7 +729,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; verify_update_old_key(trans, i); @@ -766,16 +790,15 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", + prt_printf(err, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); - prt_newline(err); printbuf_indent_add(err, 2); bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); @@ -796,8 +819,7 @@ static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans * struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); - prt_newline(&buf); + prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); printbuf_indent_add(&buf, 2); bch2_journal_entry_to_text(&buf, c, i); @@ -988,6 +1010,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; + bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_in_restart(trans); + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; @@ -1000,10 +1025,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans_for_each_update(trans, i) { struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; + enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, invalid_flags, &buf))) @@ -1018,10 +1043,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) { - enum bkey_invalid_flags invalid_flags = 0; + enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -1065,7 +1090,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; /* we're going to journal 
the key being updated: */
@@ -1086,6 +1111,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	}
 retry:
 	errored_at = NULL;
+	bch2_trans_verify_not_unlocked(trans);
 	bch2_trans_verify_not_in_restart(trans);
 	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
 		memset(&trans->journal_res, 0, sizeof(trans->journal_res));
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index c69b233c41bb..d63db4fefe73 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -163,9 +163,21 @@ struct btree_cache {
 	/* Number of elements in live + freeable lists */
 	unsigned	used;
 	unsigned	reserve;
+	unsigned	freed;
+	unsigned	not_freed_lock_intent;
+	unsigned	not_freed_lock_write;
+	unsigned	not_freed_dirty;
+	unsigned	not_freed_read_in_flight;
+	unsigned	not_freed_write_in_flight;
+	unsigned	not_freed_noevict;
+	unsigned	not_freed_write_blocked;
+	unsigned	not_freed_will_make_reachable;
+	unsigned	not_freed_access_bit;
 	atomic_t	dirty;
 	struct shrinker	*shrink;
+	unsigned	used_by_btree[BTREE_ID_NR];
+
 	/*
 	 * If we need to allocate memory for a new btree node and that
 	 * allocation fails, we can cannibalize another node in the btree cache
@@ -187,36 +199,89 @@ struct btree_node_iter {
 	} data[MAX_BSETS];
 };
 
+#define BTREE_ITER_FLAGS()		\
+	x(slots)			\
+	x(intent)			\
+	x(prefetch)			\
+	x(is_extents)			\
+	x(not_extents)			\
+	x(cached)			\
+	x(with_key_cache)		\
+	x(with_updates)			\
+	x(with_journal)			\
+	x(snapshot_field)		\
+	x(all_snapshots)		\
+	x(filter_snapshots)		\
+	x(nopreserve)			\
+	x(cached_nofill)		\
+	x(key_cache_fill)		\
+
+#define STR_HASH_FLAGS()		\
+	x(must_create)			\
+	x(must_replace)
+
+#define BTREE_UPDATE_FLAGS()		\
+	x(internal_snapshot_node)	\
+	x(nojournal)			\
+	x(key_cache_reclaim)
+
+
 /*
- * Iterate over all possible positions, synthesizing deleted keys for holes:
- */
-static const __maybe_unused u16 BTREE_ITER_SLOTS		= 1 << 0;
-/*
- * Indicates that intent locks should be taken on leaf nodes, because we expect
- * to be doing updates:
- */
-static const __maybe_unused u16 BTREE_ITER_INTENT		= 1 << 1;
-/*
- * Causes the btree iterator code to prefetch additional btree nodes from disk:
- */
-static const __maybe_unused u16 BTREE_ITER_PREFETCH		= 1 << 2;
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
+ * BTREE_TRIGGER_norun - don't run triggers at all
+ *
+ * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
+ * a transaction commit: triggers may generate new updates
+ *
+ * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
+ * commit: we have our journal reservation, we're holding btree node write
+ * locks, and we know the transaction is going to commit (returning an error
+ * here is a fatal error, causing us to go emergency read-only)
+ *
+ * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+ *
+ * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+ * trigger
 */
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS		= 1 << 3;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS		= 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_CACHED		= 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE	= 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES	= 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL	= 1 << 8;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS	= 1 << 9;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS	= 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS	= 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE		= 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL	= 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL	= 1 << 14;
-#define __BTREE_ITER_FLAGS_END 15
+#define BTREE_TRIGGER_FLAGS()		\
+	x(norun)			\
+	x(transactional)		\
+	x(atomic)			\
+	x(check_repair)			\
+	x(gc)				\
+	x(insert)			\
+	x(overwrite)			\
+	x(is_root)			\
+	x(bucket_invalidate)
+
+enum {
+#define x(n) BTREE_ITER_FLAG_BIT_##n,
+	BTREE_ITER_FLAGS()
+	STR_HASH_FLAGS()
+	BTREE_UPDATE_FLAGS()
+	BTREE_TRIGGER_FLAGS()
+#undef x
+};
+
+/* iter flags must fit in a u16: */
+//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
+
+enum btree_iter_update_trigger_flags {
+#define x(n) BTREE_ITER_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
+	BTREE_ITER_FLAGS()
+#undef x
+#define x(n) STR_HASH_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
+	STR_HASH_FLAGS()
+#undef x
+#define x(n) BTREE_UPDATE_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
+	BTREE_UPDATE_FLAGS()
#undef x
+#define x(n) BTREE_TRIGGER_##n	= 1U << BTREE_ITER_FLAG_BIT_##n,
+	BTREE_TRIGGER_FLAGS()
+#undef x
+};
 
 enum btree_path_uptodate {
 	BTREE_ITER_UPTODATE		= 0,
@@ -307,7 +372,7 @@ struct btree_iter {
 	 */
 	struct bkey	k;
 
-	/* BTREE_ITER_WITH_JOURNAL: */
+	/* BTREE_ITER_with_journal: */
 	size_t		journal_idx;
 #ifdef TRACK_PATH_ALLOCATED
 	unsigned long	ip_allocated;
@@ -418,6 +483,8 @@ struct btree_trans {
 	u8		lock_must_abort;
 	bool		lock_may_not_fail:1;
 	bool		srcu_held:1;
+	bool		locked:1;
+	bool		write_locked:1;
 	bool		used_mempool:1;
 	bool		in_traverse_all:1;
 	bool		paths_sorted:1;
@@ -425,13 +492,13 @@
 	bool		journal_transaction_names:1;
 	bool		journal_replay_not_finished:1;
 	bool		notrace_relock_fail:1;
-	bool		write_locked:1;
 	enum bch_errcode	restarted:16;
 	u32		restart_count;
 	u64		last_begin_time;
 	unsigned long	last_begin_ip;
 	unsigned long	last_restarted_ip;
+	unsigned long	last_unlock_ip;
 	unsigned long	srcu_lock_time;
 
 	const char	*fn;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 8e47e260eba5..f3c645a43dcb 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -25,14 +25,14 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
 
 static int __must_check
 bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
-			  struct bkey_i *, enum btree_update_flags,
+			  struct bkey_i *, enum btree_iter_update_trigger_flags,
 			  unsigned long ip);
 
 static noinline int extent_front_merge(struct btree_trans *trans,
 				       struct btree_iter *iter,
 				       struct bkey_s_c k,
 				       struct bkey_i **insert,
-				       enum btree_update_flags flags)
+				       enum btree_iter_update_trigger_flags flags)
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_i *update;
@@ -104,8 +104,8 @@ static int
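
The scheme above replaces four hand-numbered flag sets with a single enum that allocates bit positions automatically, then derives the masks from it; the commented-out BUILD_BUG_ON records the constraint that the iterator flag group still has to fit in the iterator's u16 flags field. A compilable miniature of the same trick, with illustrative names and only two groups standing in for the four:

/* illustrative names - two groups stand in for the four above */
#define ITER_FLAGS()	x(slots) x(intent) x(prefetch)
#define UPDATE_FLAGS()	x(nojournal) x(key_cache_reclaim)

/* one enum hands out bit numbers across every group, in order: */
enum {
#define x(n)	FLAG_BIT_##n,
	ITER_FLAGS()
	UPDATE_FLAGS()
#undef x
	FLAG_BIT_NR
};

/* masks are derived from the bit numbers, so nothing is numbered by hand
 * and inserting a flag never requires renumbering: */
enum flags {
#define x(n)	ITER_##n = 1U << FLAG_BIT_##n,
	ITER_FLAGS()
#undef x
#define x(n)	UPDATE_##n = 1U << FLAG_BIT_##n,
	UPDATE_FLAGS()
#undef x
};

/* the equivalent of the commented-out BUILD_BUG_ON() above: */
_Static_assert(FLAG_BIT_prefetch < 16, "iter flags must fit in a u16");
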
need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { + BTREE_ITER_all_snapshots| + BTREE_ITER_nopreserve, k, ret) { if (!bkey_eq(k.k->p, pos)) break; @@ -138,8 +138,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, darray_init(&s); bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while ((old_k = bch2_btree_iter_prev(&old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { @@ -151,8 +151,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, continue; new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bkey_err(new_k); if (ret) break; @@ -168,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, update->k.type = KEY_TYPE_whiteout; ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } bch2_trans_iter_exit(trans, &new_iter); @@ -185,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, struct bkey_s_c old, struct bkey_s_c new) { @@ -218,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -235,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -260,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, } ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -273,7 +273,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, bch2_cut_front(new.k->p, update); ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| flags, _RET_IP_); if (ret) return ret; @@ -285,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, static int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_s_c k; @@ -293,9 +293,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans, int ret = 0; bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; @@ -346,7 +346,7 @@ err: static 
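
bch2_trans_update_extent_overwrite() above handles the ways an existing extent can overlap the one being inserted: a surviving front piece, a surviving back piece, both (the new extent splits the old one), or neither (fully overwritten, which in the snapshots case becomes a whiteout). The case analysis on plain half-open ranges, as a runnable sketch:

#include <stdio.h>

struct ext { unsigned long long start, end; };	/* half-open [start, end) */

static void overwrite(struct ext old, struct ext new)
{
	if (old.start < new.start)	/* front piece survives */
		printf("front piece [%llu,%llu)\n", old.start, new.start);
	if (old.end > new.end)		/* back piece survives */
		printf("back piece  [%llu,%llu)\n", new.end, old.end);
	if (old.start >= new.start && old.end <= new.end)
		printf("fully overwritten - whiteout needed in the snapshots case\n");
}

int main(void)
{
	/* new extent lands in the middle: both pieces survive */
	overwrite((struct ext){ 0, 100 }, (struct ext){ 40, 60 });
	return 0;
}
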
noinline int flush_new_cached_update(struct btree_trans *trans, struct btree_insert_entry *i, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bkey k; @@ -354,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, btree_path_idx_t path_idx = bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); + BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; @@ -372,7 +372,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, goto out; i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; + i->flags |= BTREE_TRIGGER_norun; btree_path_set_should_be_locked(btree_path); ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); @@ -383,7 +383,7 @@ out: static int __must_check bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_update_flags flags, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; @@ -479,15 +479,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); + BTREE_ITER_intent| + BTREE_ITER_cached, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); if (unlikely(ret)) return ret; @@ -505,17 +505,17 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, } int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + !(flags & BTREE_UPDATE_key_cache_reclaim) && + (iter->flags & BTREE_ITER_filter_snapshots)) { ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -528,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter * Ensure that updates to cached btrees go to the key cache: */ struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + if (!(flags & BTREE_UPDATE_key_cache_reclaim) && !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { @@ -587,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); k = bch2_btree_iter_prev(iter); ret = bkey_err(k); if (ret) @@ -621,15 +621,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int bch2_btree_insert_nonextent(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, - 
enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); @@ -637,16 +637,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, } int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - int ret; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); + BTREE_ITER_intent|flags); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -698,8 +695,8 @@ int bch2_btree_delete(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -717,7 +714,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); @@ -745,7 +742,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -804,7 +801,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, k->k.p = pos; struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, 0); @@ -852,7 +849,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, if (ret) goto err; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + if (!test_bit(JOURNAL_running, &c->journal.flags)) { ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); if (ret) goto err; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index cc7c53e83f89..b4894e4d5447 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -44,16 +44,18 @@ enum bch_trans_commit_flags { #undef x }; +void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); int 
bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_update_flags); + enum btree_iter_update_trigger_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, int flags); @@ -94,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_update_flags, + enum btree_iter_update_trigger_flags, struct bkey_s_c, struct bkey_s_c); int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); @@ -276,7 +278,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type); + btree_id, pos, flags|BTREE_ITER_intent, type); struct bkey_i *ret = IS_ERR(k.k) ? ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); @@ -299,7 +301,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); int ret; if (IS_ERR(mut)) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b4efd8cc4d1a..60b8544cea48 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -38,22 +38,6 @@ static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_INTENT, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - /* * Verify that child nodes correctly span parent node's range: */ @@ -73,6 +57,24 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + need_fsck_err(c, btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf); + goto topology_repair; + } + + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); + need_fsck_err(c, btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf); + goto topology_repair; + } + } + if (!b->c.level) return 0; @@ -158,7 +160,6 @@ topology_repair: static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) { struct 
bkey_packed *k; - struct bset_tree *t; struct bkey uk; for_each_bset(b, t) @@ -646,7 +647,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -655,7 +656,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -735,9 +736,6 @@ err: */ b = READ_ONCE(as->b); if (b) { - btree_path_idx_t path_idx = get_unlocked_mut_path(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -755,12 +753,16 @@ err: * btree_node_lock_nopath() (the use of which is always suspect, * we need to work on removing this in the future) * - * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() * calls bch2_path_upgrade(), before we call path_make_mut(), so * we may rarely end up with a locked path besides the one we * have here: */ bch2_trans_unlock(trans); + bch2_trans_begin(trans); + btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); @@ -1154,13 +1156,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags |= watermark; if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + test_bit(JOURNAL_space_low, &c->journal.flags)) { if (flags & BCH_TRANS_COMMIT_journal_reclaim) return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - bch2_trans_unlock(trans); - wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, + ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); if (ret) return ERR_PTR(ret); } @@ -1206,7 +1207,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->start_time = start_time; as->ip_started = _RET_IP_; as->mode = BTREE_UPDATE_none; - as->watermark = watermark; + as->flags = flags; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level_start = level_start; @@ -1360,7 +1361,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), @@ -1619,12 +1620,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, 
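
The open-coded unlock/wait/relock sequence above becomes drop_locks_do(). Judging from the code it replaces, it expands to roughly the following - an assumption based on this hunk, not the macro's actual definition:

/* assumed shape of drop_locks_do(), reconstructed from the code it
 * replaces here - not the macro's actual definition: */
#define drop_locks_do(_trans, _do)			\
({							\
	bch2_trans_unlock(_trans);			\
	(_do) ?: bch2_trans_relock(_trans);		\
})

So the wait_event() runs with no btree locks held, and the caller sees either the expression's error or, failing that, the relock error.
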
SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); + path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path2, n2); @@ -1669,7 +1670,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); @@ -1947,6 +1948,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); @@ -1979,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, : bpos_successor(b->data->max_key); sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; @@ -2072,7 +2075,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2150,7 +2153,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2333,10 +2336,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (!skip_triggers) { ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_TRANSACTIONAL) ?: + BTREE_TRIGGER_transactional) ?: bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s(new_key), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -2353,7 +2356,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT, + iter2.flags & BTREE_ITER_intent, _THIS_IP_); struct btree_path *path2 = btree_iter_path(trans, &iter2); @@ -2365,7 +2368,7 @@ static int 
__bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; } else { @@ -2473,7 +2476,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -2487,7 +2490,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); @@ -2511,7 +2513,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) +int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2559,17 +2561,18 @@ static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); + bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, + prt_printf(out, "%ps: ", (void *) as->ip_started); + bch2_trans_commit_flags_to_text(out, as->flags); + + prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_id_str(as->btree_id), as->update_level_start, as->update_level_end, - bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c1a479ebaad1..b5b76ce01cfc 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -52,7 +52,7 @@ struct btree_update { struct list_head unwritten_list; enum btree_update_mode mode; - enum bch_watermark watermark; + enum bch_trans_commit_flags flags; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -144,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); + if (bch2_btree_node_merging_disabled) + return 0; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; @@ -172,6 +175,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); + +int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 36a6f42aba5e..75c8a196b3f6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -122,7 +122,7 @@ 
static noinline int wb_flush_one_slowpath(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| @@ -191,13 +191,13 @@ btree_write_buffered_insert(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); + BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -332,7 +332,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_intent|BTREE_ITER_all_snapshots); } bch2_btree_iter_set_pos(&iter, k->k.k.p); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 82f179258867..e28d28ac2a13 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -274,25 +274,14 @@ void bch2_dev_usage_init(struct bch_dev *ca) void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); - prt_tab(out); - prt_u64(out, usage->d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\t%llu\r%llu\r%llu\r\n", + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } } @@ -329,26 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - return (struct bch_alloc_v4) { - .gen = b.gen, - .data_type = b.data_type, - .dirty_sectors = b.dirty_sectors, - .cached_sectors = b.cached_sectors, - .stripe = b.stripe, - }; -} - -void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket *old, struct bucket *new) -{ - struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); - struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); - - bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); -} - static inline int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry_v1 *r, @@ -496,78 +465,276 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 return bch2_update_replicas_list(trans, &r.e, sectors); } -int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - struct bucket old, new, *g; + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry 
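
The bch2_dev_usage_to_text() rewrite above folds prt_tab()/prt_tab_rjust()/prt_newline() calls into the format string, so '\t' advances to the next tabstop and '\r' right-justifies against it - that mapping is inferred from the calls being replaced. Plain printf can approximate the resulting layout with field widths:

#include <stdio.h>

int main(void)
{
	/* printf field widths standing in for printbuf tabstops: */
	printf("%-8s%12s%12s%12s\n", "", "buckets", "sectors", "fragmented");
	printf("%-8s%12llu%12llu%12llu\n", "user", 1024ULL, 524288ULL, 4096ULL);
	printf("%-8s%12llu%12llu%12llu\n", "cached", 64ULL, 8192ULL, 0ULL);
	return 0;
}
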
*entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - BUG_ON(data_type != BCH_DATA_sb && - data_type != BCH_DATA_journal); + percpu_down_read(&c->mark_lock); - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; + rcu_read_lock(); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + continue; + } - percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, b); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c); + + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + do_update = true; + } + } - bucket_lock(g); - old = *g; + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + do_update = true; + } + } - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) { - ret = -EIO; - goto err; - } + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + c, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + 
g->data_type = data_type; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + do_update = true; + } + } - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; - goto err; + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + do_update = true; + } } + rcu_read_unlock(); - g->data_type = data_type; - g->dirty_sectors += sectors; - new = *g; + if (do_update) { + if (flags & BTREE_TRIGGER_is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } + + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + rcu_read_lock(); + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev)); + rcu_read_unlock(); + + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + ptr->gen = g->gen; + } + rcu_read_unlock(); + } else { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } + rcu_read_unlock(); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } + + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, 
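
bch2_check_fix_ptrs() above drops bad pointers out of the very key it is iterating, so after each drop it jumps back to restart_drop_ptrs/again and rescans from the top rather than trusting stale iteration state. The same pattern reduced to an array, runnable:

#include <stdio.h>

/* drop every negative element; restart because dropping invalidates
 * the iteration state, exactly like restart_drop_ptrs above: */
static void drop_bad(int *v, int *nr)
{
restart:
	for (int i = 0; i < *nr; i++)
		if (v[i] < 0) {
			v[i] = v[--*nr];	/* overwrite with the last element */
			goto restart;		/* indices shifted - rescan */
		}
}

int main(void)
{
	int v[] = { 3, -1, 7, -5, 2 }, nr = 5;

	drop_bad(v, &nr);
	for (int i = 0; i < nr; i++)
		printf("%d ", v[i]);	/* prints: 3 2 7 */
	printf("\n");
	return 0;
}
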
bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } + + percpu_up_read(&c->mark_lock); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + percpu_down_read(&c->mark_lock); + + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); +fsck_err: percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } -int bch2_check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 bucket_sectors) +int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 *bucket_sectors) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); struct printbuf buf = PRINTBUF; + bool inserting = sectors > 0; int ret = 0; - if (bucket_data_type == BCH_DATA_cached) - bucket_data_type = BCH_DATA_user; - - if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || - (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) - bucket_data_type = ptr_data_type = BCH_DATA_stripe; + BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -578,8 +745,9 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { @@ -592,11 +760,17 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; + } + + if (b_gen != ptr->gen && ptr->cached) { + ret = 1; + goto out; } - if (b_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" @@ -607,18 +781,12 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; - } - - if (b_gen != ptr->gen) { - ret = 1; + if (inserting) + goto err; goto out; } - if (!data_type_is_empty(bucket_data_type) && - ptr_data_type && - bucket_data_type != ptr_data_type) { + if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" @@ -628,28 +796,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } - if ((u64) bucket_sectors + sectors > U32_MAX) { + if ((u64) *bucket_sectors + sectors > 
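
The reworked bch2_bucket_ref_update() only returns a hard error when inserting a reference; when removing one it clamps instead (sectors = -*bucket_sectors), so a corrupt counter converges toward zero rather than wedging the filesystem. The arithmetic in isolation - note how the unsigned sum catches underflow as well as overflow:

#include <stdint.h>
#include <stdio.h>

static int bucket_sectors_mod(uint32_t *bucket_sectors, int64_t sectors)
{
	/* unsigned sum: a negative delta wraps, so one comparison catches
	 * both overflow and underflow, as in the hunk above: */
	if ((uint64_t) *bucket_sectors + (uint64_t) sectors > UINT32_MAX) {
		if (sectors > 0)
			return -1;			/* inserting: hard error */
		sectors = -(int64_t) *bucket_sectors;	/* removing: clamp to 0 */
	}
	*bucket_sectors += (uint32_t) sectors;
	return 0;
}

int main(void)
{
	uint32_t s = 10;

	bucket_sectors_mod(&s, -20);	/* would underflow: clamps */
	printf("%u\n", s);		/* prints: 0 */
	return 0;
}
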
U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - bucket_sectors, sectors, + *bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + sectors = -*bucket_sectors; } + + *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; err: bch2_dump_trans_updates(trans); + ret = -EIO; goto out; } @@ -786,29 +959,22 @@ need_mark: /* KEY_TYPE_extent: */ -static int __mark_pointer(struct btree_trans *trans, +static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) + struct bch_alloc_v4 *a) { u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, *dst_sectors); + ? &a->dirty_sectors + : &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type, + a->gen, a->data_type, dst_sectors); if (ret) return ret; - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -816,81 +982,69 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - s64 *sectors, unsigned flags) + s64 *sectors, + enum btree_iter_update_trigger_flags flags) { - bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); + bool insert = !(flags & BTREE_TRIGGER_overwrite); + int ret = 0; + + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (unlikely(!ca)) { + if (insert) + ret = -EIO; + goto err; + } + struct bpos bucket; struct bch_backpointer bp; - - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? 
bp.bucket_len : -((s64) bp.bucket_len); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); - int ret = PTR_ERR_OR_ZERO(a); + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v); if (ret) - return ret; - - ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; + goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); if (ret) - return ret; + goto err; } } - if (flags & BTREE_TRIGGER_GC) { - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + struct bucket *g = gc_bucket(ca, bucket.offset); bucket_lock(g); - struct bucket old = *g; - - u8 bucket_data_type = g->data_type; - int ret = __mark_pointer(trans, k, &p.ptr, *sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (ret) { - bucket_unlock(g); - percpu_up_read(&c->mark_lock); - return ret; + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); } - - g->data_type = bucket_data_type; - struct bucket new = *g; bucket_unlock(g); - bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); } - - return 0; +err: + bch2_dev_put(ca); + return ret; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, enum bch_data_type data_type, - s64 sectors, unsigned flags) + s64 sectors, + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); + BTREE_ITER_with_updates, stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, @@ -920,10 +1074,10 @@ err: return ret; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(!(flags & BTREE_TRIGGER_gc)); struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { @@ -959,9 +1113,10 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - bool gc = flags & BTREE_TRIGGER_GC; + bool gc = flags & BTREE_TRIGGER_gc; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -970,7 +1125,7 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? 
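
bch2_trigger_pointer() above is one trigger body dispatched on flags: the transactional path queues alloc-btree updates and backpointers, the gc path updates in-memory gc buckets, and an overwrite is simply a negative sector delta. A reduced sketch with stand-in accounting hooks (the TRIGGER_* values and account_*() functions are local stand-ins, not the patch's symbols):

/* local stand-ins mirroring the patch's trigger flags: */
#define TRIGGER_transactional	(1U << 0)
#define TRIGGER_gc		(1U << 1)
#define TRIGGER_overwrite	(1U << 2)

static int account_trans(long long sectors) { (void) sectors; return 0; }
static int account_gc(long long sectors)    { (void) sectors; return 0; }

static int trigger_pointer_sketch(unsigned flags, long long bucket_len)
{
	/* insert adds sectors, overwrite subtracts the same amount: */
	long long sectors = !(flags & TRIGGER_overwrite)
		? bucket_len : -bucket_len;

	if (flags & TRIGGER_transactional)	/* queued as btree updates */
		return account_trans(sectors);
	if (flags & TRIGGER_gc)			/* in-memory gc buckets */
		return account_gc(sectors);
	return 0;
}
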
BCH_DATA_btree : BCH_DATA_user; - s64 dirty_sectors = 0; + s64 replicas_sectors = 0; int ret = 0; r.e.data_type = data_type; @@ -996,7 +1151,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } } else if (!p.has_ec) { - dirty_sectors += disk_sectors; + replicas_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -1014,8 +1169,8 @@ static int __trigger_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = !gc - ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) - : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + ? bch2_update_replicas_list(trans, &r.e, replicas_sectors) + : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true); if (unlikely(ret && gc)) { struct printbuf buf = PRINTBUF; @@ -1031,15 +1186,18 @@ static int __trigger_extent(struct btree_trans *trans, } int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); + /* if pointers aren't changing - nothing to do: */ if (new_ptrs_bytes == old_ptrs_bytes && !memcmp(new_ptrs.start, @@ -1047,7 +1205,7 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_fs *c = trans->c; int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - (int) bch2_bkey_needs_rebalance(c, old); @@ -1060,8 +1218,8 @@ int bch2_trigger_extent(struct btree_trans *trans, } } - if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); return 0; } @@ -1069,17 +1227,17 @@ int bch2_trigger_extent(struct btree_trans *trans, /* KEY_TYPE_reservation */ static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size * replicas; - if (flags & BTREE_TRIGGER_OVERWRITE) + if (flags & BTREE_TRIGGER_overwrite) sectors = -sectors; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) return ret; @@ -1090,7 +1248,7 @@ static int __trigger_reservation(struct btree_trans *trans, d->persistent_reserved[replicas - 1] += sectors; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); preempt_disable(); @@ -1110,7 +1268,7 @@ static int __trigger_reservation(struct btree_trans *trans, int bch2_trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned 
level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } @@ -1118,22 +1276,16 @@ int bch2_trigger_reservation(struct btree_trans *trans, /* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, + struct bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_i_alloc_v4 *a; int ret = 0; - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); if (IS_ERR(a)) return PTR_ERR(a); @@ -1161,20 +1313,75 @@ err: return ret; } +static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 b, enum bch_data_type data_type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) +{ + int ret = 0; + + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, b); + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g); + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { + ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_type_str(g->data_type ?: data_type), + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + + g->data_type = data_type; + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + percpu_up_read(&c->mark_lock); + return ret; +} + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, - enum bch_data_type type, - unsigned sectors) + struct bch_dev *ca, u64 b, + enum bch_data_type type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) { - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + BUG_ON(type != BCH_DATA_free && + type != BCH_DATA_sb && + type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + if (flags & BTREE_TRIGGER_gc) + return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags); + else if (flags & BTREE_TRIGGER_transactional) + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + else + BUG(); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - u64 *bucket, unsigned *bucket_sectors) + struct bch_dev *ca, u64 start, u64 end, + enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, + enum btree_iter_update_trigger_flags flags) { do { u64 b = sector_to_bucket(ca, start); @@ -1183,7 +1390,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, if (b != *bucket && *bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, 
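
bch2_trans_mark_metadata_sectors(), continued just below, walks a sector range and flushes one (bucket, sectors) pair each time the range crosses a bucket boundary, so superblock layout regions that straddle buckets are accounted per bucket. The same loop standalone, with a made-up bucket size:

#include <stdio.h>

#define BUCKET_SIZE 128ULL	/* sectors per bucket - made-up value */

static void mark_range(unsigned long long start, unsigned long long end)
{
	unsigned long long bucket = ~0ULL;
	unsigned sectors = 0;

	while (start < end) {
		unsigned long long b = start / BUCKET_SIZE;
		unsigned long long next = (b + 1) * BUCKET_SIZE;
		unsigned n = (end < next ? end : next) - start;

		if (b != bucket && sectors) {	/* crossed a boundary: flush */
			printf("bucket %llu: %u sectors\n", bucket, sectors);
			sectors = 0;
		}
		bucket = b;
		sectors += n;
		start += n;
	}
	if (sectors)
		printf("bucket %llu: %u sectors\n", bucket, sectors);
}

int main(void)
{
	mark_range(100, 400);	/* spans buckets 0..3 */
	return 0;
}
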
ca, *bucket, - type, *bucket_sectors); + type, *bucket_sectors, flags); if (ret) return ret; @@ -1198,8 +1405,8 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct bch_dev *ca) +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 bucket = 0; @@ -1212,21 +1419,21 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } if (bucket_sectors) { ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors); + bucket, BCH_DATA_sb, bucket_sectors, flags); if (ret) return ret; } @@ -1234,7 +1441,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size); + BCH_DATA_journal, ca->mi.bucket_size, flags); if (ret) return ret; } @@ -1242,20 +1449,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - + int ret = bch2_trans_run(c, + __bch2_trans_mark_dev_sb(trans, ca, flags)); bch_err_fn(c, ret); return ret; } -int bch2_trans_mark_dev_sbs(struct bch_fs *c) +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, + enum btree_iter_update_trigger_flags flags) { for_each_online_member(c, ca) { - int ret = bch2_trans_mark_dev_sb(c, ca); + int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } } @@ -1263,6 +1472,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return 0; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 @@ -1331,6 +1545,31 @@ recalculate: /* Startup/shutdown: */ +void bch2_buckets_nouse_free(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + kvfree_rcu_mightsleep(ca->buckets_nouse); + ca->buckets_nouse = NULL; + } +} + +int bch2_buckets_nouse_alloc(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + BUG_ON(ca->buckets_nouse); + + ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { + bch2_dev_put(ca); + return -BCH_ERR_ENOMEM_buckets_nouse; + } + } + + return 0; +} + static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -1342,24 +1581,17 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; int ret; + BUG_ON(resize && 
ca->buckets_nouse); + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } - if ((c->opts.buckets_nouse && - !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { - ret = -BCH_ERR_ENOMEM_buckets_nouse; - goto err; - } - bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -1377,17 +1609,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(bucket_gens->b, old_bucket_gens->b, n); - if (buckets_nouse) - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; - swap(ca->buckets_nouse, buckets_nouse); - nbuckets = ca->mi.nbuckets; if (resize) { @@ -1398,7 +1624,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index f9af5adabe83..617ffde2fb7a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -12,7 +12,7 @@ #include "extents.h" #include "sb-members.h" -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) { return div_u64(s, ca->mi.bucket_size); } @@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) +static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) { return div_u64_rem(s, ca->mi.bucket_size, offset); } @@ -94,7 +93,7 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + BUG_ON(!bucket_valid(ca, b)); return buckets->b + b; } @@ -111,7 +110,7 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + BUG_ON(!bucket_valid(ca, b)); return gens->b + b; } @@ -121,20 +120,16 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, const struct bch_extent_ptr *ptr, u32 *bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); } @@ -175,17 +170,19 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? 
r : 0; } +static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +{ + return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); +} + /** - * ptr_stale() - check if a pointer points into a bucket that has been + * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. */ -static inline u8 ptr_stale(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 ret; - rcu_read_lock(); - ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + u8 ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; @@ -306,8 +303,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, const struct bch_alloc_v4 *, const struct bch_alloc_v4 *, u64, bool); -void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, - struct bucket *, struct bucket *); /* key/bucket marking: */ @@ -333,27 +328,29 @@ int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); -int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, - const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32); +int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, + struct bkey_s_c, const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32 *); -int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_check_fix_ptrs(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, + enum btree_iter_update_trigger_flags); int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ ret; \ }) @@ -362,9 +359,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *); void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, + enum bch_data_type, unsigned, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, + enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) @@ -464,6 +465,9 @@ static inline u64 
avail_factor(u64 r) return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } +void bch2_buckets_nouse_free(struct bch_fs *); +int bch2_buckets_nouse_alloc(struct bch_fs *); + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); void bch2_dev_buckets_free(struct bch_dev *); int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4d14f19f5185..9e54323f0f5f 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -32,12 +32,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (dev >= c->sb.nr_devices) return ERR_PTR(-EINVAL); - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + ca = bch2_dev_tryget_noerror(c, dev); if (!ca) return ERR_PTR(-EINVAL); } else { @@ -391,7 +386,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) return PTR_ERR(ca); ret = bch2_dev_offline(c, ca, arg.flags); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -420,7 +415,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, if (ret) bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -615,7 +610,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.d[i].fragmented = src.d[i].fragmented; } - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } @@ -667,7 +662,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, goto err; } err: - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -689,11 +684,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, if (arg.flags & BCH_READ_DEV) { ca = bch2_device_lookup(c, arg.dev, arg.flags); - - if (IS_ERR(ca)) { - ret = PTR_ERR(ca); - goto err; - } + ret = PTR_ERR_OR_ZERO(ca); + if (ret) + goto err_unlock; sb = ca->disk_sb.sb; } else { @@ -708,8 +701,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, vstruct_bytes(sb)); err: - if (!IS_ERR_OR_NULL(ca)) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); +err_unlock: mutex_unlock(&c->sb_lock); return ret; } @@ -753,7 +746,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, ret = bch2_dev_resize(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -779,7 +772,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -961,7 +954,9 @@ static const struct file_operations bch_chardev_fops = { }; static int bch_chardev_major; -static struct class *bch_chardev_class; +static const struct class bch_chardev_class = { + .name = "bcachefs", +}; static struct device *bch_chardev; void bch2_fs_chardev_exit(struct bch_fs *c) @@ -978,7 +973,7 @@ int bch2_fs_chardev_init(struct bch_fs *c) if (c->minor < 0) return c->minor; - c->chardev = device_create(bch_chardev_class, NULL, + c->chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, c->minor), c, "bcachefs%u-ctl", c->minor); if (IS_ERR(c->chardev)) @@ -989,32 +984,39 @@ int bch2_fs_chardev_init(struct bch_fs *c) void bch2_chardev_exit(void) { - if (!IS_ERR_OR_NULL(bch_chardev_class)) - device_destroy(bch_chardev_class, - MKDEV(bch_chardev_major, U8_MAX)); - if (!IS_ERR_OR_NULL(bch_chardev_class)) - class_destroy(bch_chardev_class); + 
device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); + class_unregister(&bch_chardev_class); if (bch_chardev_major > 0) unregister_chrdev(bch_chardev_major, "bcachefs"); } int __init bch2_chardev_init(void) { + int ret; + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); if (bch_chardev_major < 0) return bch_chardev_major; - bch_chardev_class = class_create("bcachefs"); - if (IS_ERR(bch_chardev_class)) - return PTR_ERR(bch_chardev_class); + ret = class_register(&bch_chardev_class); + if (ret) + goto major_out; - bch_chardev = device_create(bch_chardev_class, NULL, + bch_chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, U8_MAX), NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) - return PTR_ERR(bch_chardev); + if (IS_ERR(bch_chardev)) { + ret = PTR_ERR(bch_chardev); + goto class_out; + } return 0; + +class_out: + class_unregister(&bch_chardev_class); +major_out: + unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); + return ret; } #endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 088fd2e7bdf1..85198f391e9c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -469,9 +469,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); @@ -494,14 +493,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); + prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); + prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); + prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); } const struct bch_sb_field_ops bch_sb_field_ops_crypt = { diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0022b51ce3c0..0d807c2ce9c6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -106,7 +106,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (1) { struct bkey_s_c k; @@ -203,6 +203,8 @@ restart_drop_conflicting_replicas: /* Now, drop excess replicas: */ restart_drop_extra_replicas: + + rcu_read_lock(); bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); @@ -214,6 +216,7 @@ restart_drop_extra_replicas: goto restart_drop_extra_replicas; } } + rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -288,7 +291,7 @@ restart_drop_extra_replicas: k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, - 
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -357,10 +360,11 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (c->opts.nocow_enabled) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + PTR_BUCKET_POS(ca, ptr), 0); + bch2_dev_put(ca); } bch2_bkey_buf_exit(&update->k, c); @@ -386,8 +390,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, while (bio_sectors(bio)) { unsigned sectors = bio_sectors(bio); + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_SLOTS); + BTREE_ITER_slots); ret = lockrestart_do(trans, ({ k = bch2_btree_iter_peek_slot(&iter); bkey_err(k); @@ -465,7 +471,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); data_opts.kill_ptrs ^= 1U << drop; @@ -480,15 +485,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) n->k.size = 0; return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -539,15 +544,26 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) - percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return -BCH_ERR_data_update_done; + } + } unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); bool locked; + rcu_read_lock(); if (((1U << i) & m->data_opts.rewrite_ptrs)) { BUG_ON(p.ptr.cached); @@ -561,6 +577,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -579,15 +596,13 @@ int bch2_data_update_init(struct btree_trans *trans, if (ctxt) { move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) || + bucket, 0)) || list_empty(&ctxt->ios)); if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); } else { - if 
(!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { ret = -BCH_ERR_nocow_lock_blocked; goto err; } @@ -649,10 +664,11 @@ int bch2_data_update_init(struct btree_trans *trans, err: i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + bch2_dev_put(ca); i++; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cd99b7399414..51cbf3928361 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -37,11 +37,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_ondisk = c->verify_ondisk; struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; bool failed = false, saw_error = false; - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, @@ -194,8 +194,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!bch2_dev_get_ioref(ca, READ)) { + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; } @@ -375,8 +375,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); bch2_trans_unlock(trans); @@ -459,8 +459,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ struct btree_path_level *l = &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = @@ -492,51 +492,26 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u ", - b, - bch2_btree_id_str(b->c.btree_id), - b->c.level); - prt_newline(out); + prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); - prt_printf(out, "flags: "); - prt_tab(out); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_btree_node_flags, b->flags); prt_newline(out); - prt_printf(out, "pcpu read locks: "); - prt_tab(out); - prt_printf(out, "%u", b->c.lock.readers != NULL); - prt_newline(out); - - prt_printf(out, "written:"); - prt_tab(out); - prt_printf(out, "%u", b->written); - prt_newline(out); - - prt_printf(out, "writes blocked:"); - prt_tab(out); - prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); - prt_newline(out); - - prt_printf(out, "will make reachable:"); - prt_tab(out); - prt_printf(out, "%lx", 
b->will_make_reachable); - prt_newline(out); - - prt_printf(out, "journal pin %px:", &b->writes[0].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[0].journal.seq); - prt_newline(out); + prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); + prt_printf(out, "written:\t%u\n", b->written); + prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); + prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - prt_printf(out, "journal pin %px:", &b->writes[1].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[1].journal.seq); - prt_newline(out); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[0].journal, b->writes[0].journal.seq); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[1].journal, b->writes[1].journal.seq); printbuf_indent_sub(out, 2); } @@ -625,8 +600,7 @@ restart: bch2_btree_trans_to_text(&i->buf, trans); - prt_printf(&i->buf, "backtrace:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); @@ -782,25 +756,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, !bch2_btree_transaction_fns[i->iter]) break; - prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); - prt_newline(&i->buf); + prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); mutex_lock(&s->lock); - prt_printf(&i->buf, "Max mem used: %u", s->max_mem); - prt_newline(&i->buf); - - prt_printf(&i->buf, "Transaction duration:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->duration); printbuf_indent_sub(&i->buf, 2); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Lock hold times:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); @@ -808,8 +777,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, } if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); - prt_newline(&i->buf); + prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); printbuf_indent_add(&i->buf, 2); prt_str_indented(&i->buf, s->max_paths_text); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d37bd07afbfe..6bbf9a7d9e4d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -98,7 +98,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); @@ -118,7 +118,7 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); @@ -205,7 +205,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; @@ -220,9 +220,8 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + dir_inum, snapshot, &dirent->k_i, + flags|BTREE_UPDATE_internal_snapshot_node); *dir_offset = dirent->k.p.offset; return ret; @@ -232,7 +231,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; @@ -243,7 +242,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, str_hash_flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -272,7 +271,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); target->inum = le64_to_cpu(s.inode); } @@ -301,13 +300,9 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ - ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_src = bch2_btree_iter_peek_slot(&src_iter); + old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) goto out; @@ -329,13 +324,9 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; } else { - ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_dst = bch2_btree_iter_peek_slot(&dst_iter); + old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) goto out; @@ -450,7 +441,7 @@ out_set_src: if (delete_src) { bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ret = bch2_btree_iter_traverse(&src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -458,7 +449,7 @@ out_set_src: if (delete_dst) { bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); ret = bch2_btree_iter_traverse(&dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &dst_iter, 
BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -479,13 +470,9 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { - int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + int ret = bkey_err(k); if (ret) goto err; @@ -541,16 +528,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } +static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) +{ + struct qstr name = bch2_dirent_get_name(d); + bool ret = dir_emit(ctx, name.name, + name.len, + target.inum, + vfs_d_type(d.v->d_type)); + if (ret) + ctx->pos = d.k->p.offset + 1; + return ret; +} + int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; struct bkey_buf sk; - struct qstr name; int ret; bch2_bkey_buf_init(&sk); @@ -567,7 +564,9 @@ retry: if (k.k->type != KEY_TYPE_dirent) continue; - dirent = bkey_s_c_to_dirent(k); + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); ret = bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) @@ -575,28 +574,22 @@ retry: if (ret) continue; - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(trans); - - name = bch2_dirent_get_name(dirent); - - ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(dirent.v->d_type))) - break; - ctx->pos = dirent.k->p.offset + 1; - /* * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it + * + * XXX: btree_trans_too_many_iters() is something we'd like to + * get rid of, and there's no good reason to be using it here + * except that we don't yet have a for_each_btree_key() helper + * that does subvolume_get_snapshot(). */ - ret = btree_trans_too_many_iters(trans); - if (ret) + ret = drop_locks_do(trans, + bch2_dir_emit(ctx, dirent, target)) ?: + btree_trans_too_many_iters(trans); + if (ret) { + ret = ret < 0 ? 
ret : 0; break; + } } bch2_trans_iter_exit(trans, &iter); err: diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bee55cca2aa0..24037e6e0a09 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,11 +4,11 @@ #include "str_hash.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ @@ -38,11 +38,11 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum, int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 06a7df529b40..521a86df5e52 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); @@ -177,7 +176,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) continue; g = BCH_MEMBER_GROUP(&m); @@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { *res = dev_to_target(ca->dev_idx); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return 0; } @@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi case TARGET_DEV: { struct bch_member m = bch2_sb_member_get(sb, t.dev); - if (bch2_dev_exists(sb, t.dev)) { + if (bch2_member_exists(sb, t.dev)) { prt_printf(out, "Device "); pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 556a217108d3..b26dc7424662 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,7 +107,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -163,146 +163,189 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, /* Triggers: */ -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int __mark_stripe_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + struct bpos bucket, + struct bch_alloc_v4 *a, + enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - const struct bch_extent_ptr 
*ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0; + struct printbuf buf = PRINTBUF; int ret = 0; + struct bch_fs *c = trans->c; if (deleting) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors); - if (ret) - goto err; - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { + if (bch2_trans_inconsistent_on(a->stripe || + a->stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->stripe, s.k->p.offset, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - s.k->p.offset)) { + if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; } else { - if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { + if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || + a->stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", + bucket.inode, bucket.offset, a->gen, + a->stripe, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, + "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + bch2_data_type_str(data_type), + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(parity && + 
(a->dirty_sectors != -sectors || + a->cached_sectors), trans, + "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", + bucket.inode, bucket.offset, a->gen, + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } } - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? data_type : 0; + if (sectors) { + ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, + a->gen, a->data_type, &a->dirty_sectors); + if (ret) + goto err; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; + if (!deleting) { + a->stripe = s.k->p.offset; + a->stripe_redundancy = s.v->nr_redundant; + } else { + a->stripe = 0; + a->stripe_redundancy = 0; + } + + alloc_data_type_set(a, data_type); err: - bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; - struct printbuf buf = PRINTBUF; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); + struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); + if (unlikely(!ca)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -EIO; + goto err; + } - /* * XXX doesn't handle deletion */ + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + } - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; - goto err; + if (flags & BTREE_TRIGGER_gc) { + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, bucket.offset); + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + } + bucket_unlock(g); + percpu_up_read(&c->mark_lock); } +err: + bch2_dev_put(ca); + return ret; +} - bucket_lock(g); - old = *g; +static int mark_stripe_buckets(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? 
bkey_s_c_to_stripe(new).v : NULL; - ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors); - if (ret) - goto err; + BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); - g->data_type = data_type; - g->dirty_sectors += sectors; + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; -err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false, flags); + if (ret) + return ret; + } + + if (old_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true, flags); + if (ret) + return ret; + } + } + + return 0; } int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_s_c new = _new.s_c; struct bch_fs *c = trans->c; @@ -312,7 +355,10 @@ int bch2_trigger_stripe(struct btree_trans *trans, const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(new).v : NULL; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); + + if (flags & BTREE_TRIGGER_transactional) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -347,31 +393,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - unsigned nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false); - if (ret) - return ret; - } - - if (old_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); - if (ret) - return ret; - } - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; } - if (flags & BTREE_TRIGGER_ATOMIC) { + if (flags & BTREE_TRIGGER_atomic) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { @@ -410,7 +437,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); @@ -439,13 +466,11 @@ int bch2_trigger_stripe(struct btree_trans *trans, */ memset(m->block_sectors, 0, sizeof(m->block_sectors)); - for (unsigned i = 0; i < new_s->nr_blocks; i++) { - int ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; - int ret = bch2_update_replicas(c, new, &m->r.e, + ret = bch2_update_replicas(c, new, &m->r.e, ((s64) m->sectors * m->nr_redundant), 0, true); if (ret) { @@ -608,19 +633,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct printbuf err = PRINTBUF; - struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); + if (ca) { + struct printbuf err = PRINTBUF; - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); - clear_bit(i, buf->valid); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + clear_bit(i, buf->valid); break; } @@ -687,7 +714,7 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (ptr_stale(ca, ptr)) { + if (dev_ptr_stale(ca, ptr)) { bch_err_ratelimited(ca->fs, "error %s stripe: stale pointer after io", bio_data_dir(bio) == READ ? "reading from" : "writing to"); @@ -705,25 +732,26 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? 
"reading from" : "writing to"); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + if (!ca) { clear_bit(idx, buf->valid); return; } - if (!bch2_dev_get_ioref(ca, rw)) { + if (dev_ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? "reading from" : "writing to"); clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { @@ -769,7 +797,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); + POS(0, idx), BTREE_ITER_slots); ret = bkey_err(k); if (ret) goto err; @@ -1060,7 +1088,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1131,7 +1159,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); + new->k.p, BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1173,6 +1201,7 @@ err: } static int ec_stripe_update_extent(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bpos *bp_pos) @@ -1183,13 +1212,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_ptr *ec_ptr = NULL; struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_pos, &bp, BTREE_ITER_CACHED); + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, + bp_pos, &bp, BTREE_ITER_cached); if (ret) return ret; if (bpos_eq(*bp_pos, SPOS_MAX)) @@ -1214,7 +1243,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); ret = bkey_err(k); if (ret) return ret; @@ -1272,17 +1301,21 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b { struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr bucket = v->ptrs[block]; - struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bch_extent_ptr ptr = v->ptrs[block]; struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); + if (!ca) + return -EIO; + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + while (1) { ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_pos)); + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); if (ret) break; if (bkey_eq(bp_pos, POS_MAX)) @@ -1291,6 +1324,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b bp_pos = bpos_nosnap_successor(bp_pos); } + bch2_dev_put(ca); return ret; } @@ -1321,20 +1355,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - int ret; - - if (!bch2_dev_get_ioref(ca, WRITE)) { + struct bch_dev *ca = 
bch2_dev_get_ioref(c, ob->dev, WRITE); + if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; } + unsigned offset = ca->mi.bucket_size - ob->sectors_free; memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 9); - ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, ob->bucket * ca->mi.bucket_size + offset, ob->sectors_free, GFP_KERNEL, 0); @@ -1519,16 +1551,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct bch_dev *ca; - unsigned offset; - if (!ob) return NULL; BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - ca = bch_dev_bkey_exists(c, ob->dev); - offset = ca->mi.bucket_size - ob->sectors_free; + struct bch_dev *ca = ob_dev(c, ob); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } @@ -1937,7 +1966,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; @@ -2127,7 +2156,7 @@ int bch2_stripes_read(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ if (k.k->type != KEY_TYPE_stripe) continue; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f042616888b0..84a23eeb6249 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,14 +6,15 @@ #include "buckets_types.h" #include "extents_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 82a6656c941c..c66eeffcd7f2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -176,6 +176,21 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return s; } +/* s/fix?/fixing/ s/recreate?/recreating/ */ +static void prt_actioning(struct printbuf *out, const char *action) +{ + unsigned len = strlen(action); + + BUG_ON(action[len - 1] != '?'); + --len; + + if (action[len - 1] == 'e') + --len; + + prt_bytes(out, action, len); + prt_str(out, "ing"); +} + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -186,6 +201,7 @@ int bch2_fsck_err(struct bch_fs *c, bool print = true, suppressing = false, inconsistent = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + const char *action_orig = "fix?", *action = action_orig; if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) @@ -197,6 +213,19 @@ int bch2_fsck_err(struct bch_fs *c, prt_vprintf(out, fmt, args); va_end(args); + /* Custom fix/continue/recreate/etc.? 
*/ + if (out->buf[out->pos - 1] == '?') { + const char *p = strrchr(out->buf, ','); + if (p) { + out->pos = p - out->buf; + action = kstrdup(p + 2, GFP_KERNEL); + if (!action) { + ret = -ENOMEM; + goto err; + } + } + } + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { @@ -208,12 +237,16 @@ int bch2_fsck_err(struct bch_fs *c, if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - return ret; + goto err; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (!s->last_msg) { + mutex_unlock(&c->fsck_error_msgs_lock); + ret = -ENOMEM; + goto err; + } if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && @@ -239,7 +272,8 @@ int bch2_fsck_err(struct bch_fs *c, inconsistent = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { prt_str(out, ", continuing"); @@ -254,16 +288,16 @@ int bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - int ask; + prt_str(out, ", "); + prt_str(out, action); - prt_str(out, ": fix?"); if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(c); + int ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -276,10 +310,12 @@ int bch2_fsck_err(struct bch_fs *c, } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { - prt_str(out, ", not fixing"); + prt_str(out, ", not "); + prt_actioning(out, action); } } else if (flags & FSCK_NEED_FSCK) { prt_str(out, " (run fsck to correct)"); @@ -311,8 +347,6 @@ int bch2_fsck_err(struct bch_fs *c, mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - if (inconsistent) bch2_inconsistent_error(c); @@ -322,7 +356,10 @@ int bch2_fsck_err(struct bch_fs *c, set_bit(BCH_FS_errors_not_fixed, &c->flags); set_bit(BCH_FS_error, &c->flags); } - +err: + if (action != action_orig) + kfree(action); + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b9033bb4f11c..5f4fecb358da 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret2) { + BTREE_ITER_slots, r_k, ret2) { if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1a331e539204..469037929685 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -71,6 +71,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } +static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu(c, dev); + return ca ? 
atomic64_read(&ca->cur_latency[READ]) : S64_MAX; +} + /* * returns true if p1 is better than p2: */ @@ -79,11 +85,8 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + u64 l1 = dev_latency(c, p1.ptr.dev); + u64 l2 = dev_latency(c, p2.ptr.dev); /* Pick at random, biased in favor of the faster device: */ @@ -109,21 +112,21 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_dev_io_failures *f; - struct bch_dev *ca; int ret = 0; if (k.k->type == KEY_TYPE_error) return -EIO; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ - if (p.ptr.unwritten) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); + if (p.ptr.unwritten) { + ret = 0; + break; + } /* * If there are any dirty pointers it's an error if we can't @@ -132,7 +135,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!ret && !p.ptr.cached) ret = -EIO; - if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + + if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr))) continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; @@ -141,12 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? f->idx : f->idx + 1; - if (!p.idx && - !bch2_dev_is_readable(ca)) + if (!p.idx && !ca) p.idx++; - if (bch2_force_reconstruct_read && - !p.idx && p.has_ec) + if (!p.idx && p.has_ec && bch2_force_reconstruct_read) + p.idx++; + + if (!p.idx && !bch2_dev_is_readable(ca)) p.idx++; if (p.idx >= (unsigned) p.has_ec + 1) @@ -158,6 +164,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, *pick = p; ret = 1; } + rcu_read_unlock(); return ret; } @@ -165,7 +172,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -186,7 +193,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -201,6 +208,11 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, c, err, btree_ptr_v2_min_key_bad, "min_key > key"); + if (flags & BCH_VALIDATE_write) + bkey_fsck_err_on(!bp.v->sectors_written, + c, err, btree_ptr_v2_written_0, + "sectors_written == 0"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; @@ -247,7 +259,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; - struct bch_dev *ca; en_l = l_ptrs.start; en_r = r_ptrs.start; @@ -278,8 +289,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp.ptr.dev); - if (PTR_BUCKET_NR(ca, &lp.ptr) != 
PTR_BUCKET_NR(ca, &rp.ptr)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); + bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); + rcu_read_unlock(); + + if (!same_bucket) return false; if (lp.has_ec != rp.has_ec || @@ -385,7 +400,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -667,16 +682,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - return __extent_ptr_durability(ca, p); + return ca ? __extent_ptr_durability(ca, p) : 0; } unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_failed) + if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) return 0; return __extent_ptr_durability(ca, p); @@ -689,8 +704,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -702,9 +719,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -833,8 +852,6 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr; - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } @@ -860,14 +877,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_dev *ca; + bool ret = false; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && + (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + !dev_ptr_stale_rcu(ca, ptr))) { + ret = true; + break; + } + rcu_read_unlock(); - return false; + return ret; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -969,21 +993,23 @@ void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bch_extent_ptr *ptr; + struct bch_dev *ca; + rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + (ca = bch2_dev_rcu(c, ptr->dev)) && + dev_ptr_stale_rcu(ca, ptr)); + rcu_read_unlock(); return bkey_deleted(k.k); } void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? 
bch_dev_bkey_exists(c, ptr->dev) - : NULL; - + out->atomic++; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, @@ -998,11 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (b >= ca->mi.first_bucket && - b < ca->mi.nbuckets && - ptr_stale(ca, ptr)) + if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr)) prt_printf(out, " stale"); } + rcu_read_unlock(); + --out->atomic; } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -1069,55 +1095,50 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata, struct printbuf *err) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - u64 bucket; - u32 bucket_offset; - struct bch_dev *ca; int ret = 0; - if (!bch2_dev_exists2(c, ptr->dev)) { - /* - * If we're in the write path this key might have already been - * overwritten, and we could be seeing a device that doesn't - * exist anymore due to racing with device removal: - */ - if (flags & BKEY_INVALID_WRITE) - return 0; - - bkey_fsck_err(c, err, ptr_to_invalid_device, - "pointer to invalid device (%u)", ptr->dev); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) { + rcu_read_unlock(); + return 0; } + u32 bucket_offset; + u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + unsigned first_bucket = ca->mi.first_bucket; + u64 nbuckets = ca->mi.nbuckets; + unsigned bucket_size = ca->mi.bucket_size; + rcu_read_unlock(); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + bkey_fsck_err_on(bucket >= nbuckets, c, err, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + "pointer past last bucket (%llu > %llu)", bucket, nbuckets); + bkey_fsck_err_on(bucket < first_bucket, c, err, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + "pointer before first bucket (%llu < %u)", bucket, first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, ca->mi.bucket_size); + bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1193,7 +1214,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + (flags & 
(BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 528e817eacbd..1ade959652b2 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,7 @@ struct bch_fs; struct btree_trans; -enum bkey_invalid_flags; +enum bch_validate_flags; /* extent entries: */ @@ -406,12 +406,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -448,7 +448,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -654,7 +654,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, do { \ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ \ - _ptr = &_ptrs.start->ptr; \ + struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ \ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ if (_cond) { \ @@ -680,7 +680,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 0f955c3c76a7..2eaffe37b5e7 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -171,7 +171,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, swap_r_func_t swap_func, const void *priv) { - int i, c, r; + int i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) @@ -188,17 +188,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, /* heapify */ for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = i; k = j * 2 + 1, k + 1 < n;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < n && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == n) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. 
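The loop below swaps each ancestor on the path with slot k, rotating every
element on the path up one level and dropping the former root into slot k.
This is the bottom-up (Wegener) heapsort refinement: the descent above spends
one comparison per level picking the larger child, instead of the two per
level of a classic sift-down, and because the sifted element typically
belongs near the leaves the backtracking loop is short - fewer cmp_func calls
overall, which matters when the comparison is an indirect function call.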
*/ + for (k = j; j != i;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } @@ -206,17 +211,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, for (i = n - 1; i > 0; --i) { eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = 0; k = j * 2 + 1, k + 1 < i;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < i && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == i) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. */ + for (k = j; j;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } } @@ -232,3 +242,64 @@ void eytzinger0_sort(void *base, size_t n, size_t size, return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } + +#if 0 +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/ktime.h> + +static u64 cmp_count; + +static int mycmp(const void *a, const void *b) +{ + u32 _a = *(u32 *)a; + u32 _b = *(u32 *)b; + + cmp_count++; + if (_a < _b) + return -1; + else if (_a > _b) + return 1; + else + return 0; +} + +static int test(void) +{ + size_t N, i; + ktime_t start, end; + s64 delta; + u32 *arr; + + for (N = 10000; N <= 100000; N += 10000) { + arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); + cmp_count = 0; + + for (i = 0; i < N; i++) + arr[i] = get_random_u32(); + + start = ktime_get(); + eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); + end = ktime_get(); + + delta = ktime_us_delta(end, start); + printk(KERN_INFO "time: %lld\n", delta); + printk(KERN_INFO "comparisons: %lld\n", cmp_count); + + u32 prev = 0; + + eytzinger0_for_each(i, N) { + if (prev > arr[i]) + goto err; + prev = arr[i]; + } + + kfree(arr); + } + return 0; + +err: + kfree(arr); + return -1; +} +#endif diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 624e6f963240..508d029ac53d 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -42,7 +42,7 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -70,7 +70,7 @@ int bch2_create_trans(struct btree_trans *trans, struct bch_subvolume s; ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_CACHED, &s); + BTREE_ITER_cached, &s); if (ret) goto err; @@ -78,7 +78,7 @@ int bch2_create_trans(struct btree_trans *trans, } ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -163,7 +163,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -171,7 +171,7 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } - inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = 
bch2_btree_iter_traverse(&inode_iter) ?: @@ -198,16 +198,16 @@ int bch2_link_trans(struct btree_trans *trans, if (dir.subvol != inum.subvol) return -EXDEV; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) - goto err; + return ret; inode_u->bi_ctime = now; ret = bch2_inode_nlink_inc(inode_u); if (ret) - return ret; + goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -223,7 +223,7 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum.inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -255,19 +255,19 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + name, &inum, BTREE_ITER_intent); if (ret) goto err; ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -322,7 +322,7 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -363,7 +363,7 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p struct bkey_i_subvolume *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -394,7 +394,7 @@ int bch2_rename_trans(struct btree_trans *trans, int ret; ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -403,7 +403,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_dir.inum != src_dir.inum || dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -423,13 +423,13 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; if (dst_inum.inum) { ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 39292e7ef342..b0a33fabadf8 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } @@ -176,7 +169,7 @@ 
retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -408,7 +401,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - folio_set_error(fi.folio); mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index b889370a5088..09d21aef879a 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -254,7 +254,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { + BTREE_ITER_slots, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index d359aa9b33b8..872283e5bd1e 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -214,7 +214,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 20b40477425f..ef20b64033e0 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -202,7 +202,10 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) goto out; ret = bch2_flush_inode(c, inode); out: - return bch2_err_class(ret); + ret = bch2_err_class(ret); + if (ret == -EROFS) + ret = -EIO; + return ret; } /* truncate: */ @@ -594,7 +597,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; @@ -1009,7 +1012,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE, 0, false); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3dc8630ff9fe..205a323ffc6d 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -548,7 +548,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case FS_IOC_GETFLAGS: + case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 65b04b3c2679..fd851f10d11c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -90,7 +90,7 @@ retry: bch2_trans_begin(trans); ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT) ?: + BTREE_ITER_intent) ?: (set ? 
set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -213,19 +213,43 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino _ret; \ }) +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + BUG(); +} + +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +{ + struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + mutex_init(&inode->ei_quota_lock); + inode->v.i_state = 0; + + if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + kmem_cache_free(bch2_inode_cache, inode); + return NULL; + } + + return inode; +} + /* * Allocate a new inode, dropping/retaking btree locks if necessary: */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct bch_inode_info *inode = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, - to_bch_ei(new_inode(c->vfs_sb))); + __bch2_new_inode(trans->c)); if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); @@ -290,7 +314,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = to_bch_ei(new_inode(c->vfs_sb)); + inode = __bch2_new_inode(c); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -323,7 +347,7 @@ retry: inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_WITH_UPDATES, &subvol) ?: + BTREE_ITER_with_updates, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -376,17 +400,14 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter dirent_iter = {}; subvol_inum inum = {}; + struct printbuf buf = PRINTBUF; - int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, name, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + int ret = bkey_err(k); if (ret) return ERR_PTR(ret); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); if (ret > 0) ret = -ENOENT; @@ -406,20 +427,31 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (bch2_err_matches(ret, ENOENT)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "%s points to missing inode", buf.buf); - printbuf_exit(&buf); - } + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + c, "dirent to missing inode:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; + /* regular files may have hardlinks: */ + if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, 
inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, &inode_u), + buf.buf))) { + ret = -ENOENT; + goto err; + } + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); inode = bch2_inode_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); return inode; err: inode = ERR_PTR(ret); @@ -787,7 +819,7 @@ retry: acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -1043,6 +1075,10 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); + + ret = bch2_trans_relock(trans); + if (ret) + break; } start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); @@ -1490,34 +1526,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, mapping_set_large_folios(inode->v.i_mapping); } -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - struct bch_inode_info *inode; - - inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - mutex_init(&inode->ei_quota_lock); - - return &inode->v; -} - -static void bch2_i_callback(struct rcu_head *head) -{ - struct inode *vinode = container_of(head, struct inode, i_rcu); - struct bch_inode_info *inode = to_bch_ei(vinode); - - kmem_cache_free(bch2_inode_cache, inode); -} - -static void bch2_destroy_inode(struct inode *vinode) +static void bch2_free_inode(struct inode *vinode) { - call_rcu(&vinode->i_rcu, bch2_i_callback); + kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int inode_update_times_fn(struct btree_trans *trans, @@ -1825,7 +1836,7 @@ static int bch2_unfreeze(struct super_block *sb) static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, - .destroy_inode = bch2_destroy_inode, + .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8e2010212cc3..c8f57465131c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -79,7 +79,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -127,13 +127,13 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); + struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); + int ret = bkey_err(k); if (ret) return ret; - d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); @@ -154,12 +154,12 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) dir_hash_info = bch2_hash_info_init(c, &dir_inode); - 
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); @@ -274,9 +274,9 @@ create_lostfound: &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, - BCH_HASH_SET_MUST_CREATE) ?: + STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch_err_msg(c, ret, "creating lost+found"); bch2_trans_iter_exit(trans, &lostfound_iter); @@ -333,7 +333,7 @@ static int reattach_inode(struct btree_trans *trans, &name, inode->bi_subvol ?: inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) return ret; @@ -486,14 +486,9 @@ static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 in return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); } -struct snapshots_seen_entry { - u32 id; - u32 equiv; -}; - struct snapshots_seen { struct bpos pos; - DARRAY(struct snapshots_seen_entry) ids; + snapshot_id_list ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) @@ -508,20 +503,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - struct snapshots_seen_entry *i, n = { - .id = id, - .equiv = bch2_snapshot_equiv(c, id), - }; - int ret = 0; - + u32 *i; __darray_for_each(s->ids, i) { - if (i->id == id) + if (*i == id) return 0; - if (i->id > id) + if (*i > id) break; } - ret = darray_insert_item(&s->ids, i - s->ids.data, n); + int ret = darray_insert_item(&s->ids, i - s->ids.data, id); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); @@ -531,42 +521,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry n = { - .id = pos.snapshot, - .equiv = bch2_snapshot_equiv(c, pos.snapshot), - }; - int ret = 0; - if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - s->pos = pos; - s->pos.snapshot = n.equiv; - darray_for_each(s->ids, i) { - if (i->id == n.id) - return 0; - - /* - * We currently don't rigorously track for snapshot cleanup - * needing to be run, so it shouldn't be a fsck error yet: - */ - if (i->equiv == n.equiv) { - bch_err(c, "snapshot deletion did not finish:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_id_str(btree_id), - pos.inode, pos.offset, - i->id, n.id, n.equiv); - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } - - ret = darray_push(&s->ids, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; + return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); } /** @@ -586,12 +545,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ssize_t i; EBUG_ON(id > ancestor); - EBUG_ON(!bch2_snapshot_is_equiv(c, id)); - EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot 
most recently added to @seen */ EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); + EBUG_ON(ancestor != darray_last(seen->ids)); if (id == ancestor) return true; @@ -610,9 +567,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see */ for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i].equiv >= id; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) return false; return true; @@ -643,9 +600,6 @@ static int ref_visible2(struct bch_fs *c, u32 src, struct snapshots_seen *src_seen, u32 dst, struct snapshots_seen *dst_seen) { - src = bch2_snapshot_equiv(c, src); - dst = bch2_snapshot_equiv(c, dst); - if (dst > src) { swap(dst, src); swap(dst_seen, src_seen); @@ -692,7 +646,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + .snapshot = inode.k->p.snapshot, })); } @@ -708,7 +662,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; @@ -728,21 +682,20 @@ static struct inode_walker_entry * lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { bool is_whiteout = k.k->type == KEY_TYPE_whiteout; - u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) goto found; return NULL; found: - BUG_ON(snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->snapshot); - if (snapshot != i->snapshot && !is_whiteout) { + if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = snapshot; + new.snapshot = k.k->p.snapshot; new.count = 0; struct printbuf buf = PRINTBUF; @@ -751,10 +704,10 @@ found: bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, snapshot, i->snapshot, buf.buf); + w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); printbuf_exit(&buf); - while (i > w->inodes.data && i[-1].snapshot > snapshot) + while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; @@ -786,10 +739,10 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, return lookup_inode_for_snapshot(trans->c, w, k); } -static int __get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -799,19 +752,17 @@ static int __get_visible_inodes(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; - if (!ref_visible(c, s, s->pos.snapshot, equiv)) + 
if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) continue; if (bkey_is_inode(k.k)) add_inode(c, w, k); - if (equiv >= s->pos.snapshot) + if (k.k->p.snapshot >= s->pos.snapshot) break; } bch2_trans_iter_exit(trans, &iter); @@ -832,7 +783,7 @@ static int check_key_has_snapshot(struct btree_trans *trans, "key in missing snapshot: %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -861,8 +812,8 @@ static int hash_redo_key(struct btree_trans *trans, bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, - BCH_HASH_SET_MUST_CREATE, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + STR_HASH_must_create| + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -891,7 +842,7 @@ static int hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc.btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; @@ -1233,7 +1184,7 @@ int bch2_check_inodes(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_inode(trans, &iter, k, &prev, &s, full))); @@ -1362,8 +1313,8 @@ static int overlapping_extents_found(struct btree_trans *trans, BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) @@ -1425,7 +1376,7 @@ static int overlapping_extents_found(struct btree_trans *trans, trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + BTREE_UPDATE_internal_snapshot_node, k1, k2) ?: bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); @@ -1466,7 +1417,6 @@ static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, struct extent_ends *extent_ends, struct bkey_s_c k, - u32 equiv, struct btree_iter *iter, bool *fixed) { @@ -1535,11 +1485,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv = k.k->p; int ret = 0; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? 
ret : 0; @@ -1589,8 +1536,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, - equiv.snapshot, iter, + ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) goto err; @@ -1607,8 +1553,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (; inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > equiv.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; if (k.k->type != KEY_TYPE_whiteout) { @@ -1625,7 +1571,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; @@ -1652,7 +1598,7 @@ fsck_err: bch_err_fn(c, ret); return ret; delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -1673,7 +1619,7 @@ int bch2_check_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1698,7 +1644,7 @@ int bch2_check_indirect_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1767,6 +1713,15 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, if (inode_points_to_dirent(target, d)) return 0; + if (bch2_inode_should_have_bp(target) && + !fsck_err(c, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto out_noiter; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; @@ -1835,6 +1790,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); +out_noiter: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2052,7 +2008,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv; int ret = 0; ret = check_key_has_snapshot(trans, iter, k); @@ -2061,9 +2016,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -2104,7 +2056,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -2140,14 +2092,13 @@ static 
int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } else { - ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) goto err; if (fsck_err_on(!target->inodes.nr, c, dirent_to_missing_inode, - "dirent points to missing inode: (equiv %u)\n%s", - equiv.snapshot, + "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -2164,7 +2115,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) i->count++; } out: @@ -2191,7 +2142,7 @@ int bch2_check_dirents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2255,7 +2206,7 @@ int bch2_check_xattrs(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2422,7 +2373,7 @@ int bch2_check_subvolume_structure(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_path(trans, &iter, k))); bch_err_fn(c, ret); @@ -2457,7 +2408,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; - u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); + u32 snapshot = inode_k.k->p.snapshot; int ret = 0; p->nr = 0; @@ -2559,9 +2510,9 @@ int bch2_check_directory_structure(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ if (!bkey_is_inode(k.k)) continue; @@ -2661,9 +2612,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ if (!bkey_is_inode(k.k)) continue; @@ -2704,9 +2655,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); if (ret) break; @@ -2717,8 +2668,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links if (d.v->d_type != DT_DIR && d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); + le64_to_cpu(d.v->d_inum), 
d.k->p.snapshot); } 0; }))); @@ -2781,7 +2731,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { @@ -2849,7 +2799,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, u->v.front_pad = 0; u->v.back_pad = 0; - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); } int bch2_fix_reflink_p(struct bch_fs *c) @@ -2860,8 +2810,8 @@ int bch2_fix_reflink_p(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0f95d7fb5ec0..aafa79fa6351 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -339,7 +339,7 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); + flags|BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -371,7 +371,7 @@ int bch2_inode_peek(struct btree_trans *trans, int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_inode_buf *inode_p; @@ -399,7 +399,7 @@ int __bch2_fsck_write_inode(struct btree_trans *trans, return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } int bch2_fsck_write_inode(struct btree_trans *trans, @@ -473,7 +473,7 @@ fsck_err: } int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -490,7 +490,7 @@ fsck_err: } int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -507,7 +507,7 @@ fsck_err: } int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); @@ -535,29 +535,19 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o", inode->bi_mode); - prt_newline(out); + prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)", inode->bi_flags); - prt_newline(out); + prt_printf(out, " (%x)\n", inode->bi_flags); - prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); - prt_newline(out); - - prt_printf(out, "bi_size=%llu", inode->bi_size); - prt_newline(out); - - 
prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); - prt_newline(out); - - prt_printf(out, "bi_version=%llu", inode->bi_version); - prt_newline(out); + prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "bi_size=%llu\n", inode->bi_size); + prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); + prt_printf(out, "bi_version=%llu\n", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, #_name "=%llu", (u64) inode->_name); \ - prt_newline(out); + prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x printbuf_indent_sub(out, 2); @@ -604,11 +594,11 @@ int bch2_trigger_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { if (nr) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) @@ -627,13 +617,13 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); @@ -645,7 +635,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -762,8 +752,8 @@ int bch2_inode_create(struct btree_trans *trans, pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -824,7 +814,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); while (1) { bch2_trans_begin(trans); @@ -846,7 +836,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -895,7 +885,7 @@ retry: k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); + BTREE_ITER_intent|BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto err; @@ -1055,7 +1045,7 @@ retry: bch2_trans_begin(trans); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + SPOS(0, inum, snapshot), BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1100,7 +1090,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bch_inode_unpacked inode; int ret; - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -1152,7 +1142,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, inode.bi_flags &= ~BCH_INODE_unlinked; 
ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch_err_msg(c, ret, "clearing inode unlinked flag"); if (ret) goto out; @@ -1199,7 +1189,7 @@ again: * flushed and we'd spin: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 056298050550..679f5f5e5d15 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,19 +6,20 @@ #include "bkey_methods.h" #include "opts.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ @@ -49,7 +50,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ @@ -101,7 +102,7 @@ int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_update_flags); + struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); static inline int bch2_inode_write(struct btree_trans *trans, struct btree_iter *iter, @@ -220,6 +221,14 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +{ + bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; + + return S_ISDIR(inode->bi_mode) || + (!inode->bi_nlink && inode_has_bp); +} + struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 82f9170dab3f..4ec979b4b23e 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -198,7 +198,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, start), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); 
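/*
 * i_sectors_delta accumulates the change in allocated sectors (negative
 * for a punch), so the caller can apply it to its inode accounting
 * afterwards rather than re-walking the extents it just deleted.
 */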
@@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, struct bch_inode_unpacked inode_u; int ret; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?: (inode_u.bi_size = new_i_size, 0) ?: bch2_inode_write(trans, &iter, &inode_u); @@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); bch2_trans_iter_exit(trans, &fpunch_iter); @@ -317,7 +317,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset offset <<= 9; len <<= 9; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -365,7 +365,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); switch (op->v.state) { case LOGGED_OP_FINSERT_start: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 8a556e6d1ab6..f57486794484 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -378,7 +378,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + rbio->read_pos, BTREE_ITER_slots); retry: rbio->bio.bi_status = 0; @@ -487,7 +487,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out; @@ -523,7 +523,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -541,7 +541,6 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -647,13 +646,15 @@ csum_err: prt_str(&buf, "data "); bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data %s", buf.buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } printbuf_exit(&buf); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: @@ -675,7 +676,7 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; @@ -687,17 +688,21 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + if (bio->bi_status) { + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status)); + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + } bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -758,22 +763,21 @@ err: } static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c k, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); + PTR_BUCKET_POS(ca, &ptr), + BTREE_ITER_cached); - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); - prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); @@ -801,7 +805,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -832,7 +835,7 @@ retry_pick: goto err; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -842,9 +845,11 @@ retry_pick: */ if ((flags & BCH_READ_IN_RETRY) && !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick); + percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -859,8 +864,11 @@ retry_pick: * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + if 
(pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (ca) + percpu_ref_put(&ca->io_ref); goto hole; + } iter.bi_size = pick.crc.compressed_size << 9; goto get_bio; @@ -965,7 +973,7 @@ get_bio: rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; rbio->retry = 0; @@ -995,7 +1003,7 @@ get_bio: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1113,7 +1121,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 40d7df7607df..9401d13e31bb 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { + new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -210,14 +210,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, * to be journalled - if we crash, the bi_journal_seq update will be * lost, but that's fine. */ - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + unsigned inode_update_flags = BTREE_UPDATE_nojournal; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -259,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); @@ -368,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, @@ -407,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = nocow + ? bch2_dev_have_ref(c, ptr->dev) + : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? 
READ : WRITE); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -430,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? READ : WRITE); + n->have_ioref = ca != NULL; n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); + if (nocow) + n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { @@ -481,7 +481,6 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { @@ -650,7 +649,9 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref + ? bch2_dev_have_ref(c, wbio->dev) + : NULL; if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, op->pos.inode, @@ -661,8 +662,12 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } - if (wbio->nocow) + if (wbio->nocow) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + POS(ca->dev_idx, wbio->nocow_bucket), + BUCKET_NOCOW_LOCK_UPDATE); set_bit(wbio->dev, op->devs_need_flush->d); + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -1101,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, return false; e = bkey_s_c_to_extent(k); + + rcu_read_lock(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) + if (crc_is_encoded(p.crc) || p.has_ec) { + rcu_read_unlock(); return false; + } replicas += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); return replicas >= op->opts.data_replicas; } -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *orig, @@ -1158,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0) ?: bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) @@ -1169,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) for_each_keylist_key(&op->insert_keys, orig) { int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 
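		/*
		 * Editor's note: with bch2_nocow_write_unlock() deleted above
		 * (and its call removed from __bch2_nocow_write_done() below),
		 * each bucket's nocow lock is now dropped per-replica in
		 * bch2_write_endio(), keyed by the nocow_bucket recorded on
		 * the wbio at submit time, rather than by re-walking
		 * op->insert_keys after all the writes complete.
		 */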
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1195,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - bch2_nocow_write_unlock(op); - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { op->error = -EIO; } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) @@ -1242,12 +1236,16 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bio *bio = &op->wbio.bio; buckets.nr = 0; + ret = bch2_trans_relock(trans); + if (ret) + break; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1267,14 +1265,15 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bpos b = PTR_BUCKET_POS(c, ptr); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (unlikely(!ca)) + goto err_get_ioref; + + struct bpos b = PTR_BUCKET_POS(ca, ptr); struct nocow_lock_bucket *l = bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); prefetch(l); - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - /* XXX allocating memory with btree locks held - rare */ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .gen = ptr->gen, .l = l, @@ -1293,7 +1292,7 @@ retry: bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); + struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, bucket_to_u64(i->b), @@ -1370,7 +1369,7 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1491,7 +1490,11 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_DONE) && !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); + if (closure_sync_timeout(&op->cl, HZ * 10)) { + bch2_print_allocator_stuck(c); + closure_sync(&op->cl); + } + __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) @@ -1649,8 +1652,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); + prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } @@ -1658,13 +1660,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; if (mempool_init_page_pool(&c->bio_bounce_pages, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index c7f97c2c4805..6e878a6f2f0b 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -20,6 +20,7 @@ struct 
bch_write_bio { u64 submit_time; u64 inode_offset; + u64 nocow_bucket; struct bch_devs_list failed; u8 dev; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a8b08e76d0d0..adec8e1ea73e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -53,29 +53,19 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_str(out, "seq:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); + prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_str(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); - prt_str(out, "size:"); - prt_tab(out); + prt_printf(out, "size:\t"); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_str(out, "expires:"); - prt_tab(out); - prt_printf(out, "%li jiffies", buf->expires - jiffies); - prt_newline(out); + prt_printf(out, "expires:\t"); + prt_printf(out, "%li jiffies\n", buf->expires - jiffies); - prt_str(out, "flags:"); - prt_tab(out); + prt_printf(out, "flags:\t"); if (buf->noflush) prt_str(out, "noflush "); if (buf->must_flush) @@ -87,9 +77,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 if (buf->write_started) prt_str(out, "write_started "); if (buf->write_allocated) - prt_str(out, "write allocated "); + prt_str(out, "write_allocated "); if (buf->write_done) - prt_str(out, "write done"); + prt_str(out, "write_done"); prt_newline(out); printbuf_indent_sub(out, 2); @@ -948,7 +938,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, + BCH_DATA_journal, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; @@ -956,7 +947,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size)); + ca->mi.bucket_size, BTREE_TRIGGER_transactional)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); @@ -1036,7 +1027,8 @@ err_unblock: for (i = 0; i < nr_got; i++) bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0)); + bu[i], BCH_DATA_free, 0, + BTREE_TRIGGER_transactional)); err_free: if (!new_fs) for (i = 0; i < nr_got; i++) @@ -1187,12 +1179,14 @@ void bch2_fs_journal_stop(struct journal *j) bch2_journal_meta(j); journal_quiesce(j); + cancel_delayed_work_sync(&j->write_work); BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + test_bit(JOURNAL_replay_done, &j->flags) && j->last_empty_seq != journal_cur_seq(j)); - cancel_delayed_work_sync(&j->write_work); + if (!bch2_journal_error(j)) + clear_bit(JOURNAL_running, &j->flags); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq) @@ -1266,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); @@ -1407,6 +1401,13 @@ int bch2_fs_journal_init(struct journal *j) /* debug: */ +static const char * const bch2_journal_flags_strs[] = { +#define x(n) #n, 
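	/*
	 * Editor's note: this x-macro expands the JOURNAL_FLAGS() list from
	 * journal_types.h into a NULL-terminated array of name strings -
	 * { "replay_done", "running", "may_skip_flush", ... } - built from
	 * the same list that generates the JOURNAL_* enum, so prt_bitflags()
	 * can decode j->flags by bit index without the two ever drifting
	 * apart.
	 */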
+ JOURNAL_FLAGS() +#undef x + NULL +}; + void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1415,19 +1416,22 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); s = READ_ONCE(j->reservations); + prt_printf(out, "flags:\t"); + prt_bitflags(out, bch2_journal_flags_strs, j->flags); + prt_newline(out); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1436,48 +1440,44 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_newline(out); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t\t%u\n", j->blocked); + prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); - prt_printf(out, "current entry:\t\t"); + prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error"); + prt_printf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed"); + prt_printf(out, "closed\n"); break; default: - prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - prt_newline(out); - prt_printf(out, "unwritten entries:"); - prt_newline(out); + prt_printf(out, "unwritten entries:\n"); bch2_journal_bufs_to_text(out, j); - prt_printf(out, - "replay done:\t\t%i\n", - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - prt_printf(out, "space:\n"); - prt_printf(out, "\tdiscarded\t%u:%u\n", + printbuf_indent_add(out, 2); + prt_printf(out, "discarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - prt_printf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "clean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - prt_printf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "clean\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - prt_printf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "total\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); + printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1488,14 +1488,16 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "\tnr\t\t%u\n", ja->nr); - prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", ca->dev_idx); + printbuf_indent_add(out, 2); + prt_printf(out, "nr\t%u\n", ja->nr); + prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + printbuf_indent_sub(out, 2); } rcu_read_unlock(); @@ -1527,25 +1529,18 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 pin_list = journal_seq_pin(j, *seq); - prt_printf(out, "%llu: count %u", *seq, 
atomic_read(&pin_list->count)); - prt_newline(out); + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); printbuf_indent_add(out, 2); for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->list[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); - if (!list_empty(&pin_list->flushed)) { - prt_printf(out, "flushed:"); - prt_newline(out); - } + if (!list_empty(&pin_list->flushed)) + prt_printf(out, "flushed:\n"); - list_for_each_entry(pin, &pin_list->flushed, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->flushed, list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); printbuf_indent_sub(out, 2); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 7c7528f839c5..fd1f7cdaa8bc 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -372,7 +372,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re int ret; EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); res->u64s = u64s; @@ -418,8 +418,8 @@ struct bch_dev; static inline void bch2_journal_set_replay_done(struct journal *j) { - BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - set_bit(JOURNAL_REPLAY_DONE, &j->flags); + BUG_ON(!test_bit(JOURNAL_running, &j->flags)); + set_bit(JOURNAL_replay_done, &j->flags); } void bch2_journal_unblock(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index eb1f9d6f5a19..cdcb1ad49af4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,15 +17,38 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + for_each_member_device(c, ca) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); + m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); + } +} + +void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + + unsigned idx = le32_to_cpu(m.last_journal_bucket); + if (idx < ca->journal.nr) + ca->journal.cur_idx = idx; + unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); + if (offset <= ca->mi.bucket_size) + ca->journal.sectors_free = ca->mi.bucket_size - offset; + } + mutex_unlock(&c->sb_lock); +} + void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - u64 offset; - - div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - if (i != j->ptrs.data) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", @@ -122,6 +145,10 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (!c->journal.oldest_seq_found_ondisk || + le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) + c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); + /* Is this entry older than the range we need? 
*/ if (!c->opts.read_entire_journal && le64_to_cpu(j->seq) < jlist->last_seq) @@ -272,7 +299,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BKEY_INVALID_WRITE) { \ + switch (flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -301,9 +328,9 @@ static int journal_validate_key(struct bch_fs *c, unsigned level, enum btree_id btree_id, struct bkey_i *k, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { - int write = flags & BKEY_INVALID_WRITE; + int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; @@ -376,7 +403,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct bkey_i *k = entry->start; @@ -385,7 +412,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - flags|BKEY_INVALID_JOURNAL); + flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; @@ -416,7 +443,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct bkey_i *k = entry->start; int ret = 0; @@ -455,7 +482,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { /* obsolete, don't care: */ return 0; @@ -470,7 +497,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { int ret = 0; @@ -497,7 +524,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -539,7 +566,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -573,7 +600,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -617,7 +644,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -657,13 +684,12 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct 
jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned expected = sizeof(*u); - unsigned dev; int ret = 0; if (journal_entry_err_on(bytes < expected, @@ -675,16 +701,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, return ret; } - dev = le32_to_cpu(u->dev); - - if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, version, jset, entry, - journal_entry_dev_usage_bad_dev, - "bad dev")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - if (journal_entry_err_on(u->pad, c, version, jset, entry, journal_entry_dev_usage_bad_pad, @@ -719,7 +735,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return 0; } @@ -737,7 +753,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -753,7 +769,7 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -769,7 +785,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -799,7 +815,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -817,7 +833,7 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, @@ -837,7 +853,7 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -863,7 +879,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version; int ret = 0; @@ -918,7 +934,7 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) @@ -1057,6 +1073,13 @@ reread: goto err; } + if (le64_to_cpu(j->seq) > ja->highest_seq_found) { + ja->highest_seq_found = le64_to_cpu(j->seq); + ja->cur_idx = bucket; + ja->sectors_free = ca->mi.bucket_size - + bucket_remainder(ca, offset) - sectors; + } + /* * This happens sometimes if we don't have discards on - * when we've partially overwritten a bucket with new @@ -1125,8 +1148,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct journal_replay *r, **_r; - struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; unsigned i; int ret = 0; @@ -1146,47 +1167,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto err; } - ja->sectors_free = ca->mi.bucket_size; - - mutex_lock(&jlist->lock); - genradix_for_each_reverse(&c->journal_entries, iter, _r) { - r = *_r; - - if (!r) - continue; - - darray_for_each(r->ptrs, i) - if (i->dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, i->sector) + - vstruct_sectors(&r->j, c->block_bits); - - ja->cur_idx = i->bucket; - ja->sectors_free = ca->mi.bucket_size - wrote; - goto found; - } - } -found: - mutex_unlock(&jlist->lock); - - if (ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size) { -#if 0 - /* - * Debug code for ZNS support, where we (probably) want to be - * correlated where we stopped in the journal to the zone write - * points: - */ - bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); - bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); - for (i = 0; i < 3; i++) { - unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; - - bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); - } -#endif - ja->sectors_free = 0; - } - /* * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't @@ -1255,7 +1235,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; i = *_i; @@ -1366,7 +1346,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" " prev at %s\n" - " next at %s", + " next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1390,7 +1370,7 @@ int bch2_journal_read(struct bch_fs *c, continue; darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (!ptr->csum_good) bch_err_dev_offset(ca, ptr->sector, @@ -1400,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c, } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs.data[0].dev), + bch2_dev_have_ref(c, i->ptrs.data[0].dev), &i->j, i->ptrs.data[0].sector, READ); @@ -1731,10 +1711,8 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct journal_device *ja = &ca->journal; - - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (!ca) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); continue; @@ -1743,6 +1721,7 @@ static CLOSURE_CALLBACK(journal_write_submit) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); + struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; @@ -1958,14 +1937,14 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * So if we're in an error state, and we're still starting up, we don't * write anything at all. */ - if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) return -EIO; if (error || w->noflush || (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; @@ -1976,7 +1955,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + clear_bit(JOURNAL_need_flush_write, &j->flags); } return 0; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4f1e763ab506..2ca9cde30ea8 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -4,6 +4,9 @@ #include "darray.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *); +void bch2_journal_pos_from_member_info_resume(struct bch_fs *); + struct journal_ptr { bool csum_good; u8 dev; @@ -60,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 04a577848b01..79be0eaddfa0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,7 +67,7 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); - mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || 
low_on_pin); + mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); swap(watermark, j->watermark); if (watermark > j->watermark) @@ -225,9 +225,9 @@ void bch2_journal_space_available(struct journal *j) j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + set_bit(JOURNAL_may_skip_flush, &j->flags); else - clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + clear_bit(JOURNAL_may_skip_flush, &j->flags); bch2_journal_set_watermark(j); out: @@ -818,7 +818,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_replay_done, &j->flags) || journal_last_seq(j) > seq_to_flush || !fifo_used(&j->pin); @@ -833,7 +833,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) /* time_stats this */ bool did_work = false; - if (!test_bit(JOURNAL_STARTED, &j->flags)) + if (!test_bit(JOURNAL_running, &j->flags)) return false; closure_wait_event(&j->async_wait, diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index ae4fb8c3a2bc..db80e506e3ab 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); @@ -99,9 +98,8 @@ static int u64_range_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } -static int bch2_sb_journal_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 37a024e034d4..ed4846709611 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_iter.h" #include "eytzinger.h" +#include "journal.h" #include "journal_seq_blacklist.h" #include "super-io.h" @@ -162,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); @@ -217,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .to_text = bch2_sb_journal_seq_blacklist_to_text }; -void bch2_blacklist_entries_gc(struct work_struct *work) +bool bch2_blacklist_entries_gc(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct 
journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans *trans = bch2_trans_get(c); - unsigned i, nr, new_nr; - int ret; - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_stopping, &c->flags)) - b = bch2_btree_iter_next_node(&iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_iter_exit(trans, &iter); - } - - bch2_trans_put(trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) - goto out; + return false; - nr = blacklist_nr_entries(bl); + unsigned nr = blacklist_nr_entries(bl); dst = bl->start; - t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); + unsigned i; for (src = bl->start, i = eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - if (t->entries[i].dirty) + if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; } - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); + unsigned new_nr = dst - bl->start; + if (new_nr == nr) + return false; - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + return true; } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e25..d47636f96fdc 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); +bool bch2_blacklist_entries_gc(struct bch_fs *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index b5161b5d76a0..19183fcf7ad7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -129,12 +129,17 @@ enum journal_space_from { journal_space_nr, }; +#define JOURNAL_FLAGS() \ + x(replay_done) \ + x(running) \ + x(may_skip_flush) \ + x(need_flush_write) \ + x(space_low) + enum journal_flags { - JOURNAL_REPLAY_DONE, - JOURNAL_STARTED, - JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NEED_FLUSH_WRITE, - JOURNAL_SPACE_LOW, +#define x(n) JOURNAL_##n, + JOURNAL_FLAGS() +#undef x }; /* Reasons we may fail to get a journal reservation: */ @@ -229,6 +234,7 @@ struct journal { u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; + u64 oldest_seq_found_ondisk; /* * FIFO of journal entries whose btree updates have not yet been @@ -326,6 +332,7 @@ struct journal_device { /* for bch_journal_read_device */ struct closure read; + u64 highest_seq_found; }; /* diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index b82f8209041f..f49fdca1d07d 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -56,7 +56,7 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_logged_ops, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 26569043e368..a40d116224ed 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -11,7 +11,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -149,7 +149,7 @@ int bch2_check_lrus(struct bch_fs *c) struct bpos last_flushed_pos = POS_MIN; int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 429dca816df5..fb11ab0dd00e 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -49,7 +49,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) } int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 69098eeb5d48..ddc187fb693d 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, iter, 
&k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ @@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) continue; ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) @@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4d94b7742dbb..8171f947fac8 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -41,28 +41,23 @@ static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c struct data_update_opts *data_opts) { printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:"); - prt_tab(out); + prt_str(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); prt_newline(out); - prt_str(out, "kill ptrs: "); - prt_tab(out); + prt_str(out, "kill ptrs:\t"); bch2_prt_u64_base2(out, data_opts->kill_ptrs); prt_newline(out); - prt_str(out, "target: "); - prt_tab(out); + prt_str(out, "target:\t"); bch2_target_to_text(out, c, data_opts->target); prt_newline(out); - prt_str(out, "compression: "); - prt_tab(out); + prt_str(out, "compression:\t"); bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); - prt_str(out, "extra replicas: "); - prt_tab(out); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } @@ -421,7 +416,7 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, io_opts->d.nr = 0; ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -467,7 +462,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -553,8 +548,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, } bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -695,6 +690,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + if (!ca) + return 0; + trace_bucket_evacuate(c, &bucket); bch2_bkey_buf_init(&sk); @@ -705,7 +704,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); 
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); + bucket, BTREE_ITER_cached); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); @@ -716,7 +715,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, a = bch2_alloc_to_v4(k, &a_convert); dirty_sectors = bch2_bucket_sectors_dirty(*a); - bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + bucket_size = ca->mi.bucket_size; fragmentation = a->fragmentation_lru; ret = bch2_btree_write_buffer_tryflush(trans); @@ -730,9 +729,9 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, bucket, gen, + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, &bp_pos, &bp, - BTREE_ITER_CACHED); + BTREE_ITER_cached); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -828,6 +827,7 @@ next: trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: + bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -868,7 +868,7 @@ static int bch2_move_btree(struct bch_fs *c, continue; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), @@ -975,26 +975,10 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg, */ static bool bformat_needs_redo(struct bkey_format *f) { - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) - return true; - - if ((f_bits == unpacked_bits) && field_offset) + for (unsigned i = 0; i < f->nr_fields; i++) + if (bch2_bkey_format_field_overflows(f, i)) return true; - u64 f_mask = f_bits - ? 
~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - } - return false; } @@ -1049,6 +1033,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1059,6 +1044,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } + rcu_read_unlock(); return data_opts->kill_ptrs != 0; } @@ -1143,23 +1129,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "keys moved: "); - prt_u64(out, atomic64_read(&stats->keys_moved)); - prt_newline(out); - - prt_str(out, "keys raced: "); - prt_u64(out, atomic64_read(&stats->keys_raced)); - prt_newline(out); - - prt_str(out, "bytes seen: "); + prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_str(out, "bytes moved: "); + prt_printf(out, "bytes moved: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_str(out, "bytes raced: "); + prt_printf(out, "bytes raced: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1173,19 +1153,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); - prt_printf(out, "reads: ios %u/%u sectors %u/%u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->read_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->read_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); - prt_printf(out, "writes: ios %u/%u sectors %u/%u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->write_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->write_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0d2b82d8d11f..10bfb31c151b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -84,7 +84,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_CACHED); + b->k.bucket, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -158,6 +158,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; + bch2_trans_begin(trans); + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 84e452835a17..25530e0bb2f3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -426,11 +426,6 @@ enum fsck_err_opts { BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ - x(buckets_nouse, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allocate the buckets_nouse bitmap") \ x(stdio, u64, \ 0, \ OPT_UINT(0, 
S64_MAX), \ @@ -480,7 +475,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index b27d22925929..8b0369185f5c 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -10,35 +10,50 @@ #include "printbuf.h" -static inline unsigned printbuf_linelen(struct printbuf *buf) +static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) { - return buf->pos - buf->last_newline; + return pos - buf->last_newline; } -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +static inline unsigned printbuf_linelen(struct printbuf *buf) { - unsigned new_size; - char *buf; + return __printbuf_linelen(buf, buf->pos); +} - if (!out->heap_allocated) - return 0; +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ /* Reserved space for terminating nul: */ extra += 1; - if (out->pos + extra < out->size) + if (out->pos + extra <= out->size) return 0; - new_size = roundup_pow_of_two(out->size + extra); + if (!out->heap_allocated) { + out->overflow = true; + return 0; + } + + unsigned new_size = roundup_pow_of_two(out->size + extra); /* * Note: output buffer must be freeable with kfree(), it's not required * that the user use printbuf_exit(). */ - buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + char *buf = krealloc(out->buf, new_size, !out->atomic ?
GFP_KERNEL : GFP_NOWAIT); if (!buf) { out->allocation_failure = true; + out->overflow = true; return -ENOMEM; } @@ -47,6 +62,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) return 0; } +static void printbuf_advance_pos(struct printbuf *out, unsigned len) +{ + out->pos += min(len, printbuf_remaining(out)); +} + +static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) +{ + unsigned move = out->pos - pos; + + bch2_printbuf_make_room(out, nr); + + if (pos + nr < out->size) + memmove(out->buf + pos + nr, + out->buf + pos, + min(move, out->size - 1 - pos - nr)); + + if (pos < out->size) + memset(out->buf + pos, ' ', min(nr, out->size - pos)); + + printbuf_advance_pos(out, nr); + printbuf_nul_terminate_reserved(out); +} + +static void __printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + while (true) { + int pad; + unsigned len = out->pos - pos; + char *p = out->buf + pos; + char *n = memscan(p, '\n', len); + if (cur_tabstop(out)) { + n = min(n, (char *) memscan(p, '\r', len)); + n = min(n, (char *) memscan(p, '\t', len)); + } + + pos = n - out->buf; + if (pos == out->pos) + break; + + switch (*n) { + case '\n': + pos++; + out->last_newline = pos; + + printbuf_insert_spaces(out, pos, out->indent); + + pos = min(pos + out->indent, out->pos); + out->last_field = pos; + out->cur_tabstop = 0; + break; + case '\r': + memmove(n, n + 1, out->pos - pos); + --out->pos; + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); + if (pad > 0) { + printbuf_insert_spaces(out, out->last_field, pad); + pos += pad; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + case '\t': + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; + if (pad > 0) { + *n = ' '; + printbuf_insert_spaces(out, pos, pad - 1); + pos += pad; + } else { + memmove(n, n + 1, out->pos - pos); + --out->pos; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + } + } +} + +static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) + __printbuf_do_indent(out, pos); +} + void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { int len; @@ -55,14 +156,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_list args2; va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); va_end(args2); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) @@ -72,14 +173,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) do { va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); va_end(args); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? 
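bch2_printbuf_make_room() now reserves the terminating nul up front and, when the buffer is not heap allocated, records truncation in the new overflow bit instead of pretending there is room. Callers printing into fixed-size buffers can test for it; a minimal sketch, assuming the existing PRINTBUF_EXTERN initializer for stack buffers (long_message is a stand-in):

	char stack[128];
	struct printbuf buf = PRINTBUF_EXTERN(stack, sizeof(stack));

	prt_printf(&buf, "%s", long_message);

	/* non-heap buffer: never reallocated, truncation just sets the flag */
	if (buf.overflow)
		pr_warn("printbuf truncated: %s\n", buf.buf);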
printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } /** @@ -194,33 +295,20 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) void bch2_prt_newline(struct printbuf *buf) { - unsigned i; - bch2_printbuf_make_room(buf, 1 + buf->indent); - __prt_char(buf, '\n'); + __prt_char_reserved(buf, '\n'); buf->last_newline = buf->pos; - for (i = 0; i < buf->indent; i++) - __prt_char(buf, ' '); + __prt_chars_reserved(buf, ' ', buf->indent); - printbuf_nul_terminate(buf); + printbuf_nul_terminate_reserved(buf); buf->last_field = buf->pos; buf->cur_tabstop = 0; } -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < buf->nr_tabstops - ? buf->_tabstops[buf->cur_tabstop] - : 0; -} - static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); @@ -247,24 +335,9 @@ void bch2_prt_tab(struct printbuf *out) static void __prt_tab_rjust(struct printbuf *buf) { - unsigned move = buf->pos - buf->last_field; int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - - if (pad > 0) { - bch2_printbuf_make_room(buf, pad); - - if (buf->last_field + pad < buf->size) - memmove(buf->buf + buf->last_field + pad, - buf->buf + buf->last_field, - min(move, buf->size - 1 - buf->last_field - pad)); - - if (buf->last_field < buf->size) - memset(buf->buf + buf->last_field, ' ', - min((unsigned) pad, buf->size - buf->last_field)); - - buf->pos += pad; - printbuf_nul_terminate(buf); - } + if (pad > 0) + printbuf_insert_spaces(buf, buf->last_field, pad); buf->last_field = buf->pos; buf->cur_tabstop++; @@ -301,41 +374,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf) */ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { - const char *unprinted_start = str; - const char *end = str + count; - - if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { - prt_bytes(out, str, count); - return; - } - - while (str != end) { - switch (*str) { - case '\n': - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - bch2_prt_newline(out); - break; - case '\t': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab(out); - } - break; - case '\r': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab_rjust(out); - } - break; - } - - str++; - } - - prt_bytes(out, unprinted_start, str - unprinted_start); + unsigned indent_pos = out->pos; + prt_bytes(out, str, count); + printbuf_do_indent(out, indent_pos); } /** @@ -348,9 +389,10 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { bch2_printbuf_make_room(out, 10); - out->pos += string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); + unsigned len = string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); + printbuf_advance_pos(out, len); } /** @@ -402,9 +444,7 @@ void bch2_prt_string_option(struct printbuf *out, const char * const list[], size_t selected) { - size_t i; - - for (i = 0; list[i]; i++) + for (size_t i = 0; list[i]; i++) bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); } diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9a4a56c40937..9ecc56bc9635 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -86,6 +86,7 @@ struct printbuf { u8 atomic; bool allocation_failure:1; bool heap_allocated:1; + bool overflow:1; enum printbuf_si si_units:1; bool human_readable_units:1; bool has_indent_or_tabstops:1; @@ -142,7 +143,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], */ static inline unsigned printbuf_remaining_size(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos : 0; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + return out->size - out->pos; } /* @@ -151,7 +154,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out) */ static inline unsigned printbuf_remaining(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos - 1 : 0; + return out->size ? printbuf_remaining_size(out) - 1 : 0; } static inline unsigned printbuf_written(struct printbuf *out) @@ -159,30 +162,25 @@ static inline unsigned printbuf_written(struct printbuf *out) return out->size ? min(out->pos, out->size - 1) : 0; } -/* - * Returns true if output was truncated: - */ -static inline bool printbuf_overflowed(struct printbuf *out) +static inline void printbuf_nul_terminate_reserved(struct printbuf *out) { - return out->pos >= out->size; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + if (out->size) + out->buf[out->pos] = 0; } static inline void printbuf_nul_terminate(struct printbuf *out) { bch2_printbuf_make_room(out, 1); - - if (out->pos < out->size) - out->buf[out->pos] = 0; - else if (out->size) - out->buf[out->size - 1] = 0; + printbuf_nul_terminate_reserved(out); } /* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) - out->buf[out->pos] = c; - out->pos++; + out->buf[out->pos++] = c; } /* Doesn't nul terminate: */ @@ -194,37 +192,34 @@ static inline void __prt_char(struct printbuf *out, char c) static inline void prt_char(struct printbuf *out, char c) { - __prt_char(out, c); - printbuf_nul_terminate(out); + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, c); + printbuf_nul_terminate_reserved(out); } static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) { - unsigned i, can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = c; - out->pos += n - can_print; } static inline void prt_chars(struct printbuf *out, char c, unsigned n) { bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { - unsigned i, can_print; - bch2_printbuf_make_room(out, n); - can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = ((char *) b)[i]; - out->pos += n - can_print; printbuf_nul_terminate(out); } @@ -241,18 +236,18 @@ static inline void prt_str_indented(struct printbuf *out, const char *str) static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + 
bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } /** diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 556da0738106..a0cca8b70e0a 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = { }; static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); @@ -60,8 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "i_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", i->i_fieldmask); - prt_newline(out); - - prt_str(out, "i_flags"); - prt_tab(out); - prt_printf(out, "%u", i->i_flags); - prt_newline(out); - - prt_str(out, "i_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_ino_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_timelimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_warnlimit); - prt_newline(out); - - prt_str(out, "i_ino_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_warnlimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_warnlimit); - prt_newline(out); + prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); + prt_printf(out, "i_flags\t%u\n", i->i_flags); + prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); + prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); + prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); + prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); + prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); + prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); } static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) @@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "d_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", q->d_fieldmask); - prt_newline(out); - - prt_str(out, "d_spc_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_hardlimit); - prt_newline(out); - - prt_str(out, "d_spc_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_softlimit); - prt_newline(out); - - prt_str(out, "d_ino_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_hardlimit); - prt_newline(out); - - prt_str(out, "d_ino_softlimit"); - prt_tab(out); - 
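prt_hex_byte() above illustrates the contract of the _reserved helpers: make a single bch2_printbuf_make_room() call covering everything about to be emitted - here three bytes, two hex digits plus the terminating nul - then write with the non-allocating variants. A hypothetical helper following the same pattern:

/* hypothetical example, not in the tree: emit a "0x" prefix */
static inline void prt_hex_prefix(struct printbuf *out)
{
	bch2_printbuf_make_room(out, 3);	/* '0' + 'x' + nul */
	__prt_char_reserved(out, '0');
	__prt_char_reserved(out, 'x');
	printbuf_nul_terminate_reserved(out);
}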
prt_printf(out, "%llu", q->d_ino_softlimit); - prt_newline(out); - - prt_str(out, "d_space"); - prt_tab(out); - prt_printf(out, "%llu", q->d_space); - prt_newline(out); - - prt_str(out, "d_ino_count"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_count); - prt_newline(out); - - prt_str(out, "d_ino_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_timer); - prt_newline(out); - - prt_str(out, "d_spc_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_timer); - prt_newline(out); - - prt_str(out, "d_ino_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_ino_warns); - prt_newline(out); - - prt_str(out, "d_spc_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_spc_warns); - prt_newline(out); + prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); + prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); + prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); + prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit); + prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); + prt_printf(out, "d_space\t%llu\n", q->d_space); + prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); + prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); + prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); + prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); + prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); } static inline unsigned __next_qtype(unsigned i, unsigned qtypes) @@ -610,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, bch2_fs_quota_read_inode(trans, &iter, k))); bch_err_fn(c, ret); return ret; @@ -900,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bkey_err(k); if (unlikely(ret)) return ret; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 884f601f41c4..02d37a332218 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,11 +5,11 @@ #include "inode.h" #include "quota_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 56336f3dd1d0..cf81e5128c3a 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -42,7 +42,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -89,7 +89,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = 
bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -140,7 +140,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; @@ -323,12 +323,14 @@ static int do_rebalance(struct moving_context *ctxt) struct bkey_s_c k; int ret = 0; + bch2_trans_begin(trans); + bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); bch2_trans_iter_init(trans, &rebalance_work_iter, BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { if (!r->enabled) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8091d0686029..1266916ac03f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -65,9 +65,20 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); @@ -125,9 +136,9 @@ static int bch2_journal_replay_key(struct btree_trans *trans, { struct btree_iter iter; unsigned iter_flags = - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS; - unsigned update_flags = BTREE_TRIGGER_NORUN; + BTREE_ITER_intent| + BTREE_ITER_not_extents; + unsigned update_flags = BTREE_TRIGGER_norun; int ret; if (k->overwritten) @@ -136,17 +147,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, trans->journal_res.seq = k->journal_seq; /* - * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing * besides the allocator is doing updates yet so we don't need key cache * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * btrees use BTREE_ITER_filter_snapshots, which isn't available until * the snapshots recovery pass runs. 
*/ if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED; + iter_flags |= BTREE_ITER_cached; else - update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + update_flags |= BTREE_UPDATE_key_cache_reclaim; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -191,7 +202,7 @@ int bch2_journal_replay(struct bch_fs *c) struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - struct btree_trans *trans = bch2_trans_get(c); + struct btree_trans *trans = NULL; bool immediate_flush = false; int ret = 0; @@ -205,6 +216,7 @@ int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); move_gap(keys, keys->nr); + trans = bch2_trans_get(c); /* * First, attempt to replay keys in sorted order. This is more @@ -361,14 +373,17 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_dev_usage: { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } + unsigned nr_types = jset_entry_dev_usage_nr_types(u); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev)); + if (ca) + for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + rcu_read_unlock(); break; } @@ -597,56 +612,54 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.norecovery) c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; - if (!c->opts.nochanges) { - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + bool write_sb = false; - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + write_sb = true; + } - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + struct printbuf buf = PRINTBUF; + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } - prt_str(&buf, "Version downgrade required:"); + if 
(bch2_check_version_downgrade(c)) { + struct printbuf buf = PRINTBUF; - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } + prt_str(&buf, "Version downgrade required:"); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&buf, "\n running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - if (check_version_upgrade(c)) - write_sb = true; + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + write_sb = true; + } - if (write_sb) - bch2_write_super(c); + if (check_version_upgrade(c)) + write_sb = true; - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - mutex_unlock(&c->sb_lock); - } + if (write_sb) + bch2_write_super(c); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); @@ -660,7 +673,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { + bch2_journal_pos_from_member_info_resume(c); + + if (!c->sb.clean || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -832,8 +847,8 @@ use_clean: } mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; + ext = bch2_sb_field_get(c->disk_sb.sb, ext); + write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); @@ -868,6 +883,9 @@ use_clean: write_sb = true; } + if (bch2_blacklist_entries_gc(c)) + write_sb = true; + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -890,10 +908,6 @@ use_clean: bch_info(c, "scanning for old btree nodes done"); } - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); - ret = 0; out: bch2_flush_fsck_errs(c); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0cec0f7d9703..4a9eb9582b6e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -26,11 +26,6 @@ const char * const bch2_recovery_passes[] = { NULL }; -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, false); -} - static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -227,7 +222,8 @@ int bch2_run_recovery_passes(struct bch_fs *c) if (should_run_recovery_pass(c, c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: + 
bch2_journal_flush(&c->journal); if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || (ret && c->curr_recovery_pass < pass)) continue; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ff7864731a07..9ac6cf21cfbf 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -30,7 +30,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -74,20 +74,20 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i *k; __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_with_updates); ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; @@ -102,7 +102,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", @@ -111,7 +111,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (flags & BTREE_TRIGGER_INSERT) { + if (flags & BTREE_TRIGGER_insert) { struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; u64 pad; @@ -141,12 +141,13 @@ err: } static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags, size_t r_idx) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags, + size_t r_idx) { struct bch_fs *c = trans->c; struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; u64 start = le64_to_cpu(p.v->idx); u64 end = le64_to_cpu(p.v->idx) + p.k->size; u64 next_idx = end + le32_to_cpu(p.v->back_pad); @@ -163,10 +164,13 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, BUG_ON((s64) r->refcount + add < 0); - r->refcount += add; + if (flags & BTREE_TRIGGER_gc) + r->refcount += add; *idx = r->offset; return 0; not_found: + BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); + if (fsck_err(c, reflink_p_to_missing_reflink_v, "pointer to missing indirect extent\n" " %s\n" @@ -189,7 +193,7 @@ not_found: set_bkey_val_u64s(&update->k, 0); } - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; @@ -200,8 +204,8 @@ fsck_err: } static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -210,12 +214,12 @@ static int __trigger_reflink_p(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { size_t l = 0, r = c->reflink_gc_nr; while (l < r) { @@ -238,10 +242,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) { struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; v->front_pad = v->back_pad = 0; @@ -253,7 +257,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return bch2_bkey_ptrs_invalid(c, k, flags, err); @@ -281,23 +285,25 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) +static inline void +check_indirect_extent_deleting(struct bkey_s new, + enum btree_iter_update_trigger_flags *flags) { - if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { new.k->type = KEY_TYPE_deleted; new.k->size = 0; set_bkey_val_u64s(new.k, 0); - *flags &= ~BTREE_TRIGGER_INSERT; + *flags &= ~BTREE_TRIGGER_insert; } } int bch2_trigger_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) check_indirect_extent_deleting(new, &flags); return bch2_trigger_extent(trans, btree_id, level, old, new, flags); @@ -306,7 
+312,7 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return 0; @@ -326,7 +332,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { check_indirect_extent_deleting(new, &flags); @@ -349,7 +355,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_prev(&reflink_iter); ret = bkey_err(k); if (ret) @@ -394,7 +400,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch2_trans_iter_exit(trans, &reflink_iter); @@ -455,9 +461,9 @@ s64 bch2_remap_range(struct bch_fs *c, goto err; bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && @@ -567,7 +573,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(trans); ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_intent); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4d8867289717..e894f3a2c67a 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,15 +2,16 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ @@ -21,11 +22,12 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, }) int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ @@ -36,13 +38,13 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum 
bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, - unsigned); + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 678b9c20e251..bd1d5d085e23 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,7 +84,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_dev_exists(sb, r->devs[i])) { + if (!bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -200,7 +200,7 @@ cpu_replicas_add_entry(struct bch_fs *c, }; for (i = 0; i < new_entry->nr_devs; i++) - BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); + BUG_ON(!bch2_dev_exists(c, new_entry->devs[i])); BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); @@ -860,7 +860,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_replicas_cpu cpu_r; @@ -899,7 +899,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { }; static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_cpu cpu_r; @@ -947,18 +947,20 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; if (e->data_type == BCH_DATA_cached) continue; - for (i = 0; i < e->nr_devs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); - + rcu_read_lock(); + for (unsigned i = 0; i < e->nr_devs; i++) { nr_online += test_bit(e->devs[i], devs.d); - nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + + struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); + nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; } + rcu_read_unlock(); if (nr_failed == e->nr_devs) continue; diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 194e55b11137..47f10ab57f40 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -266,9 +266,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, } } -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); @@ -283,7 +282,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, entry = vstruct_next(entry)) { if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type)); + bch2_prt_jset_entry_type(err, entry->type); 
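Every superblock-section validate hook in this series gains an enum bch_validate_flags argument, so a section can be checked more strictly when it is about to be written than when it is merely being read (the downgrade section below does exactly that). The converted ops share this shape - a sketch with hypothetical section and error names:

static int bch2_sb_example_validate(struct bch_sb *sb, struct bch_sb_field *f,
				    enum bch_validate_flags flags,
				    struct printbuf *err)
{
	struct bch_sb_field_example *e = field_to_type(f, example);

	if (vstruct_bytes(&e->field) < sizeof(*e)) {
		prt_printf(err, "wrong size (got %zu, should be at least %zu)",
			   vstruct_bytes(&e->field), sizeof(*e));
		return -BCH_ERR_invalid_sb_example;
	}

	/* stricter checks only for superblocks we are writing out: */
	if (flags & BCH_VALIDATE_write) {
		/* ... */
	}

	return 0;
}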
prt_str(err, " overruns end of section"); return -BCH_ERR_invalid_sb_clean; } @@ -298,10 +297,8 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); struct jset_entry *entry; - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); + prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); + prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); for (entry = clean->start; entry != vstruct_end(&clean->field); @@ -392,6 +389,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) goto out; } + bch2_journal_pos_from_member_info_set(c); + bch2_write_super(c); out: mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 7dc898761bb3..6992e7469112 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; }; -static int bch2_sb_counters_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { return 0; }; @@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (i = 0; i < nr; i++) { - if (i < BCH_COUNTER_NR) - prt_printf(out, "%s ", bch2_counter_names[i]); - else - prt_printf(out, "(unknown)"); - - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); - prt_newline(out); - } + for (unsigned i = 0; i < nr; i++) + prt_printf(out, "%s \t%llu\n", + i < BCH_COUNTER_NR ? 
bch2_counter_names[i] : "(unknown)", + le64_to_cpu(ctrs->d[i])); }; int bch2_sb_counters_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index a98ef940b7a3..390a1bbd2567 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -134,15 +134,25 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) #define for_each_downgrade_entry(_d, _i) \ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \ + (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ + (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ _i = downgrade_entry_next_c(_i)) static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - for_each_downgrade_entry(e, i) { + for (const struct bch_sb_field_downgrade_entry *i = e->entries; + (void *) i < vstruct_end(&e->field); + i = downgrade_entry_next_c(i)) { + if (flags & BCH_VALIDATE_write && + ((void *) &i->errors[0] > vstruct_end(&e->field) || + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) { + prt_printf(err, "downgrade entry overruns end of superblock section)"); + return -BCH_ERR_invalid_sb_downgrade; + } + if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", @@ -164,19 +174,16 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 16); for_each_downgrade_entry(e, i) { - prt_str(out, "version:"); - prt_tab(out); + prt_str(out, "version:\t"); bch2_version_to_text(out, le16_to_cpu(i->version)); prt_newline(out); - prt_str(out, "recovery passes:"); - prt_tab(out); + prt_str(out, "recovery passes:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); prt_newline(out); - prt_str(out, "errors:"); - prt_tab(out); + prt_str(out, "errors:\t"); bool first = true; for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { if (!first) diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 5f5bcae391fb..bda33e59e226 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) } static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned i, nr = bch2_sb_field_errors_nr_entries(e); diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 06c7a644f4a4..87324747351a 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -272,7 +272,8 @@ x(snapshot_node_missing, 264) \ x(dup_backpointer_to_bad_csum_extent, 265) \ x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) + x(sb_clean_entry_overrun, 267) \ + x(btree_ptr_v2_written_0, 268) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 44b3f0cb7b49..39196f2a4197 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -3,11 +3,22 @@ #include "bcachefs.h" #include "btree_cache.h" #include "disk_groups.h" 
+#include "error.h" #include "opts.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" +void bch2_dev_missing(struct bch_fs *c, unsigned dev) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); +} + +void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); +} + #define x(t, n, ...) [n] = #t, static const char * const bch2_iops_measurements[] = { BCH_IOPS_MEASUREMENTS() @@ -164,18 +175,14 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) return; - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); + prt_printf(out, "Device:\t%u\n", i); printbuf_indent_add(out, 2); - prt_printf(out, "Label:"); - prt_tab(out); + prt_printf(out, "Label:\t"); if (BCH_MEMBER_GROUP(&m)) { unsigned idx = BCH_MEMBER_GROUP(&m) - 1; @@ -189,103 +196,73 @@ static void member_to_text(struct printbuf *out, } prt_newline(out); - prt_printf(out, "UUID:"); - prt_tab(out); + prt_printf(out, "UUID:\t"); pr_uuid(out, m.uuid.b); prt_newline(out); - prt_printf(out, "Size:"); - prt_tab(out); + prt_printf(out, "Size:\t"); prt_units_u64(out, device_size << 9); prt_newline(out); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s errors:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - for (unsigned i = 0; i < BCH_IOPS_NR; i++) { - prt_printf(out, "%s iops:", bch2_iops_measurements[i]); - prt_tab(out); - prt_printf(out, "%u", le32_to_cpu(m.iops[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - prt_printf(out, "Bucket size:"); - prt_tab(out); + prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); - prt_newline(out); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - prt_printf(out, "Last mount:"); - prt_tab(out); + prt_printf(out, "Last mount:\t"); if (m.last_mount) bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:"); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.seq)); - prt_newline(out); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", + prt_printf(out, "State:\t%s\n", BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR ? 
bch2_member_states[BCH_MEMBER_STATE(&m)] : "unknown"); - prt_newline(out); - prt_printf(out, "Data allowed:"); - prt_tab(out); + prt_printf(out, "Data allowed:\t"); if (BCH_MEMBER_DATA_ALLOWED(&m)) prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); - prt_printf(out, "Has data:"); - prt_tab(out); + prt_printf(out, "Has data:\t"); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); - prt_str(out, "Durability:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Btree allocated bitmap blocksize:\t"); + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); prt_newline(out); - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Btree allocated bitmap:\t"); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_newline(out); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); printbuf_indent_sub(out, 2); } -static int bch2_sb_members_v1_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; @@ -333,9 +310,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, member_to_text(out, members_v2_get(mi, i), gi, sb, i); } -static int bch2_sb_members_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - @@ -390,12 +366,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); printbuf_indent_sub(out, 2); prt_str(out, "IO errors since "); @@ -404,12 +376,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); printbuf_indent_sub(out, 2); } @@ -437,11 +406,20 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool 
bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) - if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), - ptr->offset, btree_sectors(c))) - return false; - return true; + bool ret = true; + rcu_read_lock(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { + ret = false; + break; + } + } + rcu_read_unlock(); + return ret; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, @@ -463,6 +441,9 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } + BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(end > 64ULL << m->btree_bitmap_shift); + for (unsigned bit = start >> m->btree_bitmap_shift; (u64) bit << m->btree_bitmap_shift < end; bit++) @@ -476,6 +457,10 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) lockdep_assert_held(&c->sb_lock); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + continue; + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + } } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 5bf27d30ca29..dd93192ec065 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -29,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca) ca->mi.state != BCH_MEMBER_STATE_failed; } -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) { return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); @@ -105,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +static inline void bch2_dev_get(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); +#else + percpu_ref_get(&ca->ref); +#endif +} + +static inline void __bch2_dev_put(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + long r = atomic_long_dec_return(&ca->ref); + if (r < (long) !ca->dying) + panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); + ca->last_put = _THIS_IP_; + if (!r) + complete(&ca->ref_completion); +#else + percpu_ref_put(&ca->ref); +#endif +} + +static inline void bch2_dev_put(struct bch_dev *ca) { - rcu_read_lock(); if (ca) - percpu_ref_put(&ca->ref); + __bch2_dev_put(ca); +} +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +{ + rcu_read_lock(); + bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) - percpu_ref_get(&ca->ref); + bch2_dev_get(ca); rcu_read_unlock(); return ca; @@ -158,26 +172,113 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, #define for_each_readable_member(c, ca) \ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) 
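In debug builds bch_dev->ref becomes a bare atomic_long so that an underflow panics and reports the last put; release builds keep the percpu ref. The accessors that replace bch_dev_bkey_exists() pair up as lookup-takes-ref, caller-puts:

	struct bch_dev *ca = bch2_dev_tryget(c, dev_idx);
	if (!ca)	/* reports "pointer to nonexistent device" via bch2_dev_missing() */
		return -EINVAL;	/* error code is the caller's choice; -EINVAL only for this sketch */

	/* ca cannot go away until the matching put */

	bch2_dev_put(ca);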
-/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + return dev < c->sb.nr_devices && c->devs[dev]; +} - return rcu_dereference_check(c->devs[idx], 1); +static inline bool bucket_valid(const struct bch_dev *ca, u64 b) +{ + return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; } -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_check(c->devs[dev], 1); +} - return rcu_dereference_protected(c->devs[idx], +static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) +{ + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_protected(c->devs[dev], lockdep_is_held(&c->sb_lock) || lockdep_is_held(&c->state_lock)); } +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + return c && dev < c->sb.nr_devices + ? rcu_dereference(c->devs[dev]) + : NULL; +} + +static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca) + bch2_dev_get(ca); + rcu_read_unlock(); + return ca; +} + +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + if (!ca) + bch2_dev_missing(c, dev); + return ca; +} + +static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); + if (ca && !bucket_valid(ca, bucket.offset)) { + bch2_dev_put(ca); + ca = NULL; + } + return ca; +} + +void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); + +static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); + if (!ca) + bch2_dev_bucket_missing(c, bucket); + return ca; +} + +static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget_noerror(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (ca && + (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + return ca; + + if (ca) + percpu_ref_put(&ca->io_ref); + return NULL; +} + /* XXX kill, move to struct bch_fs */ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { @@ -192,16 +293,16 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct 
bch_sb_field_ops bch_sb_field_ops_members_v2; -static inline bool bch2_member_exists(struct bch_member *m) +static inline bool bch2_member_alive(struct bch_member *m) { return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } -static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) { if (dev < sb->nr_devices) { struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_exists(&m); + return bch2_member_alive(&m); } return false; } @@ -210,6 +311,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { .nbuckets = le64_to_cpu(mi->nbuckets), + .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - + le16_to_cpu(mi->first_bucket), .first_bucket = le16_to_cpu(mi->first_bucket), .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), @@ -220,7 +323,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), + .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h new file mode 100644 index 000000000000..c0eda888fe39 --- /dev/null +++ b/fs/bcachefs/sb-members_types.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H +#define _BCACHEFS_SB_MEMBERS_TYPES_H + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u64 nbuckets_minus_first; + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 freespace_initialized; + u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 544322d5c251..629900a5e641 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -49,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot_tree *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) ret = -BCH_ERR_ENOENT_snapshot_tree; @@ -223,7 +223,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_snapshot s; @@ -298,7 +298,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct snapshot_t *t; @@ -352,7 +352,7 @@ err: int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum
btree_iter_update_trigger_flags flags) { return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); } @@ -361,7 +361,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); + BTREE_ITER_with_updates, snapshot, s); } static int bch2_snapshot_live(struct btree_trans *trans, u32 id) @@ -618,7 +618,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); bch_err_fn(c, ret); @@ -695,7 +695,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, root = bch2_bkey_get_iter_typed(trans, &root_iter, BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); + BTREE_ITER_with_updates, snapshot); ret = bkey_err(root); if (ret) goto err; @@ -886,7 +886,7 @@ int bch2_check_snapshots(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot(trans, &iter, k))); bch_err_fn(c, ret); @@ -900,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - u32 tree_id; + /* 0 is an invalid tree ID */ + u32 tree_id = 0; int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); if (ret) return ret; @@ -1001,7 +1002,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) r.btree = btree; ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ get_snapshot_trees(c, &r, k.k->p); })); if (ret) @@ -1018,7 +1019,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) darray_for_each(*t, id) { if (fsck_err_on(!bch2_snapshot_equiv(c, *id), c, snapshot_node_missing, - "snapshot node %u from tree %s missing", *id, buf.buf)) { + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1090,7 +1091,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) int ret = 0; s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); + BTREE_ITER_intent, snapshot); ret = bkey_err(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); @@ -1199,7 +1200,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); + POS_MIN, BTREE_ITER_intent); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -1367,7 +1368,7 @@ static int snapshot_delete_key(struct btree_trans *trans, if (snapshot_list_has_id(deleted, k.k->p.snapshot) || snapshot_list_has_id(equiv_seen, equiv)) { return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } else { return snapshot_list_add(c, equiv_seen, equiv); } @@ -1404,15 +1405,15 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, new->k.p.snapshot = equiv; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - 
BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&new_iter) ?: bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &new_iter); if (ret) return ret; @@ -1603,12 +1604,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, move_key_to_correct_snapshot(trans, &iter, k)); @@ -1643,7 +1644,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) * nodes some depth fields will be off: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) @@ -1699,8 +1700,8 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while (1) { k = bch2_btree_iter_prev(&iter); ret = bkey_err(k); @@ -1752,7 +1753,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, pos.snapshot = leaf_id; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index b7d2fed37c4f..bd5d74269d15 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,11 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bkey_invalid_flags; +enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_tree_invalid, \ @@ -20,9 +20,10 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -77,7 +78,7 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) return 0; u32 parent = s->parent; - if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && parent && s->depth != 
snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", @@ -135,11 +136,6 @@ static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) return id; } -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); -} - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 3976f80721bf..cbad9b27874f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,16 +15,6 @@ #include <crypto/hash.h> #include <crypto/sha2.h> -typedef unsigned __bitwise bch_str_hash_flags_t; - -enum bch_str_hash_flags { - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, -}; - -#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) - static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -159,13 +149,14 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s desc.is_visible(inum, k)); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags, u32 snapshot) + enum btree_iter_update_trigger_flags flags, + u32 snapshot) { struct bkey_s_c k; int ret; @@ -173,10 +164,10 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|flags, k, ret) { + BTREE_ITER_slots|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return 0; + return k; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -186,20 +177,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; + return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); + int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); } static __always_inline int @@ -220,7 +214,7 @@ bch2_hash_hole(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) if (!is_visible_key(desc, inum, k)) return 0; bch2_trans_iter_exit(trans, iter); @@ -242,7 +236,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_advance(&iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + 
for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -264,8 +258,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags, - int update_flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; @@ -277,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -286,8 +279,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, continue; } - if (!slot.path && - !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) + if (!slot.path && !(flags & STR_HASH_must_replace)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -305,16 +297,16 @@ found: found = true; not_found: - if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (flags & STR_HASH_must_create)) { ret = -EEXIST; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, update_flags); + ret = bch2_trans_update(trans, &iter, insert, flags); } goto out; @@ -326,14 +318,14 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { insert->k.p.inode = inum.inum; u32 snapshot; return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + snapshot, insert, flags); } static __always_inline @@ -341,7 +333,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, - unsigned update_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i *delete; int ret; @@ -359,7 +351,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, update_flags); + return bch2_trans_update(trans, iter, delete, flags); } static __always_inline @@ -369,14 +361,10 @@ int bch2_hash_delete(struct btree_trans *trans, subvol_inum inum, const void *key) { struct btree_iter iter; - int ret; - - ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_INTENT); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_intent); + int ret = bkey_err(k) ?: + bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 88a79c823276..132213761ef6 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -162,7 +162,7 @@ int bch2_check_subvols(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol(trans, &iter, k))); bch_err_fn(c, ret); @@ -198,7 +198,7 @@ int bch2_check_subvol_children(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_child(trans, &iter, k))); bch_err_fn(c, ret); @@ -208,7 +208,7 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -245,9 +245,9 @@ static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bo int bch2_subvolume_trigger(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bpos children_pos_old = subvolume_children_pos(old); struct bpos children_pos_new = subvolume_children_pos(new.s_c); @@ -333,7 +333,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + BTREE_ITER_cached|BTREE_ITER_with_updates, subvolume); ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -383,9 +383,9 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d return lockrestart_do(trans, bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_CACHED, &s)) ?: + BTREE_ITER_cached, &s)) ?: for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.creation_parent))); @@ -404,7 +404,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_INTENT, + BTREE_ITER_cached|BTREE_ITER_intent, subvolume); ret 
= bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -505,7 +505,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) n = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -547,7 +547,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index d2015d549bd2..afa5e871efb2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,16 +5,17 @@ #include "darray.h" #include "subvolume_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bfdb15e7d778..f1bee6c5222d 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -76,7 +76,7 @@ const char * const bch2_sb_fields[] = { }; static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - struct printbuf *); + enum bch_validate_flags, struct printbuf *); struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) @@ -344,8 +344,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, - int rw) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + enum bch_validate_flags flags, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; @@ -401,7 +401,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_time_precision; } - if (rw == READ) { + if (!flags) { /* * Been seeing a bug where these are getting inexplicably * zeroed, so we're now validating them, but we have to be @@ -457,7 +457,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, out); + ret = bch2_sb_field_validate(sb, &mi->field, flags, out); if (ret) return ret; @@ -465,12 +465,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; - ret = bch2_sb_field_validate(sb, f, out); + ret = bch2_sb_field_validate(sb, f, flags, out); if (ret) return ret; } - if (rw == WRITE && + if ((flags & BCH_VALIDATE_write) && bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { prt_printf(out, "Invalid superblock: member seq 
%llu != sb seq %llu", le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), @@ -819,7 +819,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, &err, READ); + ret = bch2_sb_validate(sb, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -975,7 +975,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE); + ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1020,26 +1020,35 @@ int bch2_write_super(struct bch_fs *c) continue; if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock write was silently dropped! (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock modified by another process (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } } + if (ret) + goto out; + do { wrote = false; darray_for_each(online_devices, cap) { @@ -1152,7 +1161,7 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { if (vstruct_bytes(f) < 88) { prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); @@ -1167,8 +1176,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_ext *e = field_to_type(f, ext); - prt_printf(out, "Recovery passes required:"); - prt_tab(out); + prt_printf(out, "Recovery passes required:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); prt_newline(out); @@ -1177,16 +1185,14 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, if (errors_silent) { le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - prt_printf(out, "Errors to silently fix:"); - prt_tab(out); + prt_printf(out, "Errors to silently fix:\t"); prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); prt_newline(out); kfree(errors_silent); } - prt_printf(out, "Btrees with missing data:"); - prt_tab(out); + prt_printf(out, "Btrees with missing data:\t"); prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); prt_newline(out); } @@ -1213,14 +1219,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) } static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int 
ret; - ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", bch2_sb_fields[type], field_err.buf); @@ -1294,97 +1300,73 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 44); for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_dev_exists(sb, i); + nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:"); - prt_tab(out); + prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); - prt_printf(out, "Internal UUID:"); - prt_tab(out); + prt_printf(out, "Internal UUID:\t"); pr_uuid(out, sb->uuid.b); prt_newline(out); - prt_printf(out, "Magic number:"); - prt_tab(out); + prt_printf(out, "Magic number:\t"); pr_uuid(out, sb->magic.b); prt_newline(out); - prt_str(out, "Device index:"); - prt_tab(out); - prt_printf(out, "%u", sb->dev_idx); - prt_newline(out); + prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:"); - prt_tab(out); + prt_str(out, "Label:\t"); prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:"); - prt_tab(out); + prt_str(out, "Version:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:"); - prt_tab(out); + prt_str(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); - prt_printf(out, "Oldest version on disk:"); - prt_tab(out); + prt_printf(out, "Oldest version on disk:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); - prt_printf(out, "Created:"); - prt_tab(out); + prt_printf(out, "Created:\t"); if (sb->time_base_lo) bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); - prt_printf(out, "Sequence number:"); - prt_tab(out); + prt_printf(out, "Sequence number:\t"); prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); - prt_printf(out, "Time of last write:"); - prt_tab(out); + prt_printf(out, "Time of last write:\t"); bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); prt_newline(out); - prt_printf(out, "Superblock size:"); - prt_tab(out); + prt_printf(out, "Superblock size:\t"); prt_units_u64(out, vstruct_bytes(sb)); prt_str(out, "/"); prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); - prt_printf(out, "Clean:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); - prt_newline(out); - - prt_printf(out, "Devices:"); - prt_tab(out); - prt_printf(out, "%u", nr_devices); - prt_newline(out); + prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); + prt_printf(out, "Devices:\t%u\n", nr_devices); - prt_printf(out, "Sections:"); + prt_printf(out, "Sections:\t"); vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); - prt_tab(out); prt_bitflags(out, bch2_sb_fields, fields_have); prt_newline(out); - prt_printf(out, "Features:"); - prt_tab(out); + prt_printf(out, "Features:\t"); prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); prt_newline(out); - prt_printf(out, "Compat features:"); - prt_tab(out); + prt_printf(out, "Compat features:\t"); prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); prt_newline(out); @@ -1401,8 +1383,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, id); 
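/*
 * Note on the prt_printf() conversions in this and the surrounding hunks:
 * printbuf format strings now interpret "\t" as prt_tab() (advance to the
 * next tabstop), "\r" as prt_tab_rjust() (right-justify the preceding
 * field against the next tabstop) and "\n" as prt_newline(), which is what
 * lets the old call sequences collapse into single format strings.
 * Illustrative equivalence, assuming a printbuf with a tabstop pushed:
 *
 *	prt_printf(out, "Sequence number:");
 *	prt_tab(out);
 *
 * becomes
 *
 *	prt_printf(out, "Sequence number:\t");
 */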
- prt_printf(out, "%s:", opt->attr.name); - prt_tab(out); + prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 95e80e06316b..fadd364e2802 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -51,7 +51,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, + enum bch_validate_flags, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index dddf57ec4511..294a9d35a9f2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -264,7 +264,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); - bch2_gc_thread_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", @@ -285,7 +284,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); - if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); @@ -467,7 +466,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ - set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + set_bit(JOURNAL_need_flush_write, &c->journal.flags); + set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); @@ -485,12 +485,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) } #endif - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; @@ -537,9 +531,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < BCH_TIME_STAT_NR; i++) + for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_find_btree_nodes_exit(&c->found_btree_nodes); @@ -572,6 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); + EBUG_ON(percpu_u64_get(c->online_reserved)); free_percpu(c->online_reserved); darray_exit(&c->btree_roots_extra); @@ -616,8 +609,6 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_stopping, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -665,6 +656,7 @@ void bch2_fs_free(struct bch_fs *c) struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { + EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -719,7 +711,7 @@ static int bch2_fs_online(struct bch_fs *c) ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); - percpu_ref_put(&ca->ref); + 
bch2_dev_put(ca); goto err; } } @@ -778,6 +770,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); @@ -800,16 +793,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); - seqcount_init(&c->gc_pos_lock); - seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); @@ -940,7 +928,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, i) && + if (bch2_member_exists(c->disk_sb.sb, i) && bch2_dev_alloc(c, i)) { ret = -EEXIST; goto err; @@ -1101,7 +1089,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) + if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) @@ -1200,11 +1188,11 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); - bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); @@ -1212,7 +1200,9 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); +#endif kobject_put(&ca->kobj); } @@ -1239,12 +1229,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) bch2_dev_journal_exit(ca); } +#ifndef CONFIG_BCACHEFS_DEBUG static void bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } +#endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { @@ -1313,14 +1305,17 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, - 0, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +#ifndef CONFIG_BCACHEFS_DEBUG + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) + goto err; +#else + atomic_long_set(&ca->ref, 1); +#endif + + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || - bioset_init(&ca->replica_set, 4, - offsetof(struct bch_write_bio, bio), 0) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; @@ -1411,10 +1406,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); + BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - ca = bch_dev_locked(c, sb->sb->dev_idx); + ca = bch2_dev_locked(c, 
sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) @@ -1506,10 +1500,10 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, i)) + if (!bch2_member_exists(c->disk_sb.sb, i)) continue; - ca = bch_dev_locked(c, i); + ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || @@ -1599,17 +1593,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * with bch2_do_invalidates() and bch2_do_discards() */ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_NORUN, NULL); + BTREE_TRIGGER_norun, NULL); bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1626,7 +1620,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1678,7 +1672,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); +#else + ca->dying = true; + bch2_dev_put(ca); +#endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); @@ -1777,9 +1776,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) - goto have_slot; + if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) { + dev_idx = c->sb.nr_devices; + goto have_slot; + } + + int best = -1; + u64 best_last_mount = 0; + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } no_slot: ret = -BCH_ERR_ENOSPC_sb_members; bch_err_msg(c, ret, "setting up new superblock"); @@ -1821,7 +1839,7 @@ have_slot: bch2_dev_usage_journal_reserve(c); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; @@ -1884,9 +1902,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (ret) goto err; - ca = bch_dev_locked(c, dev_idx); + ca = bch2_dev_locked(c, dev_idx); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from 
bch2_trans_mark_dev_sb", path); if (ret) goto err; @@ -1979,7 +1997,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (ret) goto err; - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 11bcef170c2c..368a63d938cf 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -26,19 +26,4 @@ struct bch_devs_list { u8 data[BCH_BKEY_PTRS_MAX]; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5be92fe3f4ea..93ca74d108b1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -140,9 +140,8 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(trigger_journal_flush); -write_attribute(prune_cache); -write_attribute(btree_wakeup); -rw_attribute(btree_gc_periodic); +write_attribute(trigger_btree_cache_shrink); +write_attribute(trigger_btree_key_cache_shrink); rw_attribute(gc_gens_pos); read_attribute(uuid); @@ -189,12 +188,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) { bch2_printbuf_tabstop_push(out, 24); - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { - prt_str(out, bch2_write_refs[i]); - prt_tab(out); - prt_printf(out, "%li", atomic_long_read(&c->writes[i])); - prt_newline(out); - } + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) + prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); } #endif @@ -278,7 +273,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c continue; ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; @@ -313,22 +308,11 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_str(out, "type"); printbuf_tabstop_push(out, 12); - prt_tab(out); - - prt_str(out, "compressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "uncompressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "average extent size"); printbuf_tabstop_push(out, 24); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { bch2_prt_compression_type(out, i); @@ -362,21 +346,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } -static void bch2_btree_wakeup_all(struct bch_fs *c) -{ - struct btree_trans *trans; - - seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); - - if (b) - six_lock_wakeup_all(&b->lock); - - } - seqmutex_unlock(&c->btree_trans_lock); -} - SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -392,8 +361,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_write_stats) 
bch2_btree_write_stats_to_text(out, c); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); @@ -416,7 +383,7 @@ SHOW(bch2_fs) bch2_journal_debug_to_text(out, &c->journal); if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, c); + bch2_btree_cache_to_text(out, &c->btree_cache); if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -459,6 +426,9 @@ SHOW(bch2_fs) if (attr == &sysfs_disk_groups) bch2_disk_groups_to_text(out, c); + if (attr == &sysfs_alloc_debug) + bch2_fs_alloc_debug_to_text(out, c); + return 0; } @@ -466,14 +436,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_btree_gc_periodic) { - ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) - ?: (ssize_t) size; - - wake_up_process(c->gc_thread); - return ret; - } - if (attr == &sysfs_copy_gc_enabled) { ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; @@ -505,7 +467,7 @@ STORE(bch2_fs) if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; - if (attr == &sysfs_prune_cache) { + if (attr == &sysfs_trigger_btree_cache_shrink) { struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; @@ -513,22 +475,17 @@ STORE(bch2_fs) c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } - if (attr == &sysfs_btree_wakeup) - bch2_btree_wakeup_all(c); - - if (attr == &sysfs_trigger_gc) { - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - down_read(&c->state_lock); - bch2_gc(c, false, false); - up_read(&c->state_lock); -#else - bch2_gc_gens(c); -#endif + if (attr == &sysfs_trigger_btree_key_cache_shrink) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } + if (attr == &sysfs_trigger_gc) + bch2_gc_gens(c); + if (attr == &sysfs_trigger_discards) bch2_do_discards(c); @@ -594,13 +551,11 @@ SHOW(bch2_fs_counters) if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - prt_printf(out, "since mount:"); \ - prt_tab(out); \ + prt_printf(out, "since mount:\t"); \ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ - prt_printf(out, "since filesystem creation:"); \ - prt_tab(out); \ + prt_printf(out, "since filesystem creation:\t"); \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -660,8 +615,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_trigger_journal_flush, - &sysfs_prune_cache, - &sysfs_btree_wakeup, + &sysfs_trigger_btree_cache_shrink, + &sysfs_trigger_btree_key_cache_shrink, &sysfs_gc_gens_pos, @@ -677,6 +632,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_internal_uuid, &sysfs_disk_groups, + &sysfs_alloc_debug, NULL }; @@ -792,88 +748,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - unsigned i, nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstop_push(out, 8); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); -
printbuf_tabstop_push(out, 16); - - bch2_dev_usage_to_text(out, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:"); - prt_newline(out); - for (i = 0; i < BCH_WATERMARK_NR; i++) { - prt_str(out, bch2_watermarks[i]); - prt_tab(out); - prt_u64(out, bch2_dev_buckets_reserved(ca, i)); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_str(out, "freelist_wait"); - prt_tab(out); - prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open buckets allocated"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_newline(out); - - prt_str(out, "open buckets this dev"); - prt_tab(out); - prt_u64(out, ca->nr_open_buckets); - prt_newline(out); - - prt_str(out, "open buckets total"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT); - prt_newline(out); - - prt_str(out, "open_buckets_wait"); - prt_tab(out); - prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open_buckets_btree"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_btree]); - prt_newline(out); - - prt_str(out, "open_buckets_user"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_user]); - prt_newline(out); - - prt_str(out, "buckets_to_invalidate"); - prt_tab(out); - prt_u64(out, should_invalidate_buckets(ca, stats)); - prt_newline(out); - - prt_str(out, "btree reserve cache"); - prt_tab(out); - prt_u64(out, c->btree_reserve_cache_nr); - prt_newline(out); -} - static const char * const bch2_rw[] = { "read", "write", @@ -943,7 +817,7 @@ SHOW(bch2_dev) * 100 / CONGESTED_MAX); if (attr == &sysfs_alloc_debug) - dev_alloc_debug_to_text(out, ca); + bch2_dev_alloc_debug_to_text(out, ca); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bfec656f94c0..68104b2056d9 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -261,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i >= nr * 2) break; @@ -322,7 +322,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i == nr) break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -452,7 +452,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); return ret; } @@ -671,7 +671,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - 
BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) @@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + BTREE_ITER_slots|BTREE_ITER_intent, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; @@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6aa81d1e6d36..362e1fc7ef6a 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -638,99 +638,14 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ -DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err), - - TP_STRUCT__entry( - __field(u8, dev ) - __array(char, reserve, 16 ) - __field(u64, bucket ) - __field(u64, free ) - __field(u64, avail ) - __field(u64, copygc_wait_amount ) - __field(s64, copygc_waiting_for ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, nouse ) - __field(bool, nonblocking ) - __field(u64, nocow ) - __array(char, err, 32 ) - ), - - TP_fast_assign( - __entry->dev = ca->dev_idx; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->bucket = bucket; - __entry->free = free; - __entry->avail = avail; - __entry->copygc_wait_amount = copygc_wait_amount; - __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = s->buckets_seen; - __entry->open = s->skipped_open; - __entry->need_journal_commit = s->skipped_need_journal_commit; - __entry->nouse = s->skipped_nouse; - __entry->nonblocking = nonblocking; - __entry->nocow = s->skipped_nocow; - strscpy(__entry->err, err, sizeof(__entry->err)); - ), - - TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - __entry->reserve, - __entry->dev, - __entry->bucket, - __entry->free, - __entry->avail, - __entry->copygc_wait_amount, - __entry->copygc_waiting_for, - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->nouse, - __entry->nocow, - __entry->nonblocking, - __entry->err) +DEFINE_EVENT(fs_str, bucket_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) -); - -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, 
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 92c6ad75e702..de331dec2a99 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -348,15 +348,12 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
 	const struct time_unit *u = bch2_pick_time_units(ns);
 
-	prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
-	prt_tab_rjust(out);
-	prt_printf(out, "%s", u->name);
+	prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
 }
 
 static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
 {
-	prt_str(out, name);
-	prt_tab(out);
+	prt_printf(out, "%s\t", name);
 	bch2_pr_time_units_aligned(out, ns);
 	prt_newline(out);
 }
@@ -389,12 +386,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	}
 
 	printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
-	prt_printf(out, "count:");
-	prt_tab(out);
-	prt_printf(out, "%llu ",
-		 stats->duration_stats.n);
+	prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
 	printbuf_tabstop_pop(out);
-	prt_newline(out);
 
 	printbuf_tabstops_reset(out);
 
@@ -403,13 +396,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	printbuf_tabstop_push(out, 0);
 	printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
 
-	prt_tab(out);
-	prt_printf(out, "since mount");
-	prt_tab_rjust(out);
-	prt_tab(out);
+	prt_printf(out, "\tsince mount\r\trecent\r\n");
 	prt_printf(out, "recent");
-	prt_tab_rjust(out);
-	prt_newline(out);
 
 	printbuf_tabstops_reset(out);
 	printbuf_tabstop_push(out, out->indent + 20);
@@ -417,23 +405,20 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	printbuf_tabstop_push(out, 2);
 	printbuf_tabstop_push(out, TABSTOP_SIZE);
 
-	prt_printf(out, "duration of events");
-	prt_newline(out);
+	prt_printf(out, "duration of events\n");
 	printbuf_indent_add(out, 2);
 
 	pr_name_and_units(out, "min:", stats->min_duration);
 	pr_name_and_units(out, "max:", stats->max_duration);
 	pr_name_and_units(out, "total:", stats->total_duration);
 
-	prt_printf(out, "mean:");
-	prt_tab(out);
+	prt_printf(out, "mean:\t");
 	bch2_pr_time_units_aligned(out, d_mean);
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
-	prt_printf(out, "stddev:");
-	prt_tab(out);
+	prt_printf(out, "stddev:\t");
 	bch2_pr_time_units_aligned(out, d_stddev);
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -441,22 +426,19 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	printbuf_indent_sub(out, 2);
 	prt_newline(out);
 
-	prt_printf(out, "time between events");
-	prt_newline(out);
+	prt_printf(out, "time between events\n");
 	printbuf_indent_add(out, 2);
 
 	pr_name_and_units(out, "min:", stats->min_freq);
 	pr_name_and_units(out, "max:", stats->max_freq);
 
-	prt_printf(out, "mean:");
-	prt_tab(out);
+	prt_printf(out, "mean:\t");
 	bch2_pr_time_units_aligned(out, f_mean);
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
-	prt_printf(out, "stddev:");
-	prt_tab(out);
+	prt_printf(out, "stddev:\t");
 	bch2_pr_time_units_aligned(out, f_stddev);
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
@@ -589,40 +571,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro
 	if (!out->nr_tabstops)
 		printbuf_tabstop_push(out, 20);
 
-	prt_printf(out, "rate:");
-	prt_tab(out);
+	prt_printf(out, "rate:\t");
 	prt_human_readable_s64(out, pd->rate.rate);
 	prt_newline(out);
 
-	prt_printf(out, "target:");
-	prt_tab(out);
+	prt_printf(out, "target:\t");
 	prt_human_readable_u64(out, pd->last_target);
 	prt_newline(out);
 
-	prt_printf(out, "actual:");
-	prt_tab(out);
+	prt_printf(out, "actual:\t");
 	prt_human_readable_u64(out, pd->last_actual);
 	prt_newline(out);
 
-	prt_printf(out, "proportional:");
-	prt_tab(out);
+	prt_printf(out, "proportional:\t");
 	prt_human_readable_s64(out, pd->last_proportional);
 	prt_newline(out);
 
-	prt_printf(out, "derivative:");
-	prt_tab(out);
+	prt_printf(out, "derivative:\t");
 	prt_human_readable_s64(out, pd->last_derivative);
 	prt_newline(out);
 
-	prt_printf(out, "change:");
-	prt_tab(out);
+	prt_printf(out, "change:\t");
 	prt_human_readable_s64(out, pd->last_change);
 	prt_newline(out);
 
-	prt_printf(out, "next io:");
-	prt_tab(out);
-	prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
-	prt_newline(out);
+	prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
 }
 
 /* misc: */
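
All of these conversions lean on prt_printf() now interpreting control characters itself: judging by the one-for-one replacements above, "\t" advances to the next tabstop (the old prt_tab()), "\r" right-justifies the text printed since the last tabstop (the old prt_tab_rjust()), and "\n" is prt_newline(). Under that convention, the reserves table removed from sysfs.c earlier in this diff would shrink to something like the following (illustrative, not a hunk from this series):

	printbuf_tabstop_push(out, 12);
	printbuf_tabstop_push(out, 16);
	for (i = 0; i < BCH_WATERMARK_NR; i++)
		prt_printf(out, "%s\t%llu\r\n",		/* label, right-justified count */
			   bch2_watermarks[i],
			   bch2_dev_buckets_reserved(ca, i));
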
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 754f17bba68e..c11bf6dacc2c 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
 };
 
 int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
-		       enum bkey_invalid_flags flags,
+		       enum bch_validate_flags flags,
 		       struct printbuf *err)
 {
 	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
@@ -118,11 +118,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
 	else
 		prt_printf(out, "(unknown type %u)", xattr.v->x_type);
 
+	unsigned name_len = xattr.v->x_name_len;
+	unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+	unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+		offsetof(struct bch_xattr, x_name);
+
+	val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+	name_len = min(name_len, max_name_val_bytes);
+
 	prt_printf(out, "%.*s:%.*s",
-		   xattr.v->x_name_len,
-		   xattr.v->x_name,
-		   le16_to_cpu(xattr.v->x_val_len),
-		   (char *) xattr_val(xattr.v));
+		   name_len, xattr.v->x_name,
+		   val_len, (char *) xattr_val(xattr.v));
 
 	if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
 	    xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
@@ -138,21 +144,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
 	struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
 	struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
 	struct btree_iter iter;
-	struct bkey_s_c_xattr xattr;
-	struct bkey_s_c k;
-	int ret;
-
-	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
-			       inode_inum(inode), &search, 0);
+	struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+					     inode_inum(inode), &search, 0);
+	int ret = bkey_err(k);
 	if (ret)
-		goto err1;
-
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto err2;
+		return ret;
 
-	xattr = bkey_s_c_to_xattr(k);
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
 	ret = le16_to_cpu(xattr.v->x_val_len);
 	if (buffer) {
 		if (ret > size)
@@ -160,10 +158,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
 		else
 			memcpy(buffer, xattr_val(xattr.v), ret);
 	}
-err2:
 	bch2_trans_iter_exit(trans, &iter);
-err1:
-	return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+	return ret;
 }
 
 int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
@@ -177,7 +173,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
 	int ret;
 
 	ret   = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
-		bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+		bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
 	if (ret)
 		return ret;
 
@@ -212,8 +208,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
 		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
 				    inum, &xattr->k_i,
-				    (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
-				    (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+				    (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
+				    (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
 	} else {
 		struct xattr_search_key search =
 			X_SEARCH(type, name, strlen(name));
@@ -359,6 +355,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
 	int ret = bch2_trans_do(c, NULL, NULL, 0,
 		bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
 
+	if (ret < 0 && bch2_err_matches(ret, ENOENT))
+		ret = -ENODATA;
+
 	return bch2_err_class(ret);
 }
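
Two behavioural notes on the xattr changes. First, the new clamping in bch2_xattr_to_text() keeps both "%.*s" prints inside the key's actual value area, so a corrupt x_name_len or x_val_len can no longer walk off the end of the bkey; pretty-printers can run on not-yet-validated keys, so they have to be defensive. A worked example with hypothetical numbers:

	/* 32 bytes of bkey value, 8 of them before x_name: */
	unsigned max_name_val_bytes = 32 - 8;		/* 24 usable bytes */
	unsigned name_len = 4;
	unsigned val_len = min_t(int, 1000, 24 - 4);	/* corrupt 1000 clamps to 20 */

Second, the ENOENT-to-ENODATA translation moves out of bch2_xattr_get_trans() and into bch2_xattr_get_handler(), so the transaction helper now returns raw btree errors and only the VFS-facing path speaks xattr semantics.
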
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 1337f31a5c49..1574b9eb4c85 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -7,7 +7,7 @@
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
 
 int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
-		       enum bkey_invalid_flags, struct printbuf *);
+		       enum bch_validate_flags, struct printbuf *);
 void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_xattr ((struct bkey_ops) {	\
diff --git a/include/linux/closure.h b/include/linux/closure.h
index c554c6a08768..99155df162d0 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -194,6 +194,18 @@ static inline void closure_sync(struct closure *cl)
 		__closure_sync(cl);
 }
 
+int __closure_sync_timeout(struct closure *cl, unsigned long timeout);
+
+static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+	return cl->closure_get_happened
+		? __closure_sync_timeout(cl, timeout)
+		: 0;
+}
+
 #ifdef CONFIG_DEBUG_CLOSURES
 
 void closure_debug_create(struct closure *cl);
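
closure_sync_timeout() is closure_sync() with a deadline: it waits for the closure's outstanding references to drain, but gives up after the given number of jiffies and returns -ETIME (and, like closure_sync(), short-circuits to 0 when no closure_get() ever happened). A hypothetical caller, not from this patch; note that on timeout the closure still has outstanding references, so it must stay valid until the work actually completes:

	struct closure cl;

	closure_init_stack(&cl);
	/* submit async work that takes a ref with closure_get(&cl)
	 * and drops it with closure_put(&cl) when done */
	if (closure_sync_timeout(&cl, HZ) == -ETIME) {
		/* one second elapsed and the work is still outstanding */
	}
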
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 1b40a968ba91..bb575f3ab45e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -37,6 +37,7 @@
 #define HOSTFS_SUPER_MAGIC	0x00c0ffee
 #define OVERLAYFS_SUPER_MAGIC	0x794c7630
 #define FUSE_SUPER_MAGIC	0x65735546
+#define BCACHEFS_SUPER_MAGIC	0xca451a4e
 
 #define MINIX_SUPER_MAGIC	0x137F		/* minix v1 fs, 14 char names */
 #define MINIX_SUPER_MAGIC2	0x138F		/* minix v1 fs, 30 char names */
diff --git a/lib/closure.c b/lib/closure.c
index c16540552d61..07409e9e35a5 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -139,6 +139,43 @@ void __sched __closure_sync(struct closure *cl)
 }
 EXPORT_SYMBOL(__closure_sync);
 
+int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+	struct closure_syncer s = { .task = current };
+	int ret = 0;
+
+	cl->s = &s;
+	continue_at(cl, closure_sync_fn, NULL);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (s.done)
+			break;
+		if (!timeout) {
+			/*
+			 * Carefully undo the continue_at() - but only if it
+			 * hasn't completed, i.e. the final closure_put() hasn't
+			 * happened yet:
+			 */
+			unsigned old, new, v = atomic_read(&cl->remaining);
+			do {
+				old = v;
+				if (!old || (old & CLOSURE_RUNNING))
+					goto success;
+
+				new = old + CLOSURE_REMAINING_INITIALIZER;
+			} while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
+			ret = -ETIME;
+		}
+
+		timeout = schedule_timeout(timeout);
+	}
+success:
+	__set_current_state(TASK_RUNNING);
+	return ret;
+}
+EXPORT_SYMBOL(__closure_sync_timeout);
+
 #ifdef CONFIG_DEBUG_CLOSURES
 static LIST_HEAD(closure_list);
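
With BCACHEFS_SUPER_MAGIC exported through the UAPI header, userspace can identify a bcachefs mount from statfs(2). A minimal sketch:

	#include <sys/vfs.h>
	#include <linux/magic.h>

	/* returns 1 if path is on bcachefs, 0 if not, -1 on error */
	static int is_bcachefs(const char *path)
	{
		struct statfs st;

		if (statfs(path, &st) != 0)
			return -1;
		return st.f_type == BCACHEFS_SUPER_MAGIC;
	}
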