-rw-r--r--  Documentation/filesystems/bcachefs/CodingStyle.rst  2
-rw-r--r--  fs/bcachefs/Kconfig  7
-rw-r--r--  fs/bcachefs/Makefile  1
-rw-r--r--  fs/bcachefs/acl.c  2
-rw-r--r--  fs/bcachefs/alloc_background.c  45
-rw-r--r--  fs/bcachefs/alloc_background.h  3
-rw-r--r--  fs/bcachefs/alloc_foreground.c  59
-rw-r--r--  fs/bcachefs/alloc_foreground.h  5
-rw-r--r--  fs/bcachefs/backpointers.c  106
-rw-r--r--  fs/bcachefs/backpointers.h  23
-rw-r--r--  fs/bcachefs/bcachefs.h  14
-rw-r--r--  fs/bcachefs/bcachefs_format.h  2
-rw-r--r--  fs/bcachefs/bset.c  182
-rw-r--r--  fs/bcachefs/bset.h  4
-rw-r--r--  fs/bcachefs/btree_cache.c  273
-rw-r--r--  fs/bcachefs/btree_cache.h  3
-rw-r--r--  fs/bcachefs/btree_gc.c  21
-rw-r--r--  fs/bcachefs/btree_io.c  8
-rw-r--r--  fs/bcachefs/btree_io.h  4
-rw-r--r--  fs/bcachefs/btree_iter.c  63
-rw-r--r--  fs/bcachefs/btree_iter.h  52
-rw-r--r--  fs/bcachefs/btree_key_cache.c  405
-rw-r--r--  fs/bcachefs/btree_key_cache_types.h  18
-rw-r--r--  fs/bcachefs/btree_locking.h  13
-rw-r--r--  fs/bcachefs/btree_trans_commit.c  2
-rw-r--r--  fs/bcachefs/btree_types.h  60
-rw-r--r--  fs/bcachefs/btree_update.c  12
-rw-r--r--  fs/bcachefs/btree_update_interior.c  37
-rw-r--r--  fs/bcachefs/btree_update_interior.h  2
-rw-r--r--  fs/bcachefs/buckets.c  35
-rw-r--r--  fs/bcachefs/buckets.h  15
-rw-r--r--  fs/bcachefs/buckets_types.h  8
-rw-r--r--  fs/bcachefs/checksum.c  101
-rw-r--r--  fs/bcachefs/clock.h  9
-rw-r--r--  fs/bcachefs/darray.c  4
-rw-r--r--  fs/bcachefs/darray.h  26
-rw-r--r--  fs/bcachefs/data_update.c  2
-rw-r--r--  fs/bcachefs/dirent.c  66
-rw-r--r--  fs/bcachefs/ec.c  303
-rw-r--r--  fs/bcachefs/ec.h  11
-rw-r--r--  fs/bcachefs/ec_format.h  9
-rw-r--r--  fs/bcachefs/ec_types.h  1
-rw-r--r--  fs/bcachefs/errcode.h  14
-rw-r--r--  fs/bcachefs/extents.c  33
-rw-r--r--  fs/bcachefs/extents.h  24
-rw-r--r--  fs/bcachefs/fs-common.c  5
-rw-r--r--  fs/bcachefs/fs-io-buffered.c  41
-rw-r--r--  fs/bcachefs/fs-io-direct.c  2
-rw-r--r--  fs/bcachefs/fs-io-pagecache.c  90
-rw-r--r--  fs/bcachefs/fs-io-pagecache.h  4
-rw-r--r--  fs/bcachefs/fs-io.c  178
-rw-r--r--  fs/bcachefs/fs-ioctl.c  4
-rw-r--r--  fs/bcachefs/fs.c  427
-rw-r--r--  fs/bcachefs/fs.h  18
-rw-r--r--  fs/bcachefs/inode.c  2
-rw-r--r--  fs/bcachefs/io_read.c  18
-rw-r--r--  fs/bcachefs/io_write.c  7
-rw-r--r--  fs/bcachefs/journal_io.c  6
-rw-r--r--  fs/bcachefs/journal_reclaim.c  7
-rw-r--r--  fs/bcachefs/opts.c  85
-rw-r--r--  fs/bcachefs/opts.h  61
-rw-r--r--  fs/bcachefs/rcu_pending.c  650
-rw-r--r--  fs/bcachefs/rcu_pending.h  27
-rw-r--r--  fs/bcachefs/rebalance.c  3
-rw-r--r--  fs/bcachefs/recovery.c  22
-rw-r--r--  fs/bcachefs/recovery_passes.c  10
-rw-r--r--  fs/bcachefs/replicas.c  10
-rw-r--r--  fs/bcachefs/replicas_format.h  9
-rw-r--r--  fs/bcachefs/sb-clean.c  2
-rw-r--r--  fs/bcachefs/sb-members.c  57
-rw-r--r--  fs/bcachefs/sb-members.h  22
-rw-r--r--  fs/bcachefs/str_hash.h  2
-rw-r--r--  fs/bcachefs/subvolume.h  45
-rw-r--r--  fs/bcachefs/subvolume_types.h  3
-rw-r--r--  fs/bcachefs/super-io.c  12
-rw-r--r--  fs/bcachefs/super.c  85
-rw-r--r--  fs/bcachefs/sysfs.c  55
-rw-r--r--  fs/bcachefs/thread_with_file.c  2
-rw-r--r--  fs/bcachefs/time_stats.c  14
-rw-r--r--  fs/bcachefs/time_stats.h  3
-rw-r--r--  fs/bcachefs/trace.h  465
-rw-r--r--  fs/bcachefs/util.c  16
-rw-r--r--  fs/bcachefs/util.h  2
-rw-r--r--  fs/bcachefs/xattr.c  81
-rw-r--r--  fs/bcachefs/xattr_format.h  2
-rw-r--r--  fs/inode.c  8
-rw-r--r--  include/linux/fs.h  9
-rw-r--r--  include/linux/generic-radix-tree.h  105
-rw-r--r--  lib/generic-radix-tree.c  80
89 files changed, 3155 insertions(+), 1690 deletions(-)
diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst
index 0c45829a4899..01de555e21d8 100644
--- a/Documentation/filesystems/bcachefs/CodingStyle.rst
+++ b/Documentation/filesystems/bcachefs/CodingStyle.rst
@@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your
time is being wasted because your tools are bad or too slow - don't accept it,
fix it.
-Put effort into your documentation, commmit messages, and code comments - but
+Put effort into your documentation, commit messages, and code comments - but
don't go overboard. A good commit message is wonderful - but if the information
was important enough to go in a commit message, ask yourself if it would be
even better as a code comment.
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5cdfef3b551a..5bac803ea367 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
is held by another thread, spin for a short while, as long as the
thread owning the lock is running.
+config BCACHEFS_PATH_TRACEPOINTS
+ bool "Extra btree_path tracepoints"
+ depends on BCACHEFS_FS
+ help
+ Enable extra tracepoints for debugging btree_path operations; we don't
+ normally want these enabled because they happen at very high rates.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
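
The new BCACHEFS_PATH_TRACEPOINTS option above gates tracepoints that fire on every btree_path operation. A minimal sketch of the usual pattern for such config-gated events (the trace_btree_path_get() declaration here is hypothetical, not the commit's exact trace.h wiring): when the option is off, call sites compile to empty inline stubs, so the hot paths pay nothing.

#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
void trace_btree_path_get(struct btree_trans *, struct btree_path *);
#else
/* compiled out: the call vanishes entirely at -O2 */
static inline void trace_btree_path_get(struct btree_trans *trans,
					struct btree_path *path) {}
#endif
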
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 0ab533a2b03b..56d20e219f59 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -69,6 +69,7 @@ bcachefs-y := \
printbuf.o \
quota.o \
rebalance.o \
+ rcu_pending.o \
recovery.o \
recovery_passes.o \
reflink.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 331a17f3f113..87f1be9d4db4 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -361,7 +361,7 @@ retry:
bch2_trans_begin(trans);
acl = _acl;
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_intent);
if (ret)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index dc3a4024aab6..645b5ed4babb 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
+#include <linux/jiffies.h>
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
@@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
- if (last_updated + HZ * 10 < jiffies) {
+ if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;
@@ -2297,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
return 0;
}
+/* device removal */
+
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
+ bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
+ bch_err_msg(ca, ret, "removing dev alloc info");
+ return ret;
+}
+
/* Bucket IO clocks: */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -2432,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ c->rw_devs_change_count++;
+
/*
* Capacity is calculated based off of devices in allocation groups:
*/
@@ -2467,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
}
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
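
bch2_dev_remove_alloc() above chains seven operations with GNU C's binary "?:" operator, so each step runs only if everything before it returned 0. A standalone sketch of the idiom (the step names are illustrative, not bcachefs functions):

#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return -5; }
static int step_c(void) { printf("never runs\n"); return 0; }

int main(void)
{
	/*
	 * "x ?: y" is "x ? x : y" with x evaluated once: the first
	 * nonzero error code short-circuits the rest of the chain.
	 */
	int ret = step_a() ?: step_b() ?: step_c();

	printf("ret = %d\n", ret);	/* prints ret = -5 */
	return 0;
}
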
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index fd790b03fbe1..f8e87c6721b1 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -16,7 +16,7 @@ enum bch_validate_flags;
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, pos.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
bool ret = ca && bucket_valid(ca, pos.offset);
rcu_read_unlock();
return ret;
@@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
+int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 8563c2d26847..d0e0b56892e3 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
enum bch_watermark watermark,
enum bch_data_type data_type,
struct closure *cl,
+ bool nowait,
struct bch_dev_usage *usage)
{
struct bch_fs *c = trans->c;
@@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bucket_alloc_state s = {
.btree_bitmap = data_type == BCH_DATA_btree,
};
- bool waiting = false;
+ bool waiting = nowait;
again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
@@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, &usage)));
+ data_type, cl, false, &usage)));
return ob;
}
@@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
@@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
+ enum bch_write_flags flags,
enum bch_data_type data_type,
enum bch_watermark watermark,
struct closure *cl)
@@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob)) {
+ have_cache, ob)) {
ret = 0;
break;
}
@@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
@@ -883,7 +884,7 @@ got_bucket:
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
out_put_head:
bch2_ec_stripe_head_put(c, h);
return ret;
@@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- bool ec, unsigned flags)
+ bool ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
have_cache, ec, ob))
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
else
ob_push(c, &ptrs_skip, ob);
}
@@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache, bool ec,
- enum bch_watermark watermark,
- unsigned flags)
+ enum bch_watermark watermark)
{
int i, ret = 0;
@@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
if (ret)
break;
}
@@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
@@ -1022,18 +1022,15 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
- if (erasure_code && ec_open_bucket(c, ptrs))
- return 0;
-
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, flags);
+ have_cache, erasure_code);
if (ret)
return ret;
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, watermark, flags);
+ have_cache, erasure_code, watermark);
if (ret)
return ret;
@@ -1074,12 +1071,12 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
int ret;
- if (erasure_code) {
+ if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
ret = __open_bucket_add_buckets(trans, ptrs, wp,
devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,
@@ -1376,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl,
struct write_point **wp_ret)
{
@@ -1392,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
-
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@@ -1498,11 +1493,12 @@ err:
try_decrease_writepoints(trans, write_points_nr))
goto retry;
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- return cl
- ? -BCH_ERR_bucket_alloc_blocked
- : -BCH_ERR_ENOSPC_bucket_alloc;
+ ret = -BCH_ERR_bucket_alloc_blocked;
return ret;
}
@@ -1733,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
bch2_dev_usage_to_text(out, ca, &stats);
prt_newline(out);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 386d231ceca3..1a16fd5bd4f8 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
return ret;
}
+enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, unsigned,
+ unsigned, unsigned *, bool *, enum bch_write_flags,
enum bch_data_type, enum bch_watermark,
struct closure *);
@@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct bch_devs_list *,
unsigned, unsigned,
enum bch_watermark,
- unsigned,
+ enum bch_write_flags,
struct closure *,
struct write_point **);
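
The prototypes above switch from a bare "unsigned" to "enum bch_write_flags", with a forward enum declaration so this header needn't pull in the write path's definitions. A small sketch of the general idiom, with hypothetical names:

enum example_write_flags {
	EX_WRITE_ALLOC_NOWAIT	= 1 << 0,
	EX_WRITE_CACHED		= 1 << 1,
};

/*
 * Typed flags cost nothing at runtime, but a caller passing an
 * unrelated integer here now stands out in review and to some
 * static checkers.
 */
static inline bool example_alloc_would_block(enum example_write_flags flags)
{
	return !(flags & EX_WRITE_ALLOC_NOWAIT);
}
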
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index d4da6343efa9..e11989a57ca0 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -9,6 +9,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
+#include "disk_accounting.h"
#include "error.h"
#include <linux/mm.h>
@@ -53,7 +54,7 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
if (!ca) {
/* these will be caught by fsck */
rcu_read_unlock();
@@ -87,7 +88,7 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
if (ca) {
struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
rcu_read_unlock();
@@ -671,7 +672,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
continue;
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
if (ca)
bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
rcu_read_unlock();
@@ -750,10 +751,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
+ bch2_btree_cache_unpin(c);
+
btree_interior_mask |= btree_leaf_mask;
- c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
- c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@@ -775,6 +778,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
+ bch2_node_pin(c, b);
0;
}));
}
@@ -782,12 +786,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+static void progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
+ struct progress_indicator_state progress;
int ret = 0;
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
@@ -805,6 +877,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -865,8 +938,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@@ -920,19 +992,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
+ struct bch_fs *c = trans->c;
struct bkey_buf last_flushed;
+ struct progress_indicator_state progress;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed));
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
@@ -977,8 +1054,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
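
The progress indicator added above sizes the work up front from the accounting subsystem, then rate-limits its log lines: progress_update_p() fires at most once per ten seconds, using time_after_eq() so jiffies wraparound is handled. A minimal sketch of the same pattern, assuming kernel context (struct progress and progress_note() are illustrative names):

#include <linux/jiffies.h>
#include <linux/printk.h>

struct progress {
	unsigned long	next_print;
	u64		done;
	u64		total;
};

static void progress_note(struct progress *p)
{
	p->done++;

	/* time_after_eq() compares jiffies safely across wraparound */
	if (time_after_eq(jiffies, p->next_print)) {
		p->next_print = jiffies + HZ * 10;
		pr_info("at %llu/%llu\n", p->done, p->total);
	}
}
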
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 7daecadb764e..3b29fdf519dd 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -134,28 +134,37 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
}
}
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
- struct bpos *bucket_pos, struct bch_backpointer *bp)
+ struct bpos *bucket_pos, struct bch_backpointer *bp,
+ u64 sectors)
{
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
-
*bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
- .data_type = data_type,
+ .data_type = bch2_bkey_ptr_data_type(k, p, entry),
.bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
p.crc.offset,
- .bucket_len = ptr_disk_sectors(sectors, p),
+ .bucket_len = sectors,
.pos = k.k->p,
};
}
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+
+ __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors);
+}
+
int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 0c7086e00d18..c711d4c27a03 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -542,7 +542,7 @@ struct bch_dev {
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
- struct bucket_array __rcu *buckets_gc;
+ GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
@@ -871,6 +871,7 @@ struct bch_fs {
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
+ unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
u64 reserved; /* sectors */
@@ -1023,6 +1024,7 @@ struct bch_fs {
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
+ struct rhashtable vfs_inodes_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -1044,8 +1046,6 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
- /* bitmap of explicitly enabled recovery passes: */
- u64 recovery_passes_explicit;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
@@ -1085,7 +1085,6 @@ struct bch_fs {
u64 __percpu *counters;
unsigned copy_gc_enabled:1;
- bool promote_whole_extents;
struct bch2_time_stats times[BCH_TIME_STAT_NR];
@@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
+ s64 sec;
s32 rem;
time += c->sb.time_base_lo;
- t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
- t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
return t;
}
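
The bch2_time_to_timespec() fix above matters for timestamps before the filesystem's time base: div_s64_rem() truncates toward zero, so a negative input yields a negative remainder, and assigning that straight to tv_nsec produced a denormalized timespec. set_normalized_timespec64() folds the remainder back into [0, NSEC_PER_SEC). A userspace sketch of the normalization (numbers illustrative):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

struct ts { int64_t sec; long long nsec; };

/*
 * Fold nsec into [0, NSEC_PER_SEC), adjusting sec to compensate -
 * the same job set_normalized_timespec64() does in the kernel.
 */
static struct ts normalize(int64_t sec, long long nsec)
{
	while (nsec < 0)		{ nsec += NSEC_PER_SEC; sec--; }
	while (nsec >= NSEC_PER_SEC)	{ nsec -= NSEC_PER_SEC; sec++; }
	return (struct ts) { sec, nsec };
}

int main(void)
{
	/* e.g. time = -1 unit: div_s64_rem() yields sec = 0, rem = -1 */
	struct ts t = normalize(0, -1);

	printf("%lld.%09lld\n", (long long) t.sec, t.nsec);	/* -1.999999999 */
	return 0;
}
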
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 14ce726bf5a3..8c4addddd07e 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+ struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 575e1d0b6eeb..d1f6092624d8 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -304,11 +304,6 @@ struct bkey_float {
};
#define BKEY_MANTISSA_BITS 16
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
- return idx * sizeof(struct bkey_float);
-}
-
struct ro_aux_tree {
u8 nothing[0];
struct bkey_float f[];
@@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
- t->size * sizeof(u8), 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
return __aux_tree_base(b, t);
}
-static u8 *ro_aux_tree_prev(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
@@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
bkey_float(b, t, j)->key_offset);
}
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
- return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
const struct bset_tree *t)
{
@@ -585,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
}
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
u64 v;
@@ -617,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
struct bkey_packed *m = tree_to_bkey(b, t, j);
struct bkey_packed *l = is_power_of_2(j)
? min_key
- : tree_to_prev_bkey(b, t, j >> ffs(j));
+ : tree_to_bkey(b, t, j >> ffs(j));
struct bkey_packed *r = is_power_of_2(j + 1)
? max_key
: tree_to_bkey(b, t, j >> (ffz(j) + 1));
@@ -668,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
- mantissa = bkey_mantissa(m, f, j);
+ mantissa = bkey_mantissa(m, f);
/*
* If we've got garbage bits, set them to all 1s - it's legal for the
@@ -690,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
- return __bset_tree_capacity(b, t) /
- (sizeof(struct bkey_float) + sizeof(u8));
+ return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
}
static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
@@ -720,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
- struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_packed *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
unsigned cacheline = 1;
@@ -733,12 +708,12 @@ retry:
return;
}
- t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+ t->extra = eytzinger1_extra(t->size - 1);
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_p_next(k);
+ k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -746,17 +721,12 @@ retry:
goto retry;
}
- ro_aux_tree_prev(b, t)[j] = prev->u64s;
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
- EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
- while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_p_next(k);
-
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
min_key.k.p = b->data->min_key;
@@ -915,6 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
+static void rw_aux_tree_insert_entry(struct btree *b,
+ struct bset_tree *t,
+ unsigned idx)
+{
+ EBUG_ON(!idx || idx > t->size);
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+ struct bkey_packed *end = idx < t->size
+ ? rw_aux_to_bkey(b, t, idx)
+ : btree_bkey_last(b, t);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (void *) end - (void *) start > L1_CACHE_BYTES) {
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[idx + 1],
+ &rw_aux_tree(b, t)[idx],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx]);
+ t->size++;
+ rw_aux_tree_set(b, t, idx, k);
+ break;
+ }
+ }
+ }
+}
+
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -922,84 +924,59 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+ unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
+ if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+ rw_aux_tree_insert_entry(b, t, t->size);
+ goto verify;
+ }
+
/* returns first entry >= where */
- l = rw_aux_tree_bsearch(b, t, where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
-
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
-
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ idx = rw_aux_tree_bsearch(b, t, where);
+
+ if (rw_aux_tree(b, t)[idx].offset == where) {
+ if (!idx) { /* never delete first entry */
+ idx++;
+ } else if (where < t->end_offset) {
+ rw_aux_tree_set(b, t, idx++, _where);
+ } else {
+ EBUG_ON(where != t->end_offset);
+ rw_aux_tree_insert_entry(b, t, --t->size);
+ goto verify;
+ }
+ }
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+ if (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset + shift ==
+ rw_aux_tree(b, t)[idx - 1].offset) {
+ memmove(&rw_aux_tree(b, t)[idx],
+ &rw_aux_tree(b, t)[idx + 1],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx + 1]);
+ t->size -= 1;
+ }
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
+ for (j = idx; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx - 1].offset);
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
+ rw_aux_tree_insert_entry(b, t, idx);
+verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
void bch2_bset_insert(struct btree *b,
- struct btree_node_iter *iter,
struct bkey_packed *where,
struct bkey_i *insert,
unsigned clobber_u64s)
@@ -1098,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p)
}
static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
@@ -1133,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
goto slowpath;
l = f->mantissa;
- r = bkey_mantissa(packed_search, f, n);
+ r = bkey_mantissa(packed_search, f);
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
goto slowpath;
n = n * 2 + (l < r);
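
bset_search_tree(), whose descent loop ends above with n = n * 2 + (l < r), walks an Eytzinger-layout tree: the keys are stored in BFS order so node n's children sit at 2n and 2n+1, keeping the search cache- and prefetch-friendly. A standalone sketch of that search, independent of the bkey packing above:

#include <stdio.h>

/*
 * Lower bound in a 1-indexed Eytzinger (BFS-order) array:
 * returns the index of the first element >= key, 0 if none.
 */
static unsigned eytzinger_lower_bound(const int *tree, unsigned size, int key)
{
	unsigned n = 1;

	while (n <= size)
		n = n * 2 + (tree[n] < key);

	/*
	 * Shift out the trailing 1s (right turns) plus one 0 (the
	 * last left turn) to recover the answer's index.
	 */
	return n >> __builtin_ffs(~n);
}

int main(void)
{
	/* sorted {1..7} laid out in BFS order; slot 0 is unused */
	const int tree[8] = { 0, 4, 2, 6, 1, 3, 5, 7 };

	printf("%u\n", eytzinger_lower_bound(tree, 7, 5));	/* 6: tree[6] == 5 */
	return 0;
}
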
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 5c6c7a14fa0f..6953d55b72cc 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
- struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
+ unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index e52a06d3418c..6e4afb2b5441 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,11 +15,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/swap.h>
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
if (shrinker_counter) \
- bc->not_freed_##counter++; \
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)
const char * const bch2_btree_node_flags[] = {
@@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
- unsigned i, reserve = 16;
+ unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
- c->btree_cache.reserve = reserve;
+ c->btree_cache.nr_reserve = reserve;
}
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->used - bc->reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
+ BUG_ON(btree_node_hashed(b));
+
+ /*
+ * This should really be done in slub/vmalloc, but we're using the
+ * kmalloc_large() path, so we're working around a slub bug by doing
+ * this here:
+ */
+ if (b->data)
+ mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+ if (b->aux_data)
+ mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
@@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
- bc->used--;
+ bc->nr_freeable--;
btree_node_to_freedlist(bc, b);
}
@@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
bch2_btree_lock_init(&b->c, 0);
- bc->used++;
+ bc->nr_freeable++;
list_add(&b->list, &bc->freeable);
return b;
}
@@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ BUG_ON(!__btree_node_pinned(bc, b));
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
+ lockdep_assert_held(&bc->lock);
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
@@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
b->hash_val = 0;
if (b->c.btree_id < BTREE_ID_NR)
- --bc->used_by_btree[b->c.btree_id];
+ --bc->nr_by_btree[b->c.btree_id];
+
+ bc->live[btree_node_pinned(b)].nr--;
+ bc->nr_freeable++;
+ list_move(&b->list, &bc->freeable);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
- if (!ret && b->c.btree_id < BTREE_ID_NR)
- bc->used_by_btree[b->c.btree_id]++;
- return ret;
+ if (ret)
+ return ret;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ bc->nr_by_btree[b->c.btree_id]++;
+
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+ list_move_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+
+ bc->nr_freeable--;
+ return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
- int ret;
-
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
- ret = __bch2_btree_node_hash_insert(bc, b);
- if (!ret)
- list_add_tail(&b->list, &bc->live);
+ int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
@@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
-
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = b->c.level
- ? bc->pinned_nodes_interior_mask
- : bc->pinned_nodes_leaf_mask;
-
- if ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_read(&bc->dirty) + nr >=
- bc->used * 3 / 4;
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
- bc->freed++;
+ bc->nr_freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- bc->not_freed_access_bit++;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+ --touched;
} else if (!btree_node_reclaim(c, b, true)) {
+ bch2_btree_node_hash_remove(bc, b);
+
freed++;
btree_node_data_free(c, b);
- bc->freed++;
+ bc->nr_freed++;
- bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -450,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
- list_move(&bc->live, &b->list);
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@@ -464,8 +531,8 @@ restart:
break;
}
out_rotate:
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@@ -478,44 +545,45 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- unsigned i, flags;
+ struct btree *b, *t;
+ unsigned long flags;
- shrinker_free(bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
- list_add(&r->b->list, &bc->live);
+ list_add(&r->b->list, &bc->live[0].list);
}
- list_splice(&bc->freeable, &bc->live);
-
- while (!list_empty(&bc->live)) {
- b = list_first_entry(&bc->live, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
@@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
}
BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_read(&c->btree_cache.dirty));
+ atomic_long_read(&c->btree_cache.nr_dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
- while (!list_empty(&bc->freed_nonpcpu)) {
- b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
six_lock_exit(&b->c.lock);
kfree(b);
@@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ BUG_ON(bc->nr_by_btree[i]);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
+ BUG_ON(bc->nr_freeable);
+
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
@@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->reserve; i++)
+ for (i = 0; i < bc->nr_reserve; i++)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
- bc->shrink = shrink;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 4;
- shrink->private_data = c;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@@ -582,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b, false))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -671,9 +759,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
- unsigned flags;
- flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@@ -725,7 +811,7 @@ got_node:
}
mutex_lock(&bc->lock);
- bc->used++;
+ bc->nr_freeable++;
got_mem:
mutex_unlock(&bc->lock);
@@ -745,8 +831,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
- memalloc_nofs_restore(flags);
-
int ret = bch2_trans_relock(trans);
if (unlikely(ret)) {
bch2_btree_node_to_freelist(c, b);
@@ -781,7 +865,6 @@ err:
}
mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
@@ -1269,8 +1352,8 @@ wait_on_io:
BUG_ON(btree_node_dirty(b));
mutex_lock(&bc->lock);
- btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
+ btree_node_data_free(c, b);
mutex_unlock(&bc->lock);
out:
six_unlock_write(&b->c.lock);
@@ -1342,13 +1425,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
}
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, unsigned nr)
+ const char *label, size_t nr)
{
prt_printf(out, "%s\t", label);
prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%u)\n", nr);
+ prt_printf(out, " (%zu)\n", nr);
}
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ NULL
+};
+
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
@@ -1356,24 +1446,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_btree_cache_line(out, c, "total:", bc->used);
- prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
prt_newline(out);
- prt_printf(out, "freed:\t%u\n", bc->freed);
+ prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
- prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
- prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
- prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
- prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
- prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
- prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
- prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
- prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
- prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+ prt_printf(out, " %s\t%llu\n",
+ bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
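
The hand-written not_freed counters above become an array indexed by a BCH_BTREE_CACHE_NOT_FREED_* enum, with the display names generated from the same x-macro list, so the enum, counters, and strings can never drift apart. A standalone sketch of the idiom with a hypothetical reason list:

#include <stdio.h>

#define EXAMPLE_NOT_FREED_REASONS()	\
	x(dirty)			\
	x(access_bit)			\
	x(write_blocked)

enum example_not_freed_reason {
#define x(n) EXAMPLE_NOT_FREED_##n,
	EXAMPLE_NOT_FREED_REASONS()
#undef x
	EXAMPLE_NOT_FREED_NR,
};

static const char * const example_not_freed_strs[] = {
#define x(n) #n,
	EXAMPLE_NOT_FREED_REASONS()
#undef x
	NULL
};

int main(void)
{
	unsigned long not_freed[EXAMPLE_NOT_FREED_NR] = { 0 };

	not_freed[EXAMPLE_NOT_FREED_access_bit]++;
	for (unsigned i = 0; i < EXAMPLE_NOT_FREED_NR; i++)
		printf("  %s\t%lu\n", example_not_freed_strs[i], not_freed[i]);
	return 0;
}
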
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f82064007127..367acd217c6a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index eb3002c4eae7..b5e0692f03c6 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -549,9 +549,8 @@ reconstruct_root:
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
@@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(c, ca) {
- kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
- ca->buckets_gc = NULL;
- }
+ for_each_member_device(c, ca)
+ genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
@@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
int ret = 0;
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!buckets) {
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
break;
}
-
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = ca->mi.nbuckets;
- buckets->nbuckets_minus_first =
- buckets->nbuckets - buckets->first_bucket;
- rcu_assign_pointer(ca->buckets_gc, buckets);
}
bch_err_fn(c, ret);
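
bch2_gc_alloc_start() above drops the single kvmalloc() of the whole bucket array in favour of a GENRADIX, which is built from page-sized chunks and so avoids large contiguous allocations on big devices. A sketch against the include/linux/generic-radix-tree.h API (struct gc_bucket and the wrapper names are hypothetical, standing in for struct bucket and the bcachefs callers):

#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>

struct gc_bucket { u8 gen; };	/* stand-in for struct bucket */

struct dev_gc {
	GENRADIX(struct gc_bucket) buckets;
};

static int dev_gc_init(struct dev_gc *d, size_t nbuckets)
{
	genradix_init(&d->buckets);
	/* preallocate every entry so later lookups cannot fail */
	return genradix_prealloc(&d->buckets, nbuckets, GFP_KERNEL);
}

static struct gc_bucket *dev_gc_bucket(struct dev_gc *d, size_t i)
{
	return genradix_ptr(&d->buckets, i);	/* NULL if unallocated */
}

static void dev_gc_exit(struct dev_gc *d)
{
	genradix_free(&d->buckets);
}
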
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 56ea9a77cd4a..cb48a9477514 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_btree_pos_to_text(&buf, c, b);
bch_err_ratelimited(c, "%s", buf.buf);
- if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
@@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
ret = -BCH_ERR_btree_node_read_error;
@@ -2031,7 +2029,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
do_write:
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 63d76f5c6403..9b01ca3de907 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -18,13 +18,13 @@ struct btree_node_read_all;
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_inc(&c->btree_cache.dirty);
+ atomic_long_inc(&c->btree_cache.nr_dirty);
}
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
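
The counter changes type here, but the accounting discipline is unchanged: the per-node dirty bit gates the global counter, so concurrent setters can never double-count a node. The same pattern in standalone C11, with hypothetical names:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_long nr_dirty;

	struct node { atomic_bool dirty; };

	static void set_dirty(struct node *n)
	{
		/* only the thread that flips the bit bumps the counter */
		if (!atomic_exchange(&n->dirty, true))
			atomic_fetch_add(&nr_dirty, 1);
	}

	static void clear_dirty(struct node *n)
	{
		if (atomic_exchange(&n->dirty, false))
			atomic_fetch_sub(&nr_dirty, 1);
	}

	int main(void)
	{
		struct node n = { 0 };
		set_dirty(&n);
		set_dirty(&n);	/* no-op: bit already set */
		printf("%ld\n", atomic_load(&nr_dirty));	/* prints 1 */
		clear_dirty(&n);
		printf("%ld\n", atomic_load(&nr_dirty));	/* prints 0 */
		return 0;
	}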
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 2e84d22e17bd..bfe9f0c1e1be 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1010,9 +1010,9 @@ retry_all:
* the same position:
*/
if (trans->paths[idx].uptodate) {
- __btree_path_get(&trans->paths[idx], false);
+ __btree_path_get(trans, &trans->paths[idx], false);
ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(&trans->paths[idx], false);
+ __btree_path_put(trans, &trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(!trans->srcu_held))
bch2_trans_srcu_lock(trans);
+ trace_btree_path_traverse_start(trans, path);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
+ trace_btree_path_traverse_end(trans, path);
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
panic("ret %s (%i) trans->restarted %s (%i)\n",
@@ -1225,7 +1228,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans->paths + new, intent);
+ __btree_path_get(trans, trans->paths + new, intent);
#ifdef TRACK_PATH_ALLOCATED
trans->paths[new].ip_allocated = ip;
#endif
@@ -1236,8 +1239,10 @@ __flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(trans->paths + path, intent);
+ struct btree_path *old = trans->paths + path;
+ __btree_path_put(trans, trans->paths + path, intent);
path = btree_path_clone(trans, path, intent, ip);
+ trace_btree_path_clone(trans, old, trans->paths + path);
trans->paths[path].preserve = false;
return path;
}
@@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
+ trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
+
path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
struct btree_path *path = trans->paths + path_idx;
@@ -1361,13 +1368,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
{
struct btree_path *path = trans->paths + path_idx, *dup;
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans, path, intent))
return;
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
+ trace_btree_path_free(trans, path_idx, dup);
+
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
@@ -1392,7 +1401,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- if (!__btree_path_put(trans->paths + path, intent))
+ if (!__btree_path_put(trans, trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- prt_printf(buf, "transaction updates for %s journal seq %llu\n",
- trans->fn, trans->journal_res.seq);
+ prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
+ trans->nr_updates, trans->fn, trans->journal_res.seq);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
@@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
@@ -1716,14 +1725,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
trans->paths[path_pos].level == level) {
- __btree_path_get(trans->paths + path_pos, intent);
+ trace_btree_path_get(trans, trans->paths + path_pos, &pos);
+
+ __btree_path_get(trans, trans->paths + path_pos, intent);
path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
path = trans->paths + path_idx;
} else {
path_idx = btree_path_alloc(trans, path_pos);
path = trans->paths + path_idx;
- __btree_path_get(path, intent);
+ __btree_path_get(trans, path, intent);
path->pos = pos;
path->btree_id = btree_id;
path->cached = cached;
@@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
path->ip_allocated = ip;
#endif
trans->paths_sorted = false;
+
+ trace_btree_path_alloc(trans, path);
}
if (!(flags & BTREE_ITER_nopreserve))
@@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_path *path = btree_iter_path(trans, iter);
if (btree_path_node(path, path->level))
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
return 0;
}
@@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
@@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
@@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@@ -2326,7 +2339,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(trans->paths + iter->update_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_all_snapshots))
@@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
path = btree_iter_path(trans, iter);
+ trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
saved_k = *k.k;
saved_v = k.v;
}
@@ -2527,7 +2541,7 @@ got_key:
continue;
}
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
break;
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
@@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2712,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
+/* Obsolete, but still used by the Rust wrapper in bcachefs-tools */
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
struct bkey_s_c k;
@@ -2911,9 +2926,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
- __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
- __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@@ -3237,7 +3252,7 @@ void bch2_trans_put(struct btree_trans *trans)
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
- __btree_path_put(trans->paths + i->path, true);
+ __btree_path_put(trans, trans->paths + i->path, true);
trans->nr_updates = 0;
check_btree_paths_leaked(trans);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 222b7ce8a901..78e63ad7d380 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -6,6 +6,12 @@
#include "btree_types.h"
#include "trace.h"
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
static inline int __bkey_err(const struct bkey *k)
{
return PTR_ERR_OR_ZERO(k);
@@ -13,16 +19,28 @@ static inline int __bkey_err(const struct bkey *k)
#define bkey_err(_k) __bkey_err((_k).k)
-static inline void __btree_path_get(struct btree_path *path, bool intent)
+static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ unsigned idx = path - trans->paths;
+
+ EBUG_ON(!test_bit(idx, trans->paths_allocated));
+ if (unlikely(path->ref == U8_MAX)) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("path %u refcount overflow\n", idx);
+ }
+
path->ref++;
path->intent_ref += intent;
+ trace_btree_path_get_ll(trans, path);
}
-static inline bool __btree_path_put(struct btree_path *path, bool intent)
+static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
EBUG_ON(!path->ref);
EBUG_ON(!path->intent_ref && intent);
+
+ trace_btree_path_put_ll(trans, path);
path->intent_ref -= intent;
return --path->ref == 0;
}
@@ -511,6 +529,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin(), which on second and subsequent
+ * calls frees all memory allocated in this transaction
+ */
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size = roundup(size, 8);
@@ -814,20 +838,6 @@ transaction_restart: \
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -868,7 +878,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, _do); \
+ _ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
@@ -881,7 +891,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, ((_p = _do), 0)); \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
@@ -894,12 +904,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret; \
})
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
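
Passing the transaction into __btree_path_get()/__btree_path_put() buys two things: low-level tracepoints, and a check that a path's u8 refcount never wraps - with at most 255 live references, a leak would otherwise silently wrap ref to zero and free a path still in use. The guard reduced to standalone C (hypothetical names):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct path { uint8_t ref; };

	static void path_get(struct path *p)
	{
		/* at 255 the next increment would wrap to 0 */
		if (p->ref == UINT8_MAX) {
			fprintf(stderr, "path refcount overflow\n");
			abort();	/* the kernel code dumps all paths, then panics */
		}
		p->ref++;
	}

	/* returns true when the last reference is dropped */
	static int path_put(struct path *p)
	{
		return --p->ref == 0;
	}

	int main(void)
	{
		struct path p = { 0 };
		path_get(&p);
		printf("freed: %d\n", path_put(&p));
		return 0;
	}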
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index fda7998734cb..244610b1d0b5 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -79,134 +79,47 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
- BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params));
- memset(&ck->key, ~0, sizeof(ck->key));
-
- atomic_long_dec(&c->nr_keys);
-}
-
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- if (ck->c.lock.readers) {
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- } else {
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
+ bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params);
+ if (ret) {
+ memset(&ck->key, ~0, sizeof(ck->key));
+ atomic_long_dec(&c->nr_keys);
}
- atomic_long_inc(&bc->nr_freed);
-
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
+ return ret;
}
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
- struct bkey_cached *pos;
-
- bc->nr_freed_nonpcpu++;
+ struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+ struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
- list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
- if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
- pos->btree_trans_barrier_seq)) {
- list_move(&ck->list, &pos->list);
- return;
- }
- }
-
- list_move(&ck->list, &bc->freed_nonpcpu);
+ this_cpu_dec(*c->btree_key_cache.nr_pending);
+ kmem_cache_free(bch2_key_cache, ck);
}
-#endif
-
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
- bool freed = false;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- if (f->nr < ARRAY_SIZE(f->objs)) {
- f->objs[f->nr++] = ck;
- freed = true;
- }
- preempt_enable();
- if (!freed) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (f->nr > ARRAY_SIZE(f->objs) / 2) {
- struct bkey_cached *ck2 = f->objs[--f->nr];
-
- __bkey_cached_move_to_freelist_ordered(bc, ck2);
- }
- preempt_enable();
-
- __bkey_cached_move_to_freelist_ordered(bc, ck);
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- mutex_unlock(&bc->lock);
- }
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- list_del_init(&ck->list);
- atomic_long_inc(&bc->nr_freed);
-
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
- bkey_cached_move_to_freelist(bc, ck);
-
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
+
+ bool pcpu_readers = ck->c.lock.readers != NULL;
+ rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+ this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
if (unlikely(!ck))
return NULL;
@@ -224,74 +137,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
- if (!pcpu_readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
- if (f->nr)
- ck = f->objs[--f->nr];
- preempt_enable();
-
- if (!ck) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (!list_empty(&bc->freed_nonpcpu) &&
- f->nr < ARRAY_SIZE(f->objs) / 2) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- f->objs[f->nr++] = ck;
- }
-
- ck = f->nr ? f->objs[--f->nr] : NULL;
- preempt_enable();
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_nonpcpu)) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- }
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_pcpu)) {
- ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_pcpu--;
- }
- mutex_unlock(&bc->lock);
- }
-
- if (ck) {
- ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
- if (unlikely(ret)) {
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
-
- ret = bch2_btree_node_lock_write(trans, path, &ck->c);
- if (unlikely(ret)) {
- btree_node_unlock(trans, path, 0);
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- return ck;
- }
+ struct bkey_cached *ck = container_of_or_null(
+ rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
@@ -302,15 +155,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
return ERR_PTR(ret);
}
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ if (ck) {
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ ck->c.cached = true;
+ goto lock;
+ }
- ck->c.cached = true;
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
+ ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
+
+ return NULL;
+lock:
+ six_lock_intent(&ck->c.lock, NULL, NULL);
+ six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@@ -322,21 +179,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
- mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
- goto out;
+ if (bkey_cached_evict(c, ck))
+ goto out;
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
- mutex_unlock(&c->lock);
return ck;
}
@@ -415,7 +272,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@@ -611,8 +468,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- bkey_cached_evict(&c->btree_key_cache, ck);
- bkey_cached_free_fast(&c->btree_key_cache, ck);
+ if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+ bkey_cached_free(&c->btree_key_cache, ck);
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@@ -722,7 +583,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@@ -735,48 +596,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
- struct bkey_cached *ck, *t;
+ struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned start, flags;
+ unsigned iter, start;
int srcu_idx;
- mutex_lock(&bc->lock);
- bc->requested_to_free += sc->nr_to_scan;
-
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- flags = memalloc_nofs_save();
-
- /*
- * Newest freed entries are at the end of the list - once we hit one
- * that's too new to be freed, we can bail out:
- */
- list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_nonpcpu--;
- bc->freed++;
- }
-
- list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_pcpu--;
- bc->freed++;
- }
-
rcu_read_lock();
+
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
@@ -792,17 +619,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
}
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- start = bc->shrink_iter;
+ iter = bc->shrink_iter;
+ if (iter >= tbl->size)
+ iter = 0;
+ start = iter;
do {
struct rhash_head *pos, *next;
- pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
+ pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -812,29 +640,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
- } else {
- bkey_cached_evict(bc, ck);
+ } else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
- bc->moved_to_freelist++;
+ bc->freed++;
freed++;
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
- break;
+ goto out;
pos = next;
}
- bc->shrink_iter++;
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- } while (scanned < nr && bc->shrink_iter != start);
+ iter++;
+ if (iter >= tbl->size)
+ iter = 0;
+ } while (scanned < nr && iter != start);
+out:
+ bc->shrink_iter = iter;
rcu_read_unlock();
- memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- mutex_unlock(&bc->lock);
return freed;
}
@@ -862,18 +692,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
- struct bkey_cached *ck, *n;
+ struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
-#ifdef __KERNEL__
- int cpu;
-#endif
shrinker_free(bc->shrink);
- mutex_lock(&bc->lock);
-
/*
* The loop is needed to guard against racing with rehash:
*/
@@ -892,44 +717,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &items);
+ BUG_ON(!bkey_cached_evict(bc, ck));
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
}
}
rcu_read_unlock();
}
-#ifdef __KERNEL__
- if (bc->pcpu_freed) {
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
- }
- }
- }
-#endif
-
- BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
- BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
- list_splice(&bc->freed_pcpu, &items);
- list_splice(&bc->freed_nonpcpu, &items);
-
- mutex_unlock(&bc->lock);
-
- list_for_each_entry_safe(ck, n, &items, list) {
- cond_resched();
-
- list_del(&ck->list);
- kfree(ck->k);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- }
-
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
test_bit(BCH_FS_was_rw, &c->flags))
@@ -943,14 +738,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
- free_percpu(bc->pcpu_freed);
+ rcu_pending_exit(&bc->pending[0]);
+ rcu_pending_exit(&bc->pending[1]);
+
+ free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
- mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed_pcpu);
- INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -958,11 +753,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
-#ifdef __KERNEL__
- bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
- if (!bc->pcpu_freed)
+ bc->nr_pending = alloc_percpu(size_t);
+ if (!bc->nr_pending)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+ rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@@ -984,45 +781,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
printbuf_tabstop_push(out, 24);
printbuf_tabstop_push(out, 12);
- unsigned flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
- prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
- prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
-
- prt_printf(out, "\nshrinker:\n");
+ prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
+ prt_newline(out);
+ prt_printf(out, "shrinker:\n");
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
-
- prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
-
- struct bkey_cached *ck;
- unsigned iter = 0;
- list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
- prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
-
- iter = 0;
- list_for_each_entry(ck, &bc->freed_pcpu, list) {
- prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
- mutex_unlock(&bc->lock);
- memalloc_flags_restore(flags);
+ prt_newline(out);
+ prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)
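
Taken together, the key cache changes replace the hand-rolled pcpu/non-pcpu freelists with the new rcu_pending machinery: freeing defers the object until an SRCU grace period of btree_trans_barrier has elapsed, and allocation recycles expired objects before touching the slab. The lifecycle, restated with only the calls this patch itself uses:

	/* init: one rcu_pending per lock class; __bkey_cached_free runs
	 * once the grace period has expired */
	rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free);
	rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free);

	/* free: defer instead of kfree - SRCU readers may still see ck */
	rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);

	/* alloc fast path: reuse an object whose grace period expired */
	struct bkey_cached *ck = container_of_or_null(
			rcu_pending_dequeue(&bc->pending[pcpu_readers]),
			struct bkey_cached, rcu);

rcu_pending_dequeue_from_all() is the last resort once slab allocation has failed; it can hand back objects still waiting on their grace period, which appears safe here because lookups revalidate the key under the object's six lock.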
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 237e8bb3ac40..722f1ed10551 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -2,33 +2,25 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
+#include "rcu_pending.h"
struct btree_key_cache {
- struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed_pcpu;
- size_t nr_freed_pcpu;
- struct list_head freed_nonpcpu;
- size_t nr_freed_nonpcpu;
-
struct shrinker *shrink;
unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
- atomic_long_t nr_freed;
+ /* 0: non pcpu reader locks, 1: pcpu reader locks */
+ struct rcu_pending pending[2];
+ size_t __percpu *nr_pending;
+
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
- unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;
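
nr_pending is the only bookkeeping the freelists leave behind: a per-cpu size_t, adjusted locklessly on the hot paths and only summed when formatting statistics. In kernel terms (per_cpu_sum() is bcachefs's helper from util.h):

	size_t __percpu *nr_pending = alloc_percpu(size_t);

	this_cpu_inc(*nr_pending);	/* enqueued for deferred free */
	this_cpu_dec(*nr_pending);	/* callback actually freed it */

	/* stats: racy but cheap - no lock, no shared cacheline */
	prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(nr_pending));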
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 11a64ead8685..7c07f9fa9add 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -218,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
bool lock_may_not_fail,
unsigned long ip)
{
- int ret;
-
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
- ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
+ int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
+
+ if (!ret)
+ trace_btree_path_lock(trans, _THIS_IP_, b);
return ret;
}
@@ -281,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ bch2_trans_verify_not_unlocked(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -400,12 +402,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
/* misc: */
-static inline void btree_path_set_should_be_locked(struct btree_path *path)
+static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index a0101d9c5d83..91884da4e30a 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
- bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ bch2_bset_insert(b, k, insert, clobber_u64s);
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index b256b2a20a4f..4568a41fefaf 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -138,6 +138,31 @@ struct btree {
struct list_head list;
};
+#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+ x(read_in_flight) \
+ x(write_in_flight) \
+ x(noevict) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(access_bit)
+
+enum bch_btree_cache_not_freed_reasons {
+#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
+};
+
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
+};
+
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -155,28 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
- /* Number of elements in live + freeable lists */
- unsigned used;
- unsigned reserve;
- unsigned freed;
- unsigned not_freed_lock_intent;
- unsigned not_freed_lock_write;
- unsigned not_freed_dirty;
- unsigned not_freed_read_in_flight;
- unsigned not_freed_write_in_flight;
- unsigned not_freed_noevict;
- unsigned not_freed_write_blocked;
- unsigned not_freed_will_make_reachable;
- unsigned not_freed_access_bit;
- atomic_t dirty;
- struct shrinker *shrink;
+ size_t nr_freeable;
+ size_t nr_reserve;
+ size_t nr_by_btree[BTREE_ID_NR];
+ atomic_long_t nr_dirty;
- unsigned used_by_btree[BTREE_ID_NR];
+ /* shrinker stats */
+ size_t nr_freed;
+ u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@@ -189,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
- u64 pinned_nodes_leaf_mask;
- u64 pinned_nodes_interior_mask;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@@ -386,17 +402,16 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
- struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
+ struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
@@ -583,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
- x(never_write)
+ x(never_write) \
+ x(pinned)
enum btree_flags {
/* First bits for btree node write type */
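
BCH_BTREE_CACHE_NOT_FREED_REASONS() is the usual bcachefs x-macro: a single list expands into both the enum indexing the not_freed[] counters and the string table bch2_btree_cache_to_text() prints them with, so the two cannot drift apart. A compilable miniature with a shortened reason list:

	#include <stdio.h>

	#define NOT_FREED_REASONS()	\
		x(lock_intent)		\
		x(dirty)		\
		x(noevict)

	enum not_freed_reason {
	#define x(n) NOT_FREED_##n,
		NOT_FREED_REASONS()
	#undef x
		NOT_FREED_NR,
	};

	static const char * const not_freed_strs[] = {
	#define x(n) #n,
		NOT_FREED_REASONS()
	#undef x
	};

	int main(void)
	{
		unsigned long not_freed[NOT_FREED_NR] = { 2, 5, 0 };

		for (unsigned i = 0; i < NOT_FREED_NR; i++)
			printf("%-12s %lu\n", not_freed_strs[i], not_freed[i]);
		return 0;
	}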
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index d6f6df10dcc3..514df618548e 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->key_cache_already_flushed = true;
i->flags |= BTREE_TRIGGER_norun;
- btree_path_set_should_be_locked(btree_path);
+ btree_path_set_should_be_locked(trans, btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
bch2_path_put(trans, path_idx, true);
@@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
break;
}
- if (!cmp && i < trans->updates + trans->nr_updates) {
+ bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
+
+ if (overwrite) {
EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
@@ -449,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
}
}
- __btree_path_get(trans->paths + i->path, true);
+ __btree_path_get(trans, trans->paths + i->path, true);
+
+ trace_update_by_path(trans, path, i, overwrite);
/*
* If a key is present in the key cache, it must also exist in the
@@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
}
return 0;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 8fd112026e7a..190bc1e81756 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "clock.h"
#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
@@ -145,7 +146,7 @@ fsck_err:
printbuf_exit(&buf);
return ret;
topology_repair:
- if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
bch2_inconsistent_error(c);
ret = -BCH_ERR_btree_need_topology_repair;
@@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
__btree_node_free(trans, b);
+
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
@@ -732,6 +737,18 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s", bch2_err_str(ret));
err:
/*
+ * Ensure transaction is unlocked before using btree_node_lock_nopath()
+ * (the use of which is always suspect, we need to work on removing this
+ * in the future)
+ *
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
+ * calls bch2_path_upgrade() before we call path_make_mut(), so we may
+ * rarely end up with a locked path besides the one we have here:
+ */
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+
+ /*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
* recheck under btree_update_lock below:
@@ -750,18 +767,6 @@ err:
* we're in journal error state:
*/
- /*
- * Ensure transaction is unlocked before using
- * btree_node_lock_nopath() (the use of which is always suspect,
- * we need to work on removing this in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so
- * we may rarely end up with a locked path besides the one we
- * have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
@@ -1981,7 +1986,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
- btree_path_set_should_be_locked(trans->paths + sib_path);
+ btree_path_set_should_be_locked(trans, trans->paths + sib_path);
m = trans->paths[sib_path].l[level].b;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 02c6ecada97c..10f400957f21 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
+ bch2_trans_verify_not_unlocked(trans);
+
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 721bbe1dffc1..546cd01a72e3 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
struct bch_dev_usage *usage)
{
+ if (out->nr_tabstops < 5) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ }
+
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
@@ -272,7 +281,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
goto err;
rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
rcu_read_unlock();
if (level) {
@@ -477,7 +486,7 @@ out:
return ret;
err:
bch2_dump_trans_updates(trans);
- ret = -EIO;
+ ret = -BCH_ERR_bucket_ref_update;
goto out;
}
@@ -556,22 +565,24 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
s64 *sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
bool insert = !(flags & BTREE_TRIGGER_overwrite);
struct printbuf buf = PRINTBUF;
int ret = 0;
- struct bch_fs *c = trans->c;
+ u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+ *sectors = insert ? abs_sectors : -abs_sectors;
+
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -EIO;
+ ret = -BCH_ERR_trigger_pointer;
goto err;
}
struct bpos bucket;
struct bch_backpointer bp;
- bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
- *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
+ __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors);
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
@@ -593,7 +604,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_trigger_pointer;
goto err_unlock;
}
@@ -638,7 +649,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -EIO;
+ ret = -BCH_ERR_trigger_stripe_pointer;
goto err;
}
@@ -677,7 +688,7 @@ err:
(u64) p.ec.idx, buf.buf);
printbuf_exit(&buf);
bch2_inconsistent_error(c);
- return -EIO;
+ return -BCH_ERR_trigger_stripe_pointer;
}
m->block_sectors[p.ec.block] += sectors;
@@ -741,7 +752,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
} else if (!p.has_ec) {
*replicas_sectors += disk_sectors;
- acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@@ -957,7 +968,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
- ret = -EIO;
+ ret = -BCH_ERR_metadata_bucket_inconsistency;
goto err;
}
@@ -1013,7 +1024,7 @@ err:
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
- return -EIO;
+ return -BCH_ERR_metadata_bucket_inconsistency;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
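
Swapping -EIO for codes like -BCH_ERR_trigger_pointer follows the errcode.h scheme: every failure site gets its own negative value outside the errno range, and the boundary converts it back to a standard errno, losing precision only after the specific cause was logged or traced. A simplified standalone model - the real table maps each code to its own parent errno, not uniformly to EIO:

	#include <errno.h>
	#include <stdio.h>

	enum {
		MY_ERR_START = 2048,		/* above any real errno */
		MY_ERR_trigger_pointer,
		MY_ERR_mark_stripe,
	};

	/* boundary conversion: external callers see a plain errno */
	static int err_class(int err)
	{
		return err <= -MY_ERR_START ? -EIO : err;
	}

	int main(void)
	{
		int ret = -MY_ERR_trigger_pointer;
		printf("internal %d -> errno %d\n", ret, err_class(ret));
		return 0;
	}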
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index edbdffd508fc..e2cb7b24b220 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
-static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->buckets_gc,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->state_lock) ||
- lockdep_is_held(&ca->bucket_lock));
-}
-
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- struct bucket_array *buckets = gc_bucket_array(ca);
-
- if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
- return NULL;
- return buckets->b + b;
+ return genradix_ptr(&ca->buckets_gc, b);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
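
gc_bucket() collapses to a genradix lookup: genradix_ptr() already returns NULL for an index past the allocated range, so the explicit first_bucket/nbuckets bounds check - and struct bucket_array with its RCU lifetime - can go away entirely. The allocate/lookup/free cycle, sketched with the same generic-radix-tree calls this patch uses:

	GENRADIX(struct bucket) buckets_gc;

	genradix_init(&buckets_gc);

	/* gc start: preallocate so later lookups never fail or sleep */
	int ret = genradix_prealloc(&buckets_gc, nbuckets, GFP_KERNEL);

	/* NULL for any index that was never allocated */
	struct bucket *g = genradix_ptr(&buckets_gc, b);

	/* gc teardown */
	genradix_free(&buckets_gc);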
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index c9698cdf866f..28bd09a253c8 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -19,14 +19,6 @@ struct bucket {
u32 stripe_sectors;
} __aligned(sizeof(long));
-struct bucket_array {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- struct bucket b[];
-};
-
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index e7208bf1974e..ce8fc677bef9 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
- ret = crypto_skcipher_encrypt(req);
+ int ret = crypto_skcipher_encrypt(req);
if (ret)
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
@@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
void *buf, size_t len)
{
if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg,
- is_vmalloc_addr(buf)
- ? vmalloc_to_page(buf)
- : virt_to_page(buf),
- len, offset_in_page(buf));
+ struct scatterlist sg = {};
+
+ sg_mark_end(&sg);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
return do_encrypt_sg(tfm, nonce, &sg, len);
} else {
- unsigned pages = buf_pages(buf, len);
- struct scatterlist *sg;
- size_t orig_len = len;
- int ret, i;
-
- sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return -BCH_ERR_ENOMEM_do_encrypt;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret;
- sg_init_table(sg, pages);
+ darray_init(&sgl);
- for (i = 0; i < pages; i++) {
+ while (len) {
unsigned offset = offset_in_page(buf);
- unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) vmalloc_to_page(buf),
+ .offset = offset,
+ .length = min(len, PAGE_SIZE - offset),
+ };
- sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
- buf += pg_len;
- len -= pg_len;
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+ BUG_ON(darray_push(&sgl, sg));
+ }
+
+ buf += sg.length;
+ len -= sg.length;
+ sgl_len += sg.length;
}
- ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
- kfree(sg);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
}
@@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
int ret = 0;
if (!bch2_csum_type_is_encryption(type))
return 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
+ darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
-
- ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) bv.bv_page,
+ .offset = bv.bv_offset,
+ .length = bv.bv_len,
+ };
+
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
if (ret)
- return ret;
+ goto err;
- nonce = nonce_add(nonce, bytes);
- bytes = 0;
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
+ BUG_ON(darray_push(&sgl, sg));
}
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
- }
-
- if (sg != sgl) {
- sg_mark_end(sg - 1);
- return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ sgl_len += sg.length;
}
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
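
Both do_encrypt() and __bch2_encrypt_bio() now share the same shape: push scatterlist entries onto a growable darray, and if a push fails (an allocation failure while growing), encrypt what has accumulated, advance the nonce by the bytes consumed, reset the array, and retry the push - which must then succeed, since the preallocated slots are free again. The control flow in miniature, modeled in portable C with a fixed batch standing in for the darray:

	#include <stdio.h>
	#include <stddef.h>

	#define BATCH 4

	struct chunk { size_t len; };

	static void flush(struct chunk *batch, size_t nr, size_t *nonce)
	{
		printf("encrypt %zu chunks at nonce %zu\n", nr, *nonce);
		for (size_t i = 0; i < nr; i++)
			*nonce += batch[i].len;	/* nonce advances by bytes processed */
	}

	int main(void)
	{
		struct chunk batch[BATCH];
		size_t nr = 0, nonce = 0;

		for (size_t remaining = 10; remaining; remaining--) {
			if (nr == BATCH) {	/* "push failed": flush and retry */
				flush(batch, nr, &nonce);
				nr = 0;
			}
			batch[nr++] = (struct chunk) { .len = 4096 };
		}
		if (nr)
			flush(batch, nr, &nonce);	/* final partial batch */
		return 0;
	}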
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 85c975dfbcfe..82c79c8baf92 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
-
void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index b7d223f85873..4f06cd8bbbe1 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array(new_size, element_size, gfp);
+ void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4b340d13caac..8f4c3f0665c4 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
-
-static inline int __darray_resize(darray_char *d, size_t element_size,
- size_t new_size, gfp_t gfp)
-{
- return unlikely(new_size > d->size)
- ? __bch2_darray_resize(d, element_size, new_size, gfp)
- : 0;
-}
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+
+#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
+
+#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+ (unlikely((_new_size) > (_d)->size) \
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ : 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
-{
- return __darray_resize(d, t_size, d->nr + more, gfp);
-}
-
#define darray_make_room_gfp(_d, _more, _gfp) \
- __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+ darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 004894ad4147..757b9884ef55 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -571,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 32bfdf19289a..84dd4a879d98 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -552,62 +552,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- subvol_inum target;
- u32 snapshot;
struct bkey_buf sk;
- int ret;
-
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot),
- POS(inum.inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
+ POS(inum.inum, ctx->pos),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
- ret = bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
+ subvol_inum target;
+ int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret2 > 0)
+ continue;
- /*
- * read_target looks up subvolumes, we can overflow paths if the
- * directory has many subvolumes in it
- *
- * XXX: btree_trans_too_many_iters() is something we'd like to
- * get rid of, and there's no good reason to be using it here
- * except that we don't yet have a for_each_btree_key() helper
- * that does subvolume_get_snapshot().
- */
- ret = drop_locks_do(trans,
- bch2_dir_emit(ctx, dirent, target)) ?:
- btree_trans_too_many_iters(trans);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
+ ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ })));
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+ return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 141a4c63142f..1587c6e1866a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -18,6 +18,7 @@
#include "ec.h"
#include "error.h"
#include "io_read.h"
+#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
@@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
bch2_prt_csum_type(out, s.csum_type);
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+ if (s.disk_label) {
+ prt_str(out, " label");
+ bch2_disk_path_to_text(out, c, s.disk_label - 1);
+ }
+
for (unsigned i = 0; i < s.nr_blocks; i++) {
const struct bch_extent_ptr *ptr = sp->ptrs + i;
if ((void *) ptr >= bkey_val_end(k))
break;
+ prt_char(out, ' ');
bch2_extent_ptr_to_text(out, c, ptr);
if (s.csum_type < BCH_CSUM_NR &&
@@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
} else {
@@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
}
@@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -EIO;
+ if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err_unlock;
}
@@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
+static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
+{
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->disk_label = s->disk_label;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+}
+
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
memset(m, 0, sizeof(*m));
} else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+ stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
@@ -816,13 +829,16 @@ err:
}
/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf;
+ struct ec_stripe_buf *buf = NULL;
struct closure cl;
struct bch_stripe *v;
unsigned i, offset;
+ const char *msg = NULL;
+ struct printbuf msgbuf = PRINTBUF;
int ret = 0;
closure_init_stack(&cl);
@@ -835,32 +851,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: error %i looking up stripe", ret);
- kfree(buf);
- return -EIO;
+ msg = "stripe not found";
+ goto err;
}
v = &bkey_i_to_stripe(&buf->key)->v;
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: pointer doesn't match stripe");
- ret = -EIO;
+ msg = "pointer doesn't match stripe";
goto err;
}
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: read is bigger than stripe");
- ret = -EIO;
+ msg = "read is bigger than stripe";
goto err;
}
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
- if (ret)
+ if (ret) {
+ msg = "-ENOMEM";
goto err;
+ }
for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
@@ -868,9 +880,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- ret = -EIO;
+ msg = "unable to read enough blocks";
goto err;
}
@@ -882,10 +892,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-err:
+out:
ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
+err:
+ bch2_bkey_val_to_text(&msgbuf, c, orig_k);
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
+	printbuf_exit(&msgbuf);

+ ret = -BCH_ERR_stripe_reconstruct;
+ goto out;
}
/* stripe bucket accounting: */
@@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
@@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
-static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s = h->s;
+ lockdep_assert_held(&h->lock);
+
BUG_ON(!s->allocated && !s->err);
h->s = NULL;
@@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_new_put(c, s, STRIPE_REF_io);
}
+static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
+{
+ h->s->err = err;
+ ec_stripe_new_set_pending(c, h);
+}
+
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
struct ec_stripe_new *s = ob->ec;
@@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
struct bkey_i *k,
unsigned nr_data,
unsigned nr_parity,
- unsigned stripe_size)
+ unsigned stripe_size,
+ unsigned disk_label)
{
struct bkey_i_stripe *s = bkey_stripe_init(k);
unsigned u64s;
@@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
s->v.csum_type = BCH_CSUM_crc32c;
- s->v.pad = 0;
+ s->v.disk_label = disk_label;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
BUG_ON(1 << s->v.csum_granularity_bits >=
@@ -1685,40 +1711,32 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->nr_parity = h->redundancy;
ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity, h->blocksize);
+ s->nr_data, s->nr_parity,
+ h->blocksize, h->disk_label);
h->s = s;
+ h->nr_created++;
return 0;
}
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
+static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->target = target;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
+ struct bch_devs_mask devs = h->devs;
rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, target);
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ unsigned nr_devs = dev_mask_nr(&h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
+ unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
h->blocksize = pick_blocksize(c, &h->devs);
+ h->nr_active_devs = 0;
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1729,9 +1747,50 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
- if (h->nr_active_devs < h->redundancy + 2)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
- h->nr_active_devs, h->redundancy + 2);
+ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
+
+ if (h->insufficient_devs) {
+ const char *err;
+
+ if (nr_devs < h->redundancy + 2)
+ err = NULL;
+ else if (nr_devs_with_durability < h->redundancy + 2)
+ err = "cannot use durability=0 devices";
+ else
+ err = "mismatched bucket sizes";
+
+ if (err)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
+ h->nr_active_devs, h->redundancy + 2, err);
+ }
+
+ struct bch_devs_mask devs_leaving;
+ bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
+
+ if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
+ ec_stripe_new_cancel(c, h, -EINTR);
+
+ h->rw_devs_change_count = c->rw_devs_change_count;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->disk_label = disk_label;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
list_add(&h->list, &c->ec_stripe_head_list);
return h;
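
The stripe head's device mask is now refreshed lazily: rather than rebuilding it on every allocation, __bch2_ec_stripe_head_get() compares a cached generation number against c->rw_devs_change_count and only calls ec_stripe_head_devs_update() when they differ. The producer side is not in this hunk; a sketch of the pattern, with an illustrative (hypothetical) placement:

	/* hypothetical: wherever a device transitions rw <-> ro */
	static void note_rw_devs_changed(struct bch_fs *c)
	{
		c->rw_devs_change_count++;	/* invalidates every cached h->devs */
	}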
@@ -1743,14 +1802,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data) == h->s->nr_data)
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_set_pending(c, h);
mutex_unlock(&h->lock);
}
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ unsigned disk_label,
unsigned algo,
unsigned redundancy,
enum bch_watermark watermark)
@@ -1768,27 +1827,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto found;
+ goto err;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->target == target &&
+ if (h->disk_label == disk_label &&
h->algo == algo &&
h->redundancy == redundancy &&
h->watermark == watermark) {
ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret)
+ if (ret) {
h = ERR_PTR(ret);
+ goto err;
+ }
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+ h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
found:
- if (!IS_ERR_OR_NULL(h) &&
- h->nr_active_devs < h->redundancy + 2) {
+ if (h->rw_devs_change_count != c->rw_devs_change_count)
+ ec_stripe_head_devs_update(c, h);
+
+ if (h->insufficient_devs) {
mutex_unlock(&h->lock);
h = NULL;
}
+err:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1878,7 +1942,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
return 0;
}
-/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
{
@@ -1901,7 +1964,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
m = genradix_ptr(&c->stripes, stripe_idx);
- if (m->algorithm == head->algo &&
+ if (m->disk_label == head->disk_label &&
+ m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
@@ -2046,9 +2110,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
bool waiting = false;
+ unsigned disk_label = 0;
+ struct target t = target_decode(target);
int ret;
- h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (t.type == TARGET_GROUP) {
+ if (t.group > U8_MAX) {
+ bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
+ return NULL;
+ }
+ disk_label = t.group + 1; /* 0 == no label */
+ }
+
+ h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
if (IS_ERR_OR_NULL(h))
return h;
@@ -2126,6 +2200,73 @@ err:
return ERR_PTR(ret);
}
+/* device removal */
+
+static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_i_stripe *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ s64 sectors = 0;
+ for (unsigned i = 0; i < s->v.nr_blocks; i++)
+ sectors -= stripe_blockcount_get(&s->v, i);
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == k_a.k->p.inode)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ sectors = -sectors;
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter,
+ BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ bch2_invalidate_stripe_to_dev(trans, k);
+ })));
+}
+
+/* startup/shutdown */
+
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
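
bch2_invalidate_stripe_to_dev() does its accounting in two halves: subtract the stripe's live sectors from the old replicas entry, rewrite the departing device's pointer to BCH_SB_MEMBER_INVALID, then add the sectors back under the new (degraded) replicas entry. In sketch form, with acct(), replicas_of() and live_sectors() as illustrative stand-ins:

	s64 sectors = -(s64) live_sectors(s);	/* negative: subtract */

	acct(replicas_of(s), sectors);		/* remove from old entry */
	ptr->dev = BCH_SB_MEMBER_INVALID;	/* drop the departing device */
	acct(replicas_of(s), -sectors);		/* re-add under the new entry */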
@@ -2151,8 +2292,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
}
goto unlock;
found:
- h->s->err = -BCH_ERR_erofs_no_writes;
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
mutex_unlock(&h->lock);
}
@@ -2197,17 +2337,9 @@ int bch2_stripes_read(struct bch_fs *c)
if (ret)
break;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
@@ -2252,6 +2384,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
prt_printf(out, " %u", s->blocks[i]);
prt_newline(out);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
+ prt_newline(out);
}
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2261,9 +2395,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "target %u algo %u redundancy %u %s:\n",
- h->target, h->algo, h->redundancy,
- bch2_watermarks[h->watermark]);
+ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
+ h->disk_label, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark],
+ h->nr_created);
if (h->s)
bch2_new_stripe_to_text(out, c, h->s);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 9baf3411a8f9..43326370b410 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -188,10 +188,15 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
- unsigned target;
+ unsigned disk_label;
unsigned algo;
unsigned redundancy;
enum bch_watermark watermark;
+ bool insufficient_devs;
+
+ unsigned long rw_devs_change_count;
+
+ u64 nr_created;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -204,7 +209,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
@@ -249,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
index 44ce88ba08d7..64ef52e00078 100644
--- a/fs/bcachefs/ec_format.h
+++ b/fs/bcachefs/ec_format.h
@@ -11,7 +11,14 @@ struct bch_stripe {
__u8 csum_granularity_bits;
__u8 csum_type;
- __u8 pad;
+
+ /*
+ * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
+ *
+	 * we can get away with this because it only needs to point to a
+	 * disk label, not a target:
+ */
+ __u8 disk_label;
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 1df03dccfc72..8d1e70e830ac 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -16,6 +16,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 blocks_nonempty;
+ u8 disk_label;
};
struct gc_stripe {
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 742dcdd3e5d7..60b7875adada 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -119,8 +119,8 @@
x(EEXIST, EEXIST_str_hash_set) \
x(EEXIST, EEXIST_discard_in_flight_add) \
x(EEXIST, EEXIST_subvolume_create) \
- x(0, open_buckets_empty) \
- x(0, freelist_empty) \
+ x(ENOSPC, open_buckets_empty) \
+ x(ENOSPC, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
x(0, transaction_restart) \
x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@@ -244,6 +244,16 @@
x(EIO, btree_node_read_error) \
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
+ x(EIO, bucket_ref_update) \
+ x(EIO, trigger_pointer) \
+ x(EIO, trigger_stripe_pointer) \
+ x(EIO, metadata_bucket_inconsistency) \
+ x(EIO, mark_stripe) \
+ x(EIO, stripe_reconstruct) \
+ x(EIO, key_type_error) \
+ x(EIO, no_device_to_read_from) \
+ x(EIO, missing_indirect_extent) \
+ x(EIO, invalidate_stripe_to_dev) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
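
These x() entries extend bcachefs's private error-code table: each line defines a BCH_ERR_* constant and records the standard errno class (EIO here) it unwraps to at the syscall boundary, so internal callers can match on the specific cause while userspace still sees -EIO. Roughly, the table expands like this (a sketch; the real enum lives in errcode.h):

	enum bch_errcode {
		BCH_ERR_START = 2048,	/* above the errno range, so no collisions */
	#define x(class, err) BCH_ERR_##err,
		BCH_ERRCODES()
	#undef x
		BCH_ERR_MAX
	};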
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 324303bf4353..cc0d22085aef 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (k.k->type == KEY_TYPE_error)
- return -EIO;
+ return -BCH_ERR_key_type_error;
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
* read:
*/
if (!ret && !p.ptr.cached)
- ret = -EIO;
+ ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
@@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
- if (!p.idx && !ca)
+ if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
- if (!p.idx && !bch2_dev_is_readable(ca))
- p.idx++;
-
- if (p.idx >= (unsigned) p.has_ec + 1)
+ if (p.idx > (unsigned) p.has_ec)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@@ -821,6 +818,18 @@ void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
+ if (k.k->type != KEY_TYPE_stripe) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == ptr->dev && p.has_ec) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+ }
+
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
bch2_bkey_drop_ptr_noerror(k, ptr);
@@ -848,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
-
- if (ptr)
- bch2_bkey_drop_ptr_noerror(k, ptr);
+ bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
@@ -1021,7 +1027,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
{
out->atomic++;
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -1125,8 +1131,9 @@ static int extent_ptr_validate(struct bch_fs *c,
{
int ret = 0;
+ /* bad pointers are repaired by check_fix_ptrs(): */
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
rcu_read_unlock();
return 0;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 42a7c6d820a0..ed5001dd662e 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -357,7 +357,7 @@ out: \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
-#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
__bkey_extent_entry_for_each_from(_iter, _end, _iter) \
if (extent_entry_is_crc(_iter)) { \
@@ -372,7 +372,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
(_iter) = (_start); \
- bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ bkey_crc_next(_k, _end, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
#define bkey_for_each_crc(_k, _p, _crc, _iter) \
@@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
@@ -652,6 +649,23 @@ void bch2_extent_ptr_decoded_append(struct bkey_i *,
void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+
+#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr_noerror(_k, _ptr); \
+ goto _again; \
+ } \
+} while (0)
+
#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
do { \
__label__ _again; \
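
Both drop_ptrs macros restart the scan from _again after each drop because removing a pointer memmoves the remaining entries and invalidates the cursor; the _noerror variant additionally skips the error handling done when the last dirty pointer is dropped. Usage matches the call sites converted above, e.g.:

	/* drop every pointer to device 'dev' from a mutable key n */
	bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev == dev);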
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 508d029ac53d..7e10a9ddcfd9 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
if (ret)
goto err;
@@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- STR_HASH_must_create);
+ STR_HASH_must_create|BTREE_ITER_with_updates);
if (ret)
goto err;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index ff60c041abe5..48a1ab9a649b 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
- u32 snapshot;
int ret = 0;
rbio->c = c;
@@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans,
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
-retry:
bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -189,7 +182,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -200,7 +193,7 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -210,7 +203,7 @@ retry:
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
- break;
+ goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@ -229,17 +222,13 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
-err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
if (ret) {
bch_err_inum_offset_ratelimited(c,
iter.pos.inode,
@@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
+ op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;
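
The retry logic in bchfs_read() moves inside the loop: each iteration opens its own transaction context, and only non-restart errors break out. Stripped to a skeleton (read_one_extent() and advance() are illustrative stand-ins, not real helpers):

	while (1) {
		bch2_trans_begin(trans);

		ret = read_one_extent(trans, &iter);
		if (ret)
			goto err;

		advance(&iter);
		continue;
err:
		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}
	bch2_trans_iter_exit(trans, &iter);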
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index e246b1e05aa2..ee1c0325f313 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_subvol;
+ dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index a9cc5cad9cc9..af3a24546aa3 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
struct folio **fs, unsigned nr_folios)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_folio *s;
u64 offset = folio_sector(fs[0]);
- unsigned folio_idx;
- u32 snapshot;
bool need_set = false;
- int ret;
- for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
if (!s)
return -ENOMEM;
@@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
if (!need_set)
return 0;
- folio_idx = 0;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, ret) {
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
+ unsigned folio_idx = 0;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inum.inum, offset),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
-	return ret;
+ if (folio_idx == nr_folios)
+ break;
+ 0;
+ })));
}
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index fd7d692c087e..fad911cf5068 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
- return folio_has_private(folio)
- ? (struct bch_folio *) folio_get_private(folio)
- : NULL;
+ return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 77b85da30fb2..71d0fa387509 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
- if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
- ret = 1;
- break;
- }
- start = iter.pos;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
+ subvol, 0, k, ({
+ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
+ })));
}
static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* folio
*/
- ret = range_has_data(c, inode->ei_subvol,
+ ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
@@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_subvol, &snapshot);
+ inode->ei_inum.subvol, &snapshot);
if (ret)
goto bkey_err;
@@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot;
u64 sectors = end - start;
- u64 pos = start;
- int ret;
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, pos, snapshot), 0);
-
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
- !(ret = bkey_err(k))) {
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
- bch2_btree_iter_advance(&iter);
- }
- pos = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+ inode->ei_inum.subvol, 0, k, ({
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+
+ 0;
+ })));
return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- POS(inode->v.i_ino, U64_MAX),
- 0, k, ret) {
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ 0;
+ })));
if (ret)
return ret;
@@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_slots, k, ret) {
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ 0;
+ })));
if (ret)
return ret;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 99c7fe987c74..405cf08bda34 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
@@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 011817afc3ad..4a1bb07a2574 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,7 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "%s: inode %u:%llu not found when updating",
+ "%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -152,50 +152,106 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static int bch2_iget5_test(struct inode *vinode, void *p)
+static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
-
- return inode->ei_subvol == inum->subvol &&
- inode->ei_inode.bi_inum == inum->inum;
+ return a.subvol == b.subvol && a.inum == b.inum;
}
-static int bch2_iget5_set(struct inode *vinode, void *p)
+static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ const struct bch_inode_info *inode = obj;
+ const subvol_inum *v = arg->key;
- inode->v.i_ino = inum->inum;
- inode->ei_subvol = inum->subvol;
- inode->ei_inode.bi_inum = inum->inum;
- return 0;
+ return !subvol_inum_eq(inode->ei_inum, *v);
}
-static unsigned bch2_inode_hash(subvol_inum inum)
+static const struct rhashtable_params bch2_vfs_inodes_params = {
+ .head_offset = offsetof(struct bch_inode_info, hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum),
+ .key_len = sizeof(subvol_inum),
+ .obj_cmpfn = bch2_vfs_inode_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static void __wait_on_freeing_inode(struct inode *inode)
{
- return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ wq = bit_waitqueue(&inode->i_state, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
}
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
- return to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
+}
+
+static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
+ subvol_inum inum)
+{
+ struct bch_inode_info *inode;
+repeat:
+ inode = __bch2_inode_hash_find(c, inum);
+ if (inode) {
+ spin_lock(&inode->v.i_lock);
+ if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_unlock(&inode->v.i_lock);
+ return NULL;
+ }
+ if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!trans) {
+ __wait_on_freeing_inode(&inode->v);
+ } else {
+ bch2_trans_unlock(trans);
+ __wait_on_freeing_inode(&inode->v);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ goto repeat;
+ }
+ __iget(&inode->v);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ return inode;
+}
+
+static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ spin_lock(&inode->v.i_lock);
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ spin_unlock(&inode->v.i_lock);
+
+ if (remove) {
+ int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ &inode->hash, bch2_vfs_inodes_params);
+ BUG_ON(ret);
+ inode->v.i_hash.pprev = NULL;
+ }
}
-static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct bch_inode_info *inode)
{
- subvol_inum inum = inode_inum(inode);
- struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
+ struct bch_inode_info *old = inode;
+
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
+retry:
+ if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+ &inode->hash,
+ bch2_vfs_inodes_params))) {
+ old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+ if (!old)
+ goto retry;
+
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- if (unlikely(old != inode)) {
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But
@@ -209,21 +265,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
*/
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
- inode = old;
+ return old;
} else {
+ inode_fake_hash(&inode->v);
+
+ inode_sb_list_add(&inode->v);
+
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
- /*
- * Again, I_NEW makes no sense for bcachefs. This is only needed
- * for clearing I_NEW, but since the inode was already fully
- * created and initialized we didn't actually want
- * inode_insert5() to set it for us.
- */
- unlock_new_inode(&inode->v);
+ return inode;
}
-
- return inode;
}
#define memalloc_flags_do(_flags, _do) \
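
With the move off the generic inode hash, lookups key directly on the full subvol_inum rather than a folded 32-bit hash plus compare callback. A lookup sketch using the rhashtable params defined above (values illustrative):

	subvol_inum inum = { .subvol = 1, .inum = 4096 };

	struct bch_inode_info *inode =
		rhashtable_lookup_fast(&c->vfs_inodes_table, &inum,
				       bch2_vfs_inodes_params);
	if (inode)
		pr_info("cached inode %llu found\n", inode->ei_inum.inum);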
@@ -241,7 +293,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
{
- struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
+ bch2_inode_cache, GFP_NOFS);
if (!inode)
return NULL;
@@ -283,13 +336,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
return inode;
}
+static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ struct bch_inode_info *inode = bch2_new_inode(trans);
+ if (IS_ERR(inode))
+ return inode;
+
+ bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
+
+ return bch2_inode_hash_insert(trans->c, trans, inode);
+}
+
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
if (inode)
return &inode->v;
@@ -300,11 +364,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
int ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
- PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
- if (!ret) {
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
- }
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_trans_put(trans);
return ret ? ERR_PTR(ret) : &inode->v;
@@ -325,6 +385,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
/*
@@ -351,13 +413,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+ kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), current_fsuid()),
- from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ from_kuid(i_user_ns(&dir->v), kuid),
+ from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -365,7 +429,7 @@ retry:
if (unlikely(ret))
goto err_before_quota;
- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
@@ -395,8 +459,16 @@ err_before_quota:
* we must insert the new inode into the inode cache before calling
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
+ *
+ * also, calling bch2_inode_hash_insert() without passing in the
+ * transaction object is sketchy - if we could ever end up in
+ * __wait_on_freeing_inode(), we'd risk deadlock.
+ *
+ * But that shouldn't be possible, since we still have the inode locked
+ * that we just created, and we _really_ can't take a transaction
+ * restart here.
*/
- inode = bch2_inode_insert(c, inode);
+ inode = bch2_inode_hash_insert(c, NULL, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -436,11 +508,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
goto err;
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
if (inode)
goto out;
@@ -448,7 +516,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
- PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
c, "dirent to missing inode:\n %s",
@@ -468,9 +536,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
ret = -ENOENT;
goto err;
}
-
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -557,8 +622,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
return bch2_err_class(ret);
@@ -614,7 +679,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret);
}
@@ -671,15 +736,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
if (mode == BCH_RENAME_OVERWRITE) {
@@ -697,8 +763,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
trans = bch2_trans_get(c);
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
goto err;
@@ -720,18 +786,48 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
+retry:
+ bch2_trans_begin(trans);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode));
+ ret = bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode);
if (unlikely(ret))
+ goto err_tx_restart;
+
+ if (whiteout) {
+ whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+ ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ bch2_inode_init_early(c, whiteout_inode_u);
+
+ ret = bch2_create_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ whiteout_inode_u,
+ &src_dentry->d_name,
+ from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+ S_IFCHR|WHITEOUT_MODE, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+ bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (unlikely(ret)) {
+err_tx_restart:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err;
+ }
BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
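
RENAME_WHITEOUT creates the whiteout inode in the same transaction as the rename itself, so either both commit or neither does. The mode/rdev passed to bch2_create_trans() are the standard VFS whiteout constants:

	/* a whiteout is a 0:0 character device; WHITEOUT_MODE is 0 */
	umode_t mode = S_IFCHR | WHITEOUT_MODE;
	dev_t   rdev = 0;	/* matches WHITEOUT_DEV */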
@@ -779,11 +875,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
+ kuid_t kuid;
+ kgid_t kgid;
- if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
- if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+ }
+ if (ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+ }
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@@ -798,11 +900,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
- ? attr->ia_gid
+ ? kgid
: inode->v.i_gid;
- if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ if (!in_group_or_capable(idmap, &inode->v,
+ make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
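
The setattr/getattr changes route ownership through the idmapped-mount helpers: the vfsuid carried in the attr is first mapped through the mount's idmap into a kuid, and only then into the filesystem's on-disk uid. The two-step conversion, restated:

	kuid_t kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
	u32 fs_uid  = from_kuid(i_user_ns(&inode->v), kuid);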
@@ -818,17 +920,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
- if (attr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (attr->ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
+ }
- if (attr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (attr->ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
+ }
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -884,13 +992,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
- stat->uid = inode->v.i_uid;
- stat->gid = inode->v.i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@@ -899,7 +1009,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
- stat->subvol = inode->ei_subvol;
+ stat->subvol = inode->ei_inum.subvol;
stat->result_mask |= STATX_SUBVOL;
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@@ -941,7 +1051,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
@@ -1034,7 +1144,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bkey_buf cur, prev;
unsigned offset_into_extent, sectors;
bool have_extent = false;
- u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -1050,21 +1159,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
- if (ret)
- goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(ei->v.i_ino, start, snapshot), 0);
+ POS(ei->v.i_ino, start), 0);
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
- !(ret = bkey_err(k))) {
+ while (true) {
enum btree_id data_btree = BTREE_ID_extents;
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(&iter);
@@ -1108,16 +1226,12 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-
- ret = bch2_trans_relock(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
- start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
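/*
 * The rewrite above follows the usual bcachefs transaction-restart idiom;
 * a sketch of the shape, with do_work() standing in for the loop body:
 *
 *	while (true) {
 *		bch2_trans_begin(trans);
 *		ret = do_work(trans);
 *	err:
 *		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
 *			break;
 *	}
 *
 * Restarts loop back and redo the snapshot lookup, so the old retry label
 * and the explicit relock are no longer needed.
 */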
if (!ret && have_extent) {
bch2_trans_unlock(trans);
@@ -1173,7 +1287,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret)
return ret;
}
@@ -1305,8 +1419,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
return (struct bcachefs_fid) {
- .inum = inode->ei_inode.bi_inum,
- .subvol = inode->ei_subvol,
+ .inum = inode->ei_inum.inum,
+ .subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation,
};
}
@@ -1391,7 +1505,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_subvol,
+ inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir,
};
@@ -1427,7 +1541,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret)
goto err;
@@ -1458,8 +1572,7 @@ retry:
if (ret)
goto err;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
} else {
/*
@@ -1480,8 +1593,7 @@ retry:
if (ret)
continue;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
}
}
@@ -1513,12 +1625,15 @@ static const struct export_operations bch_export_ops = {
.get_name = bch2_get_name,
};
-static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+static void bch2_vfs_inode_init(struct btree_trans *trans,
+ subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
- bch2_iget5_set(&inode->v, &inum);
+ inode->v.i_ino = inum.inum;
+ inode->ei_inum = inum;
+ inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
@@ -1530,7 +1645,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
- inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@@ -1597,6 +1711,17 @@ static void bch2_evict_inode(struct inode *vinode)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
+
+ /*
+	 * evict() has waited for outstanding writeback; we'll do no more IO
+	 * through this inode, so it's safe to remove it from the VFS inode
+	 * hashtable here.
+	 *
+	 * Do that now so that other threads aren't blocked from pulling it back
+	 * in; there's no reason for them to be:
+ */
+ if (!delete)
+ bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
@@ -1604,12 +1729,18 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
- if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ if (delete) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode_inum(inode));
+
+ /*
+ * If we are deleting, we need it present in the vfs hash table
+ * so that fsck can check if unlinked inodes are still open:
+ */
+ bch2_inode_hash_remove(c, inode);
}
mutex_lock(&c->vfs_inodes_lock);
@@ -1639,7 +1770,7 @@ again:
mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_subvol))
+ if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue;
if (!(inode->v.i_state & I_DONTCACHE) &&
@@ -1801,30 +1932,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
- int ret = 0;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
+ bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+ OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+ printbuf_nul_terminate(&buf);
+ seq_puts(seq, buf.buf);
- if ((opt->flags & OPT_HIDDEN) ||
- !(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- printbuf_reset(&buf);
- bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
- OPT_SHOW_MOUNT_STYLE);
- seq_putc(seq, ',');
- seq_puts(seq, buf.buf);
- }
-
- if (buf.allocation_failure)
- ret = -ENOMEM;
+ int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@@ -2129,12 +2244,23 @@ static int bch2_init_fs_context(struct fs_context *fc)
return 0;
}
+void bch2_fs_vfs_exit(struct bch_fs *c)
+{
+ if (c->vfs_inodes_table.tbl)
+ rhashtable_destroy(&c->vfs_inodes_table);
+}
+
+int bch2_fs_vfs_init(struct bch_fs *c)
+{
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
+}
+
static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE,
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
@@ -2149,7 +2275,8 @@ int __init bch2_vfs_init(void)
{
int ret = -ENOMEM;
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT);
if (!bch2_inode_cache)
goto err;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 990ec43e0365..da74ecc236e7 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -13,6 +13,9 @@
struct bch_inode_info {
struct inode v;
+ struct rhash_head hash;
+ subvol_inum ei_inum;
+
struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
@@ -24,8 +27,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- u32 ei_subvol;
-
/*
* When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices
@@ -50,10 +51,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
- return (subvol_inum) {
- .subvol = inode->ei_subvol,
- .inum = inode->ei_inode.bi_inum,
- };
+ return inode->ei_inum;
}
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
@@ -69,6 +67,7 @@ struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
* those:
*/
#define EI_INODE_SNAPSHOT 1
+#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@@ -189,6 +188,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+void bch2_fs_vfs_exit(struct bch_fs *);
+int bch2_fs_vfs_init(struct bch_fs *);
+
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -203,6 +205,10 @@ static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, su
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
+
+static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
+static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
+
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 2be6be33afa3..6ac0ff7e074b 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags)
{
int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+ bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
return ret;
}
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 7ee3b75480df..b2f50e74bb76 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
*/
bool promote_full = (failed ||
*read_full ||
- READ_ONCE(c->promote_whole_extents));
+ READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
orig_k->k->k.size,
reflink_offset);
bch2_inconsistent_error(trans->c);
- ret = -EIO;
+ ret = -BCH_ERR_missing_indirect_extent;
goto err;
}
@@ -869,9 +869,15 @@ retry_pick:
goto hole;
if (pick_ret < 0) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
bch_err_inum_offset_ratelimited(c,
read_pos.inode, read_pos.offset << 9,
- "no device to read from");
+ "no device to read from: %s\n %s",
+ bch2_err_str(pick_ret),
+ buf.buf);
+ printbuf_exit(&buf);
goto err;
}
@@ -1086,7 +1092,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1214,10 +1220,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
- goto err;
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 1d4761d15002..d3b5be7fd9bf 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1447,9 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
+ &op->cl, &wp));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7664b68e6a15..30460bce04be 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
+ .e.nr_devs = 0,
.e.nr_required = 1,
};
@@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;
darray_for_each(i->ptrs, ptr)
- replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
@@ -1950,7 +1951,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
if (error ||
w->noflush ||
(!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ time_before(jiffies, j->last_flush_write +
+ msecs_to_jiffies(c->opts.journal_flush_delay)) &&
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
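
/*
 * time_before() is the wraparound-safe jiffies comparison from
 * <linux/jiffies.h>, roughly:
 *
 *	#define time_before(a, b)	((long)((a) - (b)) < 0)
 *
 * so the flush-delay check stays correct when jiffies overflows.
 */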
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 70b998d9f19c..ace291f175dd 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
- if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- atomic_read(&c->btree_cache.dirty),
- c->btree_cache.used,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index e10fc1da71b1..232be8a44051 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = U64_MAX, \
.choices = _choices
+#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c,
*res = ret;
break;
+ case BCH_OPT_BITFIELD: {
+ s64 v = bch2_read_flag_list(val, opt->choices);
+ if (v < 0)
+ return v;
+ *res = v;
+ break;
+ }
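+	/*
+	 * Illustrative only: BCH_OPT_BITFIELD turns a comma-separated name
+	 * list into a bitmask via bch2_read_flag_list(); for data_allowed
+	 * (choices == __bch2_data_types), "journal,btree" would parse to
+	 * BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree), and an unknown name
+	 * yields a negative error.
+	 */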
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);
@@ -423,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out,
else
prt_str(out, opt->choices[v]);
break;
+ case BCH_OPT_BITFIELD:
+ prt_bitflags(out, opt->choices, v);
+ break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@@ -431,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}
+void bch2_opts_to_text(struct printbuf *out,
+ struct bch_opts opts,
+ struct bch_fs *c, struct bch_sb *sb,
+ unsigned show_mask, unsigned hide_mask,
+ unsigned flags)
+{
+ bool first = true;
+
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+
+ if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
+ continue;
+
+ u64 v = bch2_opt_get_by_id(&opts, i);
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+
+ bch2_opt_to_text(out, c, sb, opt, v, flags);
+ }
+}
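
/*
 * Usage sketch, mirroring the bch2_show_options() rewrite above: print
 * every mount option that differs from its default, comma-separated,
 * skipping hidden options:
 *
 *	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
 *			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
 */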
+
int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;
@@ -608,10 +646,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
return 0;
}
-void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+struct bch_dev_sb_opt_set {
+ void (*set_sb)(struct bch_member *, u64);
+};
+
+static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
+#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
+ BCH_DEV_OPT_SETTERS()
+#undef x
+};
+
+void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
+ enum bch_opt_id id = opt - bch2_opt_table;
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -619,16 +667,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = ilog2(v);
- opt->set_sb(sb, v);
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ v++;
+
+ if (opt->flags & OPT_FS) {
+ if (opt->set_sb != SET_BCH2_NO_SB_OPT)
+ opt->set_sb(sb, v);
+ }
+
+ if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return;
+
+ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
+
+ const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
+ if (set->set_sb)
+ set->set_sb(m, v);
+ else
+ pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ }
}
-void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
-
mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index cda1725702ea..cb2e244a2429 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -53,23 +53,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
- OPT_FS = (1 << 0), /* Filesystem option */
- OPT_DEVICE = (1 << 1), /* Device option */
- OPT_INODE = (1 << 2), /* Inode option */
- OPT_FORMAT = (1 << 3), /* May be specified at format time */
- OPT_MOUNT = (1 << 4), /* May be specified at mount time */
- OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = (1 << 6),
- OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
- OPT_HIDDEN = (1 << 10),
+ OPT_FS = BIT(0), /* Filesystem option */
+ OPT_DEVICE = BIT(1), /* Device option */
+ OPT_INODE = BIT(2), /* Inode option */
+ OPT_FORMAT = BIT(3), /* May be specified at format time */
+ OPT_MOUNT = BIT(4), /* May be specified at mount time */
+ OPT_RUNTIME = BIT(5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = BIT(6),
+ OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
+ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
+ OPT_HIDDEN = BIT(11),
};
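
/*
 * Assumed round trip for OPT_SB_FIELD_ONE_BIAS (see durability below):
 * the superblock stores v + 1 so a raw field of 0 can mean "unset, use
 * the default":
 *
 *	set: sb field = durability + 1
 *	get: durability = field ? field - 1 : 1
 */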
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
+ BCH_OPT_BITFIELD,
BCH_OPT_FN,
};
@@ -263,6 +265,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
+ x(promote_whole_extents, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
+ NULL, "Promote whole extents, instead of just part being read")\
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
@@ -366,6 +373,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_passes, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to run explicitly") \
+ x(recovery_passes_exclude, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to exclude") \
x(recovery_pass_last, u8, \
OPT_FS|OPT_MOUNT, \
OPT_STR_NOLIMIT(bch2_recovery_passes), \
@@ -472,11 +489,16 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
- OPT_DEVICE, \
+ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
+ x(data_allowed, u8, \
+ OPT_DEVICE, \
+ OPT_BITFIELD(__bch2_data_types), \
+ BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ "types", "Allowed data types for this device: journal, btree, and/or user")\
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -484,6 +506,11 @@ enum fsck_err_opts {
NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
+#define BCH_DEV_OPT_SETTERS() \
+ x(discard, BCH_MEMBER_DISCARD) \
+ x(durability, BCH_MEMBER_DURABILITY) \
+ x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
+
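/*
 * With the x() definition in opts.c above, this table expands
 * bch2_dev_sb_opt_setters[] to:
 *
 *	[Opt_discard]      = { .set_sb = SET_BCH_MEMBER_DISCARD },
 *	[Opt_durability]   = { .set_sb = SET_BCH_MEMBER_DURABILITY },
 *	[Opt_data_allowed] = { .set_sb = SET_BCH_MEMBER_DATA_ALLOWED },
 */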
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -563,8 +590,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
-void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+
+struct bch_dev;
+void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
@@ -576,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
+void bch2_opts_to_text(struct printbuf *,
+ struct bch_opts,
+ struct bch_fs *, struct bch_sb *,
+ unsigned, unsigned, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
new file mode 100644
index 000000000000..40a20192eee8
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+
+#include <linux/generic-radix-tree.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/vmalloc.h>
+
+#include "rcu_pending.h"
+#include "darray.h"
+#include "util.h"
+
+#define static_array_for_each(_a, _i) \
+ for (typeof(&(_a)[0]) _i = _a; \
+ _i < (_a) + ARRAY_SIZE(_a); \
+ _i++)
+
+enum rcu_pending_special {
+ RCU_PENDING_KVFREE = 1,
+ RCU_PENDING_CALL_RCU = 2,
+};
+
+#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
+#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
+
+static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? get_state_synchronize_srcu(ssp)
+ : get_state_synchronize_rcu();
+}
+
+static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? start_poll_synchronize_srcu(ssp)
+ : start_poll_synchronize_rcu();
+}
+
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return ssp
+ ? poll_state_synchronize_srcu(ssp, cookie)
+ : poll_state_synchronize_rcu(cookie);
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+ return ssp
+ ? srcu_barrier(ssp)
+ : rcu_barrier();
+}
+
+static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ if (ssp)
+ call_srcu(ssp, rhp, func);
+ else
+ call_rcu(rhp, func);
+}
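
/*
 * These NULL-dispatch wrappers let one rcu_pending implementation run
 * against either plain RCU (ssp == NULL) or a caller-supplied
 * srcu_struct, chosen once at rcu_pending_init() time.
 */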
+
+struct rcu_pending_seq {
+ /*
+ * We're using a radix tree like a vector - we're just pushing elements
+ * onto the end; we're using a radix tree instead of an actual vector to
+ * avoid reallocation overhead
+ */
+ GENRADIX(struct rcu_head *) objs;
+ size_t nr;
+ struct rcu_head **cursor;
+ unsigned long seq;
+};
+
+struct rcu_pending_list {
+ struct rcu_head *head;
+ struct rcu_head *tail;
+ unsigned long seq;
+};
+
+struct rcu_pending_pcpu {
+ struct rcu_pending *parent;
+ spinlock_t lock;
+ int cpu;
+
+ /*
+ * We can't bound the number of unprocessed gp sequence numbers, and we
+ * can't efficiently merge radix trees for expired grace periods, so we
+ * need darray/vector:
+ */
+ DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
+
+ /* Third entry is for expired objects: */
+ struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
+
+ struct rcu_head cb;
+ bool cb_armed;
+ struct work_struct work;
+};
+
+static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
+{
+ if (p->objs.nr)
+ return true;
+
+ static_array_for_each(p->lists, i)
+ if (i->head)
+ return true;
+
+ return false;
+}
+
+static void rcu_pending_list_merge(struct rcu_pending_list *l1,
+ struct rcu_pending_list *l2)
+{
+#ifdef __KERNEL__
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next = l2->head;
+#else
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next.next = (void *) l2->head;
+#endif
+
+ l1->tail = l2->tail;
+ l2->head = l2->tail = NULL;
+}
+
+static void rcu_pending_list_add(struct rcu_pending_list *l,
+ struct rcu_head *n)
+{
+#ifdef __KERNEL__
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next = n;
+ l->tail = n;
+ n->next = NULL;
+#else
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next.next = (void *) n;
+ l->tail = n;
+ n->next.next = NULL;
+#endif
+}
+
+static void merge_expired_lists(struct rcu_pending_pcpu *p)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ for (struct rcu_pending_list *i = p->lists; i < expired; i++)
+ if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
+ rcu_pending_list_merge(expired, i);
+}
+
+#ifndef __KERNEL__
+static inline void kfree_bulk(size_t nr, void ** p)
+{
+ while (nr--)
+ kfree(*p);
+}
+
+#define local_irq_save(flags) \
+do { \
+ flags = 0; \
+} while (0)
+#endif
+
+static noinline void __process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ struct rcu_pending_seq objs = {};
+ struct rcu_head *list = NULL;
+
+ if (p->objs.nr &&
+ __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
+ objs = p->objs.data[0];
+ darray_remove_item(&p->objs, p->objs.data);
+ }
+
+ merge_expired_lists(p);
+
+ list = expired->head;
+ expired->head = expired->tail = NULL;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ for (size_t i = 0; i < objs.nr; ) {
+ size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
+
+ kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
+ i += nr_this_node;
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+
+ /*
+ * low bit of pointer indicates whether rcu_head needs
+ * to be freed - kvfree_rcu_mightsleep()
+ */
+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
+
+ void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
+ bool free_head = ((unsigned long) obj->func) & 1UL;
+
+ kvfree(ptr);
+ if (free_head)
+ kfree(obj);
+ }
+
+ break;
+
+ case RCU_PENDING_CALL_RCU:
+ for (size_t i = 0; i < objs.nr; i++) {
+ struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
+ obj->func(obj);
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ obj->func(obj);
+ }
+ break;
+
+ default:
+ for (size_t i = 0; i < objs.nr; i++)
+ pending->process(pending, *genradix_ptr(&objs.objs, i));
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ pending->process(pending, obj);
+ }
+ break;
+ }
+}
+
+static bool process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ /*
+ * XXX: we should grab the gp seq once and avoid multiple function
+	 * calls; this is called from the __rcu_pending_enqueue() fastpath in
+ * may_sleep==true mode
+ */
+ if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
+ (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
+ (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
+ p->lists[2].head) {
+ __process_finished_items(pending, p, flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void rcu_pending_work(struct work_struct *work)
+{
+ struct rcu_pending_pcpu *p =
+ container_of(work, struct rcu_pending_pcpu, work);
+ struct rcu_pending *pending = p->parent;
+ unsigned long flags;
+
+ do {
+ spin_lock_irqsave(&p->lock, flags);
+ } while (process_finished_items(pending, p, flags));
+
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void rcu_pending_rcu_cb(struct rcu_head *rcu)
+{
+ struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
+
+ schedule_work_on(p->cpu, &p->work);
+
+ unsigned long flags;
+ spin_lock_irqsave(&p->lock, flags);
+ if (__rcu_pending_has_pending(p)) {
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ p->cb_armed = false;
+ spin_unlock_irqrestore(&p->lock, flags);
+ }
+}
+
+static __always_inline struct rcu_pending_seq *
+get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
+{
+ darray_for_each_reverse(p->objs, objs)
+ if (objs->seq == seq)
+ return objs;
+
+ if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
+ return NULL;
+
+ return &darray_last(p->objs);
+}
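
/*
 * Reverse search is presumably because new sequence numbers are pushed
 * onto the end and most enqueues target the current grace period, so the
 * match is usually the last element.
 */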
+
+static noinline bool
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
+ struct rcu_head *head, void *ptr,
+ unsigned long *flags)
+{
+ if (ptr) {
+ if (!head) {
+ /*
+ * kvfree_rcu_mightsleep(): we weren't passed an
+ * rcu_head, but we need one: use the low bit of the
+				 * pointer to free to flag that the head needs to be
+ * freed as well:
+ */
+ ptr = (void *)(((unsigned long) ptr)|1UL);
+ head = kmalloc(sizeof(*head), __GFP_NOWARN);
+ if (!head) {
+ spin_unlock_irqrestore(&p->lock, *flags);
+ head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
+ /*
+ * dropped lock, did GFP_KERNEL allocation,
+ * check for gp expiration
+ */
+ if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
+ kvfree(--ptr);
+ kfree(head);
+ spin_lock_irqsave(&p->lock, *flags);
+ return false;
+ }
+ }
+ }
+
+ head->func = ptr;
+ }
+again:
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (i->seq == seq) {
+ rcu_pending_list_add(i, head);
+ return false;
+ }
+ }
+
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (!i->head) {
+ i->seq = seq;
+ rcu_pending_list_add(i, head);
+ return true;
+ }
+ }
+
+ merge_expired_lists(p);
+ goto again;
+}
+
+/*
+ * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
+ * pending->process) once a grace period elapses.
+ *
+ * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
+ * back to a linked list.
+ *
+ * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
+ * process callback
+ *
+ * - If @ptr and @head are both not NULL, we're in kvfree_rcu() mode
+ *
+ * - If @ptr is not NULL and @head is, we're in kvfree_rcu_mightsleep() mode
+ *
+ * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
+ * expired items.
+ */
+static __always_inline void
+__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
+ void *ptr, bool may_sleep)
+{
+ struct rcu_pending_pcpu *p;
+ struct rcu_pending_seq *objs;
+ struct genradix_node *new_node = NULL;
+ unsigned long seq, flags;
+ bool start_gp = false;
+
+ BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ seq = __get_state_synchronize_rcu(pending->srcu);
+restart:
+ if (may_sleep &&
+ unlikely(process_finished_items(pending, p, flags)))
+ goto check_expired;
+
+ /*
+ * In kvfree_rcu() mode, the radix tree is only for slab pointers so
+ * that we can do kfree_bulk() - vmalloc pointers always use the linked
+ * list:
+ */
+ if (ptr && unlikely(is_vmalloc_addr(ptr)))
+ goto list_add;
+
+ objs = get_object_radix(p, seq);
+ if (unlikely(!objs))
+ goto list_add;
+
+ if (unlikely(!objs->cursor)) {
+ /*
+ * New radix tree nodes must be added under @p->lock because the
+ * tree root is in a darray that can be resized (typically,
+ * genradix supports concurrent unlocked allocation of new
+ * nodes) - hence preallocation and the retry loop:
+ */
+ objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
+ objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
+ if (unlikely(!objs->cursor)) {
+ if (may_sleep) {
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ gfp_t gfp = GFP_KERNEL;
+ if (!head)
+ gfp |= __GFP_NOFAIL;
+
+ new_node = genradix_alloc_node(gfp);
+ if (!new_node)
+ may_sleep = false;
+ goto check_expired;
+ }
+list_add:
+ start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
+ goto start_gp;
+ }
+ }
+
+ *objs->cursor++ = ptr ?: head;
+ /* zero cursor if we hit the end of a radix tree node: */
+ if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
+ objs->cursor = NULL;
+ start_gp = !objs->nr;
+ objs->nr++;
+start_gp:
+ if (unlikely(start_gp)) {
+ /*
+ * We only have one callback (ideally, we would have one for
+		 * every outstanding grace period) - so if our callback is
+ * already in flight, we may still have to start a grace period
+ * (since we used get_state() above, not start_poll())
+ */
+ if (!p->cb_armed) {
+ p->cb_armed = true;
+ __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ __start_poll_synchronize_rcu(pending->srcu);
+ }
+ }
+ spin_unlock_irqrestore(&p->lock, flags);
+free_node:
+ if (new_node)
+ genradix_free_node(new_node);
+ return;
+check_expired:
+ if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ kvfree(ptr);
+ break;
+ case RCU_PENDING_CALL_RCU:
+ head->func(head);
+ break;
+ default:
+ pending->process(pending, head);
+ break;
+ }
+ goto free_node;
+ }
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ goto restart;
+}
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
+{
+ __rcu_pending_enqueue(pending, obj, NULL, true);
+}
+
+static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
+{
+ struct rcu_head *ret = NULL;
+
+ spin_lock_irq(&p->lock);
+ darray_for_each(p->objs, objs)
+ if (objs->nr) {
+ ret = *genradix_ptr(&objs->objs, --objs->nr);
+ objs->cursor = NULL;
+ if (!objs->nr)
+ genradix_free(&objs->objs);
+ goto out;
+ }
+
+ static_array_for_each(p->lists, i)
+ if (i->head) {
+ ret = i->head;
+#ifdef __KERNEL__
+ i->head = ret->next;
+#else
+ i->head = (void *) ret->next.next;
+#endif
+ if (!i->head)
+ i->tail = NULL;
+ goto out;
+ }
+out:
+ spin_unlock_irq(&p->lock);
+
+ return ret;
+}
+
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
+{
+ return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
+{
+ struct rcu_head *ret = rcu_pending_dequeue(pending);
+
+ if (ret)
+ return ret;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ spin_lock_irq(&p->lock);
+ if (__rcu_pending_has_pending(p) || p->cb_armed) {
+ spin_unlock_irq(&p->lock);
+ return true;
+ }
+ spin_unlock_irq(&p->lock);
+ }
+
+ return false;
+}
+
+void rcu_pending_exit(struct rcu_pending *pending)
+{
+ int cpu;
+
+ if (!pending->p)
+ return;
+
+ while (rcu_pending_has_pending_or_armed(pending)) {
+ __rcu_barrier(pending->srcu);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+ static_array_for_each(p->lists, i)
+ WARN_ON(i->head);
+ WARN_ON(p->objs.nr);
+ darray_exit(&p->objs);
+ }
+ free_percpu(pending->p);
+}
+
+/**
+ * rcu_pending_init - initialize a rcu_pending
+ *
+ * @pending: Object to init
+ * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
+ * RCU flavor
+ * @process:	Callback function invoked on objects once their RCU grace
+ *		period has elapsed; the kvfree mode is selected internally,
+ *		not by passing NULL
+ */
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process)
+{
+ pending->p = alloc_percpu(struct rcu_pending_pcpu);
+ if (!pending->p)
+ return -ENOMEM;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ p->parent = pending;
+ p->cpu = cpu;
+ spin_lock_init(&p->lock);
+ darray_init(&p->objs);
+ INIT_WORK(&p->work, rcu_pending_work);
+ }
+
+ pending->srcu = srcu;
+ pending->process = process;
+
+ return 0;
+}
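
/*
 * A usage sketch; the names here are hypothetical, not from the patch:
 *
 *	struct my_obj { long data; struct rcu_head rcu; };
 *
 *	static void my_process(struct rcu_pending *pending, struct rcu_head *rh)
 *	{
 *		kfree(container_of(rh, struct my_obj, rcu));
 *	}
 *
 *	rcu_pending_init(&pending, NULL, my_process);	// plain RCU flavor
 *	rcu_pending_enqueue(&pending, &obj->rcu);	// processed after a GP
 *	...
 *	rcu_pending_exit(&pending);			// flushes all pending work
 */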
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
new file mode 100644
index 000000000000..71a2f4ddaade
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RCU_PENDING_H
+#define _LINUX_RCU_PENDING_H
+
+#include <linux/rcupdate.h>
+
+struct rcu_pending;
+typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
+
+struct rcu_pending_pcpu;
+
+struct rcu_pending {
+ struct rcu_pending_pcpu __percpu *p;
+ struct srcu_struct *srcu;
+ rcu_pending_process_fn process;
+};
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
+
+void rcu_pending_exit(struct rcu_pending *pending);
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process);
+
+#endif /* _LINUX_RCU_PENDING_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index cf81e5128c3a..2d299a37cf07 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
+#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
/*
@@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 36de1c6fe8c3..be1e7ca4362f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
@@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
- } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}
ret = 0;
@@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 73339a0a3111..735b8adc8f9d 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v)
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass)
{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return 0;
bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
- c->recovery_passes_explicit |= BIT_ULL(pass);
+ c->opts.recovery_passes |= BIT_ULL(pass);
if (c->curr_recovery_pass >= pass) {
c->curr_recovery_pass = pass;
@@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
+ return false;
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
return true;
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 1f34c92a6d11..998c0bd06802 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -123,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
- r->devs[r->nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@@ -140,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
- r->devs[r->nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@@ -181,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
- e->devs[e->nr_devs++] = *i;
+ replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}
@@ -795,12 +795,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
for (unsigned i = 0; i < e->nr_devs; i++) {
nr_online += test_bit(e->devs[i], devs.d);
- struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();
- if (nr_failed == e->nr_devs)
+ if (nr_online + nr_failed == e->nr_devs)
continue;
if (nr_online < e->nr_required)
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
index b97208195d06..b7eff904acdb 100644
--- a/fs/bcachefs/replicas_format.h
+++ b/fs/bcachefs/replicas_format.h
@@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas_v0 {
@@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas {
@@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+#define replicas_entry_add_dev(e, d) ({ \
+ (e)->nr_devs++; \
+ (e)->devs[(e)->nr_devs - 1] = (d); \
+})
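+
+/*
+ * Note the ordering: with devs[] annotated __counted_by(nr_devs) above,
+ * array accesses are bounds-checked against the current counter value, so
+ * nr_devs must be bumped before devs[nr_devs - 1] is written.
+ */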
+
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index c57d42bb8d1b..025848a9c4c0 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -155,7 +155,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
mutex_unlock(&c->sb_lock);
- return NULL;
+ return ERR_PTR(-BCH_ERR_invalid_sb_clean);
}
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 4b765422dd77..02bcde3c1b02 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -465,3 +465,60 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
}
}
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index dd93192ec065..762083b564ee 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
lockdep_is_held(&c->state_lock));
}
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
{
return c && dev < c->sb.nr_devices
? rcu_dereference(c->devs[dev])
: NULL;
}
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
rcu_read_unlock();
return ca;
}
-void bch2_dev_missing(struct bch_fs *, unsigned);
-
static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
+ if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
@@ -307,6 +315,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
return false;
}
+unsigned bch2_sb_nr_devices(const struct bch_sb *);
+
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
@@ -352,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+int bch2_sb_member_alloc(struct bch_fs *);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index c8c266cb5797..215eed4cce6d 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a8299ba2cab2..e62f876541fe 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
+static inline struct bkey_s_c
+bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
+ u32 subvolid, unsigned flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+ return bch2_btree_iter_peek_upto_type(iter, end, flags);
+}
+
+#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \
+ _end, _subvolid, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \
+ _start, _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do); \
+})
+
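/*
 * A usage sketch (argument values illustrative only): walk one
 * subvolume's dirents, transparently handling transaction restarts:
 *
 *	ret = for_each_btree_key_in_subvolume_upto(trans, iter,
 *			BTREE_ID_dirents,
 *			POS(inum.inum, 0), POS(inum.inum, U64_MAX),
 *			inum.subvol, 0, k, ({
 *		0;	// _do runs per key; a nonzero value stops the walk
 *	}));
 */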
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 9b10c8947828..f2ec4277c2a5 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -30,7 +30,8 @@ struct snapshot_table {
};
typedef struct {
- u32 subvol;
+ /* we can't have padding in this struct: */
+ u64 subvol;
u64 inum;
} subvol_inum;
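
/*
 * Context: ei_inum is now an rhashtable key hashed bytewise over
 * sizeof(subvol_inum) (see fs.h above), so padding bytes would feed
 * uninitialized data into the hash and break lookups. A sketch of the
 * assumed table parameters:
 *
 *	static const struct rhashtable_params bch2_vfs_inodes_params = {
 *		.head_offset	= offsetof(struct bch_inode_info, hash),
 *		.key_offset	= offsetof(struct bch_inode_info, ei_inum),
 *		.key_len	= sizeof(subvol_inum),
 *		.automatic_shrinking = true,
 *	};
 */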
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index c8c2ccbdfbb5..d86d5dae54c9 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
!BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
+ SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
@@ -1292,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- u64 fields_have = 0;
- unsigned nr_devices = 0;
-
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
- for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_member_exists(sb, i);
-
prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
@@ -1356,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_newline(out);
prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", nr_devices);
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
prt_printf(out, "Sections:\t");
+ u64 fields_have = 0;
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
prt_bitflags(out, bch2_sb_fields, fields_have);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e7fa2de35014..873e4be7e1dc 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
@@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c);
@@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
- c->promote_whole_extents = true;
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
@@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
+ bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c);
@@ -1591,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
-static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(c, ret, "removing dev alloc info");
- return ret;
-}
-
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@@ -1729,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members_v2 *mi;
- struct bch_member dev_mi;
- unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@@ -1741,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+ struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@@ -1779,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
if (dynamic_fault("bcachefs:add:no_slot"))
- goto no_slot;
-
- if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
- dev_idx = c->sb.nr_devices;
- goto have_slot;
- }
-
- int best = -1;
- u64 best_last_mount = 0;
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-no_slot:
- ret = -BCH_ERR_ENOSPC_sb_members;
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+ goto err_unlock;
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi) {
- ret = -BCH_ERR_ENOSPC_sb_members;
+ ret = bch2_sb_member_alloc(c);
+ if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ unsigned dev_idx = ret;
/* success: */
- *m = dev_mi;
- m->last_mount = cpu_to_le64(ktime_get_real_seconds());
- c->disk_sb.sb->nr_devices = nr_devices;
+ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
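The slot-selection policy deleted above (take the next index while nr_devices < BCH_SB_MEMBERS_MAX, otherwise recycle the dead member with the oldest last_mount) moves behind bch2_sb_member_alloc(), which per this call site returns the allocated index or a negative error. A sketch of that policy, reconstructed from the removed lines; the real helper is in sb-members.c and also handles resizing the members_v2 section:

	static int member_slot_alloc_sketch(struct bch_fs *c)
	{
		if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX)
			return c->sb.nr_devices;

		int best = -1;
		u64 best_last_mount = 0;

		for (unsigned i = 0; i < BCH_SB_MEMBERS_MAX; i++) {
			struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);

			if (bch2_member_alive(&m))
				continue;

			u64 last_mount = le64_to_cpu(m.last_mount);

			if (best < 0 || last_mount < best_last_mount) {
				best = i;
				best_last_mount = last_mount;
			}
		}

		return best >= 0 ? best : -BCH_ERR_ENOSPC_sb_members;
	}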
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 33f2a64c14c9..03e59f86f360 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -219,7 +219,6 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
-rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -234,7 +233,7 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0444 };
+ { .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
@@ -245,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
-
- mutex_unlock(&c->btree_cache.lock);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
@@ -288,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_tab_rjust(out);
prt_human_readable_u64(out, nr_extents
- ? div_u64(sectors_uncompressed << 9, nr_extents)
+ ? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
@@ -347,8 +350,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
- sysfs_print(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (attr == &sysfs_journal_debug)
@@ -436,8 +437,6 @@ STORE(bch2_fs)
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
- sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (!test_bit(BCH_FS_started, &c->flags))
@@ -449,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {
@@ -514,7 +514,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_cache_size,
&sysfs_btree_write_stats,
- &sysfs_promote_whole_extents,
+ &sysfs_rebalance_status,
&sysfs_compression_stats,
@@ -614,7 +614,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -674,7 +673,7 @@ STORE(bch2_fs_opts_dir)
if (ret < 0)
goto err;
- bch2_opt_set_sb(c, opt, v);
+ bch2_opt_set_sb(c, NULL, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if (v &&
@@ -728,6 +727,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@@ -821,32 +827,17 @@ STORE(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct bch_member *mi;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v != BCH_MEMBER_DISCARD(mi)) {
- SET_BCH_MEMBER_DISCARD(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
}
if (attr == &sysfs_durability) {
u64 v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
- SET_BCH_MEMBER_DURABILITY(mi, v + 1);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}
if (attr == &sysfs_label) {
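Two related changes in this file work together: time-stat attributes become mode 0644, and the previously empty STORE(bch2_fs_time_stats) now resets whichever stat was written to, via the BCH_TIME_STATS() x-macro. Taking journal_flush_write and journal_noflush_write (both visible in the super.c hunk earlier) as example list entries, the #define x(name)/#undef x pair expands to a chain of comparisons:

	/* expansion inside STORE(bch2_fs_time_stats), illustrated with two
	 * list entries known to exist: */
	if (attr == &sysfs_time_stat_journal_flush_write)
		bch2_time_stats_reset(&c->times[BCH_TIME_journal_flush_write]);
	if (attr == &sysfs_time_stat_journal_noflush_write)
		bch2_time_stats_reset(&c->times[BCH_TIME_journal_noflush_write]);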
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 0807ce9b171a..fb3442a7c67f 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -387,7 +387,7 @@ again:
seen = buf->buf.nr;
char *n = memchr(buf->buf.data, '\n', seen);
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
spin_unlock(&buf->lock);
return -ETIME;
}
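The fix above swaps a raw `jiffies >= until` for time_after_eq(), which stays correct across jiffies wraparound by comparing through signed subtraction. A userspace model of the kernel macro (as in include/linux/jiffies.h, minus the typecheck):

	#include <limits.h>
	#include <stdio.h>

	#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)

	int main(void)
	{
		unsigned long until = ULONG_MAX - 5;	/* deadline just before wrap */
		unsigned long now = 0;			/* "jiffies" wrapped past it */

		printf("raw compare:   %d\n", now >= until);		   /* 0: misses it */
		printf("time_after_eq: %d\n", time_after_eq(now, until)); /* 1: correct */
		return 0;
	}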
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
index 4508e9dcbee2..3fe82757f93a 100644
--- a/fs/bcachefs/time_stats.c
+++ b/fs/bcachefs/time_stats.c
@@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
}
}
+void bch2_time_stats_reset(struct bch2_time_stats *stats)
+{
+ spin_lock_irq(&stats->lock);
+ unsigned offset = offsetof(struct bch2_time_stats, min_duration);
+ memset((void *) stats + offset, 0, sizeof(*stats) - offset);
+
+ if (stats->buffer) {
+ int cpu;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(stats->buffer, cpu)->nr = 0;
+ }
+ spin_unlock_irq(&stats->lock);
+}
+
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
index 5df61403744b..dc6493f7bbab 100644
--- a/fs/bcachefs/time_stats.h
+++ b/fs/bcachefs/time_stats.h
@@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
+ struct time_stat_buffer __percpu *buffer;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@@ -87,7 +88,6 @@ struct bch2_time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
- struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
@@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
return false;
}
+void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
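The field move above is what makes bch2_time_stats_reset() safe: the reset zeroes everything from min_duration to the end of the struct with one memset, so the percpu buffer pointer has to sit before min_duration or it would be wiped (it is cleared per-CPU instead, not freed). The general idiom, as a standalone sketch:

	#include <stddef.h>
	#include <string.h>

	struct counters {
		void *buffer;		/* preserved across reset, handled separately */
		unsigned long min;	/* everything from here on is statistics... */
		unsigned long max;
		unsigned long total;	/* ...and may be zeroed wholesale */
	};

	static void counters_reset(struct counters *c)
	{
		size_t off = offsetof(struct counters, min);

		memset((char *) c + off, 0, sizeof(*c) - off);
	}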
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c62f00322d1e..5597b9d6297f 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -3,7 +3,6 @@
#define TRACE_SYSTEM bcachefs
#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_BCACHEFS_H
#include <linux/tracepoint.h>
@@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__array(char, node, 24 )
__field(u8, self_read_count )
@@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail,
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
- __entry->level = path->level;
+ __entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
@@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail,
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
}
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
@@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__field(u8, locked )
__field(u8, self_read_count )
@@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
@@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast,
TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);
+#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
+
+TRACE_EVENT(update_by_path,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite),
+ TP_ARGS(trans, path, i, overwrite),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, path_idx )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u8, overwrite )
+ __field(btree_path_idx_t, update_idx )
+ __field(btree_path_idx_t, nr_updates )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->path_idx = path - trans->paths;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->overwrite = overwrite;
+ __entry->update_idx = i - trans->updates;
+ __entry->nr_updates = trans->nr_updates;
+ ),
+
+ TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
+ __entry->trans_fn,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->overwrite,
+ __entry->update_idx,
+ __entry->nr_updates)
+);
+
+TRACE_EVENT(btree_path_lock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_bkey_cached_common *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __array(char, node, 24 )
+ __field(u32, lock_seq )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = b->btree_id;
+ __entry->level = b->level;
+
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->lock_seq = six_lock_seq(&b->lock);
+ ),
+
+ TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->level,
+ __entry->node,
+ __entry->lock_seq)
+);
+
+DECLARE_EVENT_CLASS(btree_path_ev,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(u16, idx )
+ __field(u8, ref )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
+ __entry->idx, __entry->ref,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_alloc,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
+ __entry->idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+TRACE_EVENT(btree_path_get,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, new_idx )
+ __field(u8, btree_id )
+ __field(u8, ref )
+ __field(u8, preserve )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->new_idx = new - trans->paths;
+ __entry->btree_id = path->btree_id;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->new_idx)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DECLARE_EVENT_CLASS(btree_path_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, should_be_locked )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->should_be_locked = path->should_be_locked;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ __entry->locks_want = path->locks_want;
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
+ ),
+
+ TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->trans_fn,
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locks_want,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_set_pos,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+
+ __entry->locks_want = path->locks_want;
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
+ ),
+
+ TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+TRACE_EVENT(btree_path_free,
+ TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
+ TP_ARGS(trans, path, dup),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, preserve )
+ __field(u8, should_be_locked)
+ __field(s8, dup )
+ __field(u8, dup_locked )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ __entry->preserve = trans->paths[path].preserve;
+ __entry->should_be_locked = trans->paths[path].should_be_locked;
+ __entry->dup = dup ? dup - trans->paths : -1;
+ __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
+ ),
+
+ TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
+ __entry->preserve ? 'P' : ' ',
+ __entry->should_be_locked ? 'S' : ' ',
+ __entry->dup,
+ __entry->dup_locked)
+);
+
+TRACE_EVENT(btree_path_free_trans_begin,
+ TP_PROTO(btree_path_idx_t path),
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ ),
+
+ TP_printk(" path %3u", __entry->idx)
+);
+
+#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+#ifndef _TRACE_BCACHEFS_H
+
+static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite) {}
+static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
+static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
+static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
+
+#endif
+#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+
+#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
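The two guard changes in this file work together: tracepoint headers are intentionally re-readable (define_trace.h sets TRACE_HEADER_MULTI_READ and includes them a second time), so `#define _TRACE_BCACHEFS_H` moves from the top of the guard to the bottom. On the first read, the no-op trace_*() stubs for the !CONFIG_BCACHEFS_PATH_TRACEPOINTS case are emitted, and only then is the guard macro defined; on the re-read, the inner `#ifndef _TRACE_BCACHEFS_H` keeps the static inlines from being redefined while the TRACE_EVENT() machinery is processed again. Condensed skeleton of the dance:

	#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)

	/* TRACE_EVENT()s: safe to expand on every read */

	#ifndef _TRACE_BCACHEFS_H
	/* static inline no-op stubs: must be compiled exactly once */
	#endif

	#define _TRACE_BCACHEFS_H	/* mark the header seen, last of all */
	#endif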
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 1b8554460af4..42f565c76181 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;
while (p--) {
- if (*res > div_u64(U64_MAX, n))
+ if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
parse_or_ret(cp, parse_unit_suffix(cp, &b));
- if (v > div_u64(U64_MAX, b))
+ if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;
- if (f_n > div_u64(U64_MAX, b))
+ if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;
- f_n = div_u64(f_n * b, f_d);
+ f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@@ -204,7 +204,7 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
-u64 bch2_read_flag_list(char *opt, const char * const list[])
+u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
u64 ret = 0;
char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
@@ -214,7 +214,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
s = strim(d);
- while ((p = strsep(&s, ","))) {
+ while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
if (flag < 0) {
@@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
@@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
- prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
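All of the div_u64() to div64_u64() swaps in this file are at sites where the divisor is a u64. Per include/linux/math64.h, div_u64() takes a u32 divisor, so passing a 64-bit value silently truncates it, while div64_u64() performs a full 64/64 division. An illustrative kernel-context fragment:

	#include <linux/math64.h>
	#include <linux/types.h>

	static u64 full_width_div(void)
	{
		u64 divisor = (1ULL << 32) | 1;		/* needs 33 bits */

		/* div_u64(U64_MAX, divisor) would truncate the divisor to 1 and
		 * return U64_MAX; div64_u64() divides by the full value: */
		return div64_u64(U64_MAX, divisor);
	}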
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 902b7f5406a2..fb02c1c36004 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
bool bch2_is_zero(const void *, size_t);
-u64 bch2_read_flag_list(char *, const char * const[]);
+u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 331f944d73dc..56c8d3fe55a4 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix,
return 0;
}
+static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
+{
+ const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
+}
+
static int bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
struct xattr_buf *buf)
{
- const struct xattr_handler *handler =
- bch2_xattr_type_to_handler(xattr->x_type);
+ const char *prefix;
+
+ prefix = bch2_xattr_prefix(xattr->x_type, dentry);
+ if (!prefix)
+ return 0;
- return handler && (!handler->list || handler->list(dentry))
- ? __bch2_xattr_emit(handler->prefix ?: handler->name,
- xattr->x_name, xattr->x_name_len, buf)
- : 0;
+ return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@@ -295,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
- u32 snapshot;
- int ret;
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot),
- POS(inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_xattr)
- continue;
-
- ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- if (ret)
- break;
- }
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
- if (ret)
- goto out;
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ }))) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- if (ret)
- goto out;
-
- return buf.used;
-out:
- return bch2_err_class(ret);
+ return ret ? bch2_err_class(ret) : buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@@ -632,10 +611,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- &nop_posix_acl_access,
- &nop_posix_acl_default,
-#endif
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
index e9f810539552..c7916011ef34 100644
--- a/fs/bcachefs/xattr_format.h
+++ b/fs/bcachefs/xattr_format.h
@@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
- __u8 x_name[];
+ __u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */
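__counted_by(x_name_len) declares that the flexible array holds x_name_len elements, so FORTIFY_SOURCE and -fsanitize=bounds builds can bounds-check accesses at runtime; the counter must be assigned before the array is touched. A standalone sketch with an invented struct (the attribute needs a recent compiler, roughly GCC 15+ or Clang 18+):

	#include <stdlib.h>
	#include <string.h>

	struct msg {
		unsigned char len;
		unsigned char name[] __attribute__((counted_by(len)));
	};

	int main(void)
	{
		struct msg *m = malloc(sizeof(*m) + 4);

		if (!m)
			return 1;
		m->len = 4;			/* set the counter before using the array */
		memset(m->name, 0, m->len);	/* in bounds */
		/* m->name[7] = 0;		would now trap under -fsanitize=bounds */
		free(m);
		return 0;
	}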
diff --git a/fs/inode.c b/fs/inode.c
index af78f515403f..471ae4a31549 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -439,14 +439,6 @@ static void init_once(void *foo)
}
/*
- * inode->i_lock must be held
- */
-void __iget(struct inode *inode)
-{
- atomic_inc(&inode->i_count);
-}
-
-/*
* get additional reference to inode; caller must already hold one.
*/
void ihold(struct inode *inode)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6b8df574729c..776298fbfcb4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,7 +3148,14 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
-extern void __iget(struct inode * inode);
+/*
+ * inode->i_lock must be held
+ */
+static inline void __iget(struct inode *inode)
+{
+ atomic_inc(&inode->i_count);
+}
+
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f3512fddf3d7..5b51c3d582d6 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
+#include <linux/slab.h>
#include <linux/types.h>
struct genradix_root;
@@ -48,10 +49,63 @@ struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
+#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH \
+ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK \
+ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+ return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+ return 1UL << genradix_depth_shift(depth);
+}
+
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+ return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+ return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
struct __genradix {
struct genradix_root *root;
};
+struct genradix_node {
+ union {
+ /* Interior node: */
+ struct genradix_node *children[GENRADIX_ARY];
+
+ /* Leaf: */
+ u8 data[GENRADIX_NODE_SIZE];
+ };
+};
+
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kfree(node);
+}
+
/*
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
@@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
+static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
+{
+ struct genradix_root *r = READ_ONCE(radix->root);
+ struct genradix_node *n = genradix_root_to_node(r);
+ unsigned level = genradix_root_to_depth(r);
+ unsigned shift = genradix_depth_shift(level);
+
+ if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
+ return NULL;
+
+ while (n && shift > GENRADIX_NODE_SHIFT) {
+ shift -= GENRADIX_ARY_SHIFT;
+ n = n->children[offset >> shift];
+ offset &= (1UL << shift) - 1;
+ }
+
+ return n ? &n->data[offset] : NULL;
+}
+
+#define genradix_ptr_inlined(_radix, _idx) \
+ (__genradix_cast(_radix) \
+ __genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)))
+
void *__genradix_ptr(struct __genradix *, size_t);
/**
@@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
-void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+void *__genradix_ptr_alloc(struct __genradix *, size_t,
+ struct genradix_node **, gfp_t);
+
+#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ NULL, _gfp)))
+
+#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ (__genradix_ptr_inlined(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx)) ?: \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp)))
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
- _gfp))
+ NULL, _gfp))
+
+#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
+ (__genradix_cast(_radix) \
+ __genradix_ptr_alloc(&(_radix)->tree, \
+ __genradix_idx_to_offset(_radix, _idx), \
+ _new_node, _gfp))
struct genradix_iter {
size_t offset;
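Worked numbers for the constants moved into this header, on a 64-bit machine, since the root-pointer encoding only works if they line up:

	/*
	 * GENRADIX_NODE_SIZE	= 512
	 * GENRADIX_ARY		= 512 / sizeof(void *) = 64 children per node
	 * GENRADIX_ARY_SHIFT	= ilog2(64) = 6
	 *
	 * genradix_depth_shift(d) = 9 + 6 * d, so a tree of depth d addresses
	 * 2^(9 + 6d) bytes: 512 B, 32 KiB, 2 MiB, 128 MiB, ...
	 *
	 * GENRADIX_MAX_DEPTH	= DIV_ROUND_UP(64 - 9, 6) = 10
	 * GENRADIX_DEPTH_MASK	= roundup_pow_of_two(10 + 1) - 1 = 0xf
	 *
	 * i.e. the depth is stashed in the low four bits of the root pointer,
	 * which is safe because nodes come from kzalloc(512, ...) and are
	 * therefore at least 512-byte aligned.
	 */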
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index fa692c86f069..79e067b51488 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -5,99 +5,31 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
-#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
-#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
-
-struct genradix_node {
- union {
- /* Interior node: */
- struct genradix_node *children[GENRADIX_ARY];
-
- /* Leaf: */
- u8 data[GENRADIX_NODE_SIZE];
- };
-};
-
-static inline int genradix_depth_shift(unsigned depth)
-{
- return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
-}
-
-/*
- * Returns size (of data, in bytes) that a tree of a given depth holds:
- */
-static inline size_t genradix_depth_size(unsigned depth)
-{
- return 1UL << genradix_depth_shift(depth);
-}
-
-/* depth that's needed for a genradix that can address up to ULONG_MAX: */
-#define GENRADIX_MAX_DEPTH \
- DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
-
-#define GENRADIX_DEPTH_MASK \
- ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
-
-static inline unsigned genradix_root_to_depth(struct genradix_root *r)
-{
- return (unsigned long) r & GENRADIX_DEPTH_MASK;
-}
-
-static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
-{
- return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
-}
-
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
- struct genradix_root *r = READ_ONCE(radix->root);
- struct genradix_node *n = genradix_root_to_node(r);
- unsigned level = genradix_root_to_depth(r);
-
- if (ilog2(offset) >= genradix_depth_shift(level))
- return NULL;
-
- while (1) {
- if (!n)
- return NULL;
- if (!level)
- break;
-
- level--;
-
- n = n->children[offset >> genradix_depth_shift(level)];
- offset &= genradix_depth_size(level) - 1;
- }
-
- return &n->data[offset];
+ return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);
-static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
-{
- return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
-}
-
-static inline void genradix_free_node(struct genradix_node *node)
-{
- kfree(node);
-}
-
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
*/
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
+ struct genradix_node **preallocated,
gfp_t gfp_mask)
{
struct genradix_root *v = READ_ONCE(radix->root);
struct genradix_node *n, *new_node = NULL;
unsigned level;
+ if (preallocated)
+ swap(new_node, *preallocated);
+
/* Increase tree depth if necessary: */
while (1) {
struct genradix_root *r = v, *new_root;
@@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
size_t offset;
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
- if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
+ if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
return -ENOMEM;
return 0;
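The new preallocated-node parameter lets a caller do the blocking allocation before entering a context where it can't sleep, then attempt the insert with a non-blocking gfp mask. A hypothetical usage sketch follows; all names are invented, and it assumes, per the swap() above, that the helper takes ownership of the node and NULLs the caller's pointer whether or not the node ends up being used:

	#include <linux/generic-radix-tree.h>
	#include <linux/spinlock.h>

	static GENRADIX(unsigned long) table;
	static DEFINE_SPINLOCK(table_lock);

	int table_set(size_t idx, unsigned long val)
	{
		/* blocking allocation up front, outside the critical section */
		struct genradix_node *node = genradix_alloc_node(GFP_KERNEL);

		if (!node)
			return -ENOMEM;

		spin_lock(&table_lock);
		unsigned long *p = genradix_ptr_alloc_preallocated(&table, idx,
								   &node, GFP_NOWAIT);
		if (p)
			*p = val;
		spin_unlock(&table_lock);

		return p ? 0 : -ENOMEM;
	}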