author     Linus Torvalds <torvalds@linux-foundation.org>  2022-06-03 10:25:56 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-06-03 10:25:56 -0700
commit     78c6499c92090d0fd1ddd1684fc3a5dc41d98c92 (patch)
tree       2459aac592803b4be762457f91ed3a4e45877c6a /drivers/md/bcache
parent     72fbbc3d0e3e3117c29a73d0b4d928dc00ed99ce (diff)
parent     aacae8c469f9ce4b303a2eb61593ff522c1420bc (diff)
Merge tag 'for-5.19/drivers-2022-06-02' of git://git.kernel.dk/linux-block
Pull more block driver updates from Jens Axboe:
 "A collection of stragglers that were late on sending in their changes
  and just followup fixes.

   - NVMe fixes pull request via Christoph:
       - set controller enable bit in a separate write (Niklas Cassel)
       - disable namespace identifiers for the MAXIO MAP1001 (Christoph)
       - fix a comment typo (Julia Lawall)

   - MD fixes pull request via Song:
       - Remove uses of bdevname (Christoph Hellwig)
       - Bug fixes (Guoqing Jiang, and Xiao Ni)

   - bcache fixes series (Coly)

   - null_blk zoned write fix (Damien)

   - nbd fixes (Yu, Zhang)

   - Fix for loop partition scanning (Christoph)"

* tag 'for-5.19/drivers-2022-06-02' of git://git.kernel.dk/linux-block: (23 commits)
  block: null_blk: Fix null_zone_write()
  nvmet: fix typo in comment
  nvme: set controller enable bit in a separate write
  nvme-pci: disable namespace identifiers for the MAXIO MAP1001
  bcache: avoid unnecessary soft lockup in kworker update_writeback_rate()
  nbd: use pr_err to output error message
  nbd: fix possible overflow on 'first_minor' in nbd_dev_add()
  nbd: fix io hung while disconnecting device
  nbd: don't clear 'NBD_CMD_INFLIGHT' flag if request is not completed
  nbd: fix race between nbd_alloc_config() and module removal
  nbd: call genl_unregister_family() first in nbd_cleanup()
  md: bcache: check the return value of kzalloc() in detached_dev_do_request()
  bcache: memset on stack variables in bch_btree_check() and bch_sectors_dirty_init()
  block, loop: support partitions without scanning
  bcache: avoid journal no-space deadlock by reserving 1 journal bucket
  bcache: remove incremental dirty sector counting for bch_sectors_dirty_init()
  bcache: improve multithreaded bch_sectors_dirty_init()
  bcache: improve multithreaded bch_btree_check()
  md: fix double free of io_acct_set bioset
  md: Don't set mddev private to NULL in raid0 pers->free
  ...
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--   drivers/md/bcache/bcache.h        7
-rw-r--r--   drivers/md/bcache/btree.c        59
-rw-r--r--   drivers/md/bcache/btree.h         2
-rw-r--r--   drivers/md/bcache/journal.c      31
-rw-r--r--   drivers/md/bcache/journal.h       2
-rw-r--r--   drivers/md/bcache/request.c       6
-rw-r--r--   drivers/md/bcache/super.c         1
-rw-r--r--   drivers/md/bcache/writeback.c   133
-rw-r--r--   drivers/md/bcache/writeback.h     2
9 files changed, 130 insertions, 113 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 9ed9c955add7..2acda9cea0f9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -395,6 +395,13 @@ struct cached_dev {
atomic_t io_errors;
unsigned int error_limit;
unsigned int offline_seconds;
+
+ /*
+ * Retry to update writeback_rate if contention happens for
+ * down_read(dc->writeback_lock) in update_writeback_rate()
+ */
+#define BCH_WBRATE_UPDATE_MAX_SKIPS 15
+ unsigned int rate_update_retry;
};
enum alloc_reserve {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ad9f16689419..e136d6edc1ed 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c)
int i;
struct bkey *k = NULL;
struct btree_iter iter;
- struct btree_check_state *check_state;
- char name[32];
+ struct btree_check_state check_state;
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
@@ -2018,63 +2017,59 @@ int bch_btree_check(struct cache_set *c)
if (c->root->level == 0)
return 0;
- check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
- if (!check_state)
- return -ENOMEM;
-
- check_state->c = c;
- check_state->total_threads = bch_btree_chkthread_nr();
- check_state->key_idx = 0;
- spin_lock_init(&check_state->idx_lock);
- atomic_set(&check_state->started, 0);
- atomic_set(&check_state->enough, 0);
- init_waitqueue_head(&check_state->wait);
+ memset(&check_state, 0, sizeof(struct btree_check_state));
+ check_state.c = c;
+ check_state.total_threads = bch_btree_chkthread_nr();
+ check_state.key_idx = 0;
+ spin_lock_init(&check_state.idx_lock);
+ atomic_set(&check_state.started, 0);
+ atomic_set(&check_state.enough, 0);
+ init_waitqueue_head(&check_state.wait);
+ rw_lock(0, c->root, c->root->level);
/*
* Run multiple threads to check btree nodes in parallel,
- * if check_state->enough is non-zero, it means current
+ * if check_state.enough is non-zero, it means current
* running check threads are enough, unncessary to create
* more.
*/
- for (i = 0; i < check_state->total_threads; i++) {
- /* fetch latest check_state->enough earlier */
+ for (i = 0; i < check_state.total_threads; i++) {
+ /* fetch latest check_state.enough earlier */
smp_mb__before_atomic();
- if (atomic_read(&check_state->enough))
+ if (atomic_read(&check_state.enough))
break;
- check_state->infos[i].result = 0;
- check_state->infos[i].state = check_state;
- snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
- atomic_inc(&check_state->started);
+ check_state.infos[i].result = 0;
+ check_state.infos[i].state = &check_state;
- check_state->infos[i].thread =
+ check_state.infos[i].thread =
kthread_run(bch_btree_check_thread,
- &check_state->infos[i],
- name);
- if (IS_ERR(check_state->infos[i].thread)) {
+ &check_state.infos[i],
+ "bch_btrchk[%d]", i);
+ if (IS_ERR(check_state.infos[i].thread)) {
pr_err("fails to run thread bch_btrchk[%d]\n", i);
for (--i; i >= 0; i--)
- kthread_stop(check_state->infos[i].thread);
+ kthread_stop(check_state.infos[i].thread);
ret = -ENOMEM;
goto out;
}
+ atomic_inc(&check_state.started);
}
/*
* Must wait for all threads to stop.
*/
- wait_event_interruptible(check_state->wait,
- atomic_read(&check_state->started) == 0);
+ wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
- for (i = 0; i < check_state->total_threads; i++) {
- if (check_state->infos[i].result) {
- ret = check_state->infos[i].result;
+ for (i = 0; i < check_state.total_threads; i++) {
+ if (check_state.infos[i].result) {
+ ret = check_state.infos[i].result;
goto out;
}
}
out:
- kfree(check_state);
+ rw_unlock(0, c->root);
return ret;
}
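
Taken together, "bcache: memset on stack variables in bch_btree_check() and bch_sectors_dirty_init()" and "bcache: improve multithreaded bch_btree_check()" move btree_check_state from a kzalloc()'d buffer onto the caller's stack, let kthread_run() format the thread name itself, count a checker thread as started only after it has been created successfully, wait for completion with wait_event() instead of wait_event_interruptible(), and hold the root node lock for the whole check. A minimal userspace analogue of the on-stack state and started-counter pattern, using pthreads in place of kthreads (every name below is invented for the example, none of it is bcache code):

/*
 * Sketch: coordinator keeps the shared state on its own stack, counts a
 * worker as "started" only after creation succeeds, and waits until the
 * last started worker has checked out.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MAX_WORKERS 4

struct check_state {
	atomic_int started;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void *worker(void *arg)
{
	struct check_state *state = arg;

	/* ... check one slice of the work here ... */

	pthread_mutex_lock(&state->lock);
	if (atomic_fetch_sub(&state->started, 1) == 1)
		pthread_cond_signal(&state->done);	/* last one out wakes the coordinator */
	pthread_mutex_unlock(&state->lock);
	return NULL;
}

int main(void)
{
	struct check_state state = { 0 };	/* on the stack, no alloc/free pair to manage */
	pthread_t tid[MAX_WORKERS];
	int i, created = 0;

	pthread_mutex_init(&state.lock, NULL);
	pthread_cond_init(&state.done, NULL);

	for (i = 0; i < MAX_WORKERS; i++) {
		if (pthread_create(&tid[i], NULL, worker, &state)) {
			fprintf(stderr, "worker %d failed to start\n", i);
			break;
		}
		/* only count the worker once creation has succeeded */
		atomic_fetch_add(&state.started, 1);
		created++;
	}

	/* analogue of wait_event(): block until "started" drops back to zero */
	pthread_mutex_lock(&state.lock);
	while (atomic_load(&state.started) > 0)
		pthread_cond_wait(&state.done, &state.lock);
	pthread_mutex_unlock(&state.lock);

	for (i = 0; i < created; i++)
		pthread_join(tid[i], NULL);
	return 0;
}
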
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 50482107134f..1b5fdbc0d83e 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -226,7 +226,7 @@ struct btree_check_info {
int result;
};
-#define BCH_BTR_CHKTHREAD_MAX 64
+#define BCH_BTR_CHKTHREAD_MAX 12
struct btree_check_state {
struct cache_set *c;
int total_threads;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index df5347ea450b..e5da469a4235 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -405,6 +405,11 @@ err:
return ret;
}
+void bch_journal_space_reserve(struct journal *j)
+{
+ j->do_reserve = true;
+}
+
/* Journalling */
static void btree_flush_write(struct cache_set *c)
@@ -621,12 +626,30 @@ static void do_journal_discard(struct cache *ca)
}
}
+static unsigned int free_journal_buckets(struct cache_set *c)
+{
+ struct journal *j = &c->journal;
+ struct cache *ca = c->cache;
+ struct journal_device *ja = &c->cache->journal;
+ unsigned int n;
+
+ /* In case njournal_buckets is not power of 2 */
+ if (ja->cur_idx >= ja->discard_idx)
+ n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
+ else
+ n = ja->discard_idx - ja->cur_idx;
+
+ if (n > (1 + j->do_reserve))
+ return n - (1 + j->do_reserve);
+
+ return 0;
+}
+
static void journal_reclaim(struct cache_set *c)
{
struct bkey *k = &c->journal.key;
struct cache *ca = c->cache;
uint64_t last_seq;
- unsigned int next;
struct journal_device *ja = &ca->journal;
atomic_t p __maybe_unused;
@@ -649,12 +672,10 @@ static void journal_reclaim(struct cache_set *c)
if (c->journal.blocks_free)
goto out;
- next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
- /* No space available on this device */
- if (next == ja->discard_idx)
+ if (!free_journal_buckets(c))
goto out;
- ja->cur_idx = next;
+ ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
k->ptr[0] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
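
The journal.c hunks above come from "bcache: avoid journal no-space deadlock by reserving 1 journal bucket": free_journal_buckets() measures how many buckets remain between ja->cur_idx and ja->discard_idx on the ring (the first branch handles the wrapped case, so the count stays correct even when njournal_buckets is not a power of two) and withholds one extra bucket once j->do_reserve has been set by bch_journal_space_reserve(), so journal_reclaim() stops advancing before the journal is completely full. The standalone sketch below reproduces only that arithmetic; the free_buckets() helper and the bucket counts are made up for illustration.

/* Sketch of the ring-buffer free-bucket arithmetic, with invented values. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int free_buckets(unsigned int nbuckets, unsigned int cur_idx,
				 unsigned int discard_idx, bool do_reserve)
{
	unsigned int n;

	/* wrapped case: works even if nbuckets is not a power of 2 */
	if (cur_idx >= discard_idx)
		n = nbuckets + discard_idx - cur_idx;
	else
		n = discard_idx - cur_idx;

	/* keep the bucket cur_idx points at, plus one reserved bucket */
	if (n > (1 + do_reserve))
		return n - (1 + do_reserve);

	return 0;
}

int main(void)
{
	/* 8 journal buckets, writer at slot 6, discard has reached slot 2 */
	printf("%u\n", free_buckets(8, 6, 2, false));	/* 8 + 2 - 6 - 1     = 3 */
	printf("%u\n", free_buckets(8, 6, 2, true));	/* 8 + 2 - 6 - 1 - 1 = 2 */
	/* nearly full: only the reserved bucket would remain, so report 0 */
	printf("%u\n", free_buckets(8, 1, 2, true));
	return 0;
}
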
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index f2ea34d5f431..cd316b4a1e95 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -105,6 +105,7 @@ struct journal {
spinlock_t lock;
spinlock_t flush_write_lock;
bool btree_flushing;
+ bool do_reserve;
/* used when waiting because the journal was full */
struct closure_waitlist wait;
struct closure io;
@@ -182,5 +183,6 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list);
void bch_journal_free(struct cache_set *c);
int bch_journal_alloc(struct cache_set *c);
+void bch_journal_space_reserve(struct journal *j);
#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 9c5dde73da88..f2c5a7e06fa9 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1105,6 +1105,12 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
* which would call closure_get(&dc->disk.cl)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
+ if (!ddip) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio->bi_end_io(bio);
+ return;
+ }
+
ddip->d = d;
/* Count on the bcache device */
ddip->orig_bdev = orig_bdev;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 2f49e31142f6..3563d15dbaf2 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2127,6 +2127,7 @@ static int run_cache_set(struct cache_set *c)
flash_devs_run(c);
+ bch_journal_space_reserve(&c->journal);
set_bit(CACHE_SET_RUNNING, &c->flags);
return 0;
err:
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 9ee0005874cd..3f0ff3aab6f2 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -235,19 +235,27 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
- /*
- * If the whole cache set is idle, set_at_max_writeback_rate()
- * will set writeback rate to a max number. Then it is
- * unncessary to update writeback rate for an idle cache set
- * in maximum writeback rate number(s).
- */
- if (!set_at_max_writeback_rate(c, dc)) {
- down_read(&dc->writeback_lock);
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unncessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
+ !set_at_max_writeback_rate(c, dc)) {
+ do {
+ if (!down_read_trylock((&dc->writeback_lock))) {
+ dc->rate_update_retry++;
+ if (dc->rate_update_retry <=
+ BCH_WBRATE_UPDATE_MAX_SKIPS)
+ break;
+ down_read(&dc->writeback_lock);
+ dc->rate_update_retry = 0;
+ }
__update_writeback_rate(dc);
update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
- }
+ } while (0);
}
@@ -805,13 +813,11 @@ static int bch_writeback_thread(void *arg)
/* Init */
#define INIT_KEYS_EACH_TIME 500000
-#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned int inode;
size_t count;
- struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -827,11 +833,8 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
KEY_START(k), KEY_SIZE(k));
op->count++;
- if (atomic_read(&b->c->search_inflight) &&
- !(op->count % INIT_KEYS_EACH_TIME)) {
- bkey_copy_key(&op->start, k);
- return -EAGAIN;
- }
+ if (!(op->count % INIT_KEYS_EACH_TIME))
+ cond_resched();
return MAP_CONTINUE;
}
@@ -846,24 +849,16 @@ static int bch_root_node_dirty_init(struct cache_set *c,
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
- op.start = KEY(op.inode, 0, 0);
-
- do {
- ret = bcache_btree(map_keys_recurse,
- k,
- c->root,
- &op.op,
- &op.start,
- sectors_dirty_init_fn,
- 0);
- if (ret == -EAGAIN)
- schedule_timeout_interruptible(
- msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
- else if (ret < 0) {
- pr_warn("sectors dirty init failed, ret=%d!\n", ret);
- break;
- }
- } while (ret == -EAGAIN);
+
+ ret = bcache_btree(map_keys_recurse,
+ k,
+ c->root,
+ &op.op,
+ &KEY(op.inode, 0, 0),
+ sectors_dirty_init_fn,
+ 0);
+ if (ret < 0)
+ pr_warn("sectors dirty init failed, ret=%d!\n", ret);
return ret;
}
@@ -907,7 +902,6 @@ static int bch_dirty_init_thread(void *arg)
goto out;
}
skip_nr--;
- cond_resched();
}
if (p) {
@@ -917,7 +911,6 @@ static int bch_dirty_init_thread(void *arg)
p = NULL;
prev_idx = cur_idx;
- cond_resched();
}
out:
@@ -948,67 +941,56 @@ void bch_sectors_dirty_init(struct bcache_device *d)
struct btree_iter iter;
struct sectors_dirty_init op;
struct cache_set *c = d->c;
- struct bch_dirty_init_state *state;
- char name[32];
+ struct bch_dirty_init_state state;
/* Just count root keys if no leaf node */
+ rw_lock(0, c->root, c->root->level);
if (c->root->level == 0) {
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
- op.start = KEY(op.inode, 0, 0);
for_each_key_filter(&c->root->keys,
k, &iter, bch_ptr_invalid)
sectors_dirty_init_fn(&op.op, c->root, k);
- return;
- }
- state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL);
- if (!state) {
- pr_warn("sectors dirty init failed: cannot allocate memory\n");
+ rw_unlock(0, c->root);
return;
}
- state->c = c;
- state->d = d;
- state->total_threads = bch_btre_dirty_init_thread_nr();
- state->key_idx = 0;
- spin_lock_init(&state->idx_lock);
- atomic_set(&state->started, 0);
- atomic_set(&state->enough, 0);
- init_waitqueue_head(&state->wait);
-
- for (i = 0; i < state->total_threads; i++) {
- /* Fetch latest state->enough earlier */
+ memset(&state, 0, sizeof(struct bch_dirty_init_state));
+ state.c = c;
+ state.d = d;
+ state.total_threads = bch_btre_dirty_init_thread_nr();
+ state.key_idx = 0;
+ spin_lock_init(&state.idx_lock);
+ atomic_set(&state.started, 0);
+ atomic_set(&state.enough, 0);
+ init_waitqueue_head(&state.wait);
+
+ for (i = 0; i < state.total_threads; i++) {
+ /* Fetch latest state.enough earlier */
smp_mb__before_atomic();
- if (atomic_read(&state->enough))
+ if (atomic_read(&state.enough))
break;
- state->infos[i].state = state;
- atomic_inc(&state->started);
- snprintf(name, sizeof(name), "bch_dirty_init[%d]", i);
-
- state->infos[i].thread =
- kthread_run(bch_dirty_init_thread,
- &state->infos[i],
- name);
- if (IS_ERR(state->infos[i].thread)) {
+ state.infos[i].state = &state;
+ state.infos[i].thread =
+ kthread_run(bch_dirty_init_thread, &state.infos[i],
+ "bch_dirtcnt[%d]", i);
+ if (IS_ERR(state.infos[i].thread)) {
pr_err("fails to run thread bch_dirty_init[%d]\n", i);
for (--i; i >= 0; i--)
- kthread_stop(state->infos[i].thread);
+ kthread_stop(state.infos[i].thread);
goto out;
}
+ atomic_inc(&state.started);
}
- /*
- * Must wait for all threads to stop.
- */
- wait_event_interruptible(state->wait,
- atomic_read(&state->started) == 0);
-
out:
- kfree(state);
+ /* Must wait for all threads to stop. */
+ wait_event(state.wait, atomic_read(&state.started) == 0);
+ rw_unlock(0, c->root);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -1032,6 +1014,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_fp_term_high = 1000;
dc->writeback_rate_i_term_inverse = 10000;
+ /* For dc->writeback_lock contention in update_writeback_rate() */
+ dc->rate_update_retry = 0;
+
WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
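
The update_writeback_rate() hunk above ("bcache: avoid unnecessary soft lockup in kworker update_writeback_rate()") replaces the unconditional down_read() with down_read_trylock(): when the writeback thread holds dc->writeback_lock, the kworker skips this update and bumps dc->rate_update_retry, and only after BCH_WBRATE_UPDATE_MAX_SKIPS consecutive skips does it fall back to a blocking down_read(), bounding how long the delayed work can stall. Below is a userspace analogue of that pattern built on a pthread rwlock; it is a sketch of the idea, not the kernel code, and every name in it is invented for the example.

/* Sketch: try the lock, skip a bounded number of rounds, then block. */
#include <pthread.h>

#define MAX_SKIPS 15

struct rate_ctx {
	pthread_rwlock_t lock;		/* stands in for dc->writeback_lock */
	unsigned int skips;		/* stands in for dc->rate_update_retry */
};

static void update_rate(struct rate_ctx *ctx)
{
	if (pthread_rwlock_tryrdlock(&ctx->lock) != 0) {
		/* contended: skip this round rather than blocking the worker */
		if (++ctx->skips <= MAX_SKIPS)
			return;
		/* too many consecutive skips: wait for the lock this time */
		pthread_rwlock_rdlock(&ctx->lock);
		ctx->skips = 0;
	}

	/* ... recompute the rate under the read lock ... */

	pthread_rwlock_unlock(&ctx->lock);
}

int main(void)
{
	struct rate_ctx ctx = { .skips = 0 };

	pthread_rwlock_init(&ctx.lock, NULL);
	update_rate(&ctx);		/* uncontended call: takes and drops the read lock */
	pthread_rwlock_destroy(&ctx.lock);
	return 0;
}
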
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 02b2f9df73f6..31df716951f6 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -20,7 +20,7 @@
#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57
#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64
-#define BCH_DIRTY_INIT_THRD_MAX 64
+#define BCH_DIRTY_INIT_THRD_MAX 12
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up