summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-08-29 20:21:42 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-08-29 20:21:42 -0700
commit3d3dfeb3aec7b612d266d500c82054f1fded4980 (patch)
tree11649eab5c74deb74e6e0879613e8053ae3b9970 /drivers
parentc1b7fcf3f6d94c2c3528bf77054bf174a5ef63d7 (diff)
parent146afeb235ccec10c17ad8ea26327c0c79dbd968 (diff)
Merge tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: "Pretty quiet round for this release. This contains: - Add support for zoned storage to ublk (Andreas, Ming) - Series improving performance for drivers that mark themselves as needing a blocking context for issue (Bart) - Cleanup the flush logic (Chengming) - sed opal keyring support (Greg) - Fixes and improvements to the integrity support (Jinyoung) - Add some exports for bcachefs that we can hopefully delete again in the future (Kent) - deadline throttling fix (Zhiguo) - Series allowing building the kernel without buffer_head support (Christoph) - Sanitize the bio page adding flow (Christoph) - Write back cache fixes (Christoph) - MD updates via Song: - Fix perf regression for raid0 large sequential writes (Jan) - Fix split bio iostat for raid0 (David) - Various raid1 fixes (Heinz, Xueshi) - raid6test build fixes (WANG) - Deprecate bitmap file support (Christoph) - Fix deadlock with md sync thread (Yu) - Refactor md io accounting (Yu) - Various non-urgent fixes (Li, Yu, Jack) - Various fixes and cleanups (Arnd, Azeem, Chengming, Damien, Li, Ming, Nitesh, Ruan, Tejun, Thomas, Xu)" * tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux: (113 commits) block: use strscpy() to instead of strncpy() block: sed-opal: keyring support for SED keys block: sed-opal: Implement IOC_OPAL_REVERT_LSP block: sed-opal: Implement IOC_OPAL_DISCOVERY blk-mq: prealloc tags when increase tagset nr_hw_queues blk-mq: delete redundant tagset map update when fallback blk-mq: fix tags leak when shrink nr_hw_queues ublk: zoned: support REQ_OP_ZONE_RESET_ALL md: raid0: account for split bio in iostat accounting md/raid0: Fix performance regression for large sequential writes md/raid0: Factor out helper for mapping and submitting a bio md raid1: allow writebehind to work on any leg device set WriteMostly md/raid1: hold the barrier until handle_read_error() finishes md/raid1: free the r1bio before waiting for blocked rdev md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io() blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init drivers/rnbd: restore sysfs interface to rnbd-client md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid() raid6: test: only check for Altivec if building on powerpc hosts raid6: test: make sure all intermediate and artifact files are .gitignored ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/nbd.c1
-rw-r--r--drivers/block/swim3.c2
-rw-r--r--drivers/block/ublk_drv.c366
-rw-r--r--drivers/md/Kconfig11
-rw-r--r--drivers/md/dm-crypt.c1
-rw-r--r--drivers/md/dm-raid.c1
-rw-r--r--drivers/md/md-bitmap.c347
-rw-r--r--drivers/md/md-bitmap.h1
-rw-r--r--drivers/md/md-cluster.c8
-rw-r--r--drivers/md/md-faulty.c2
-rw-r--r--drivers/md/md-linear.c1
-rw-r--r--drivers/md/md-multipath.c1
-rw-r--r--drivers/md/md.c228
-rw-r--r--drivers/md/md.h13
-rw-r--r--drivers/md/raid0.c98
-rw-r--r--drivers/md/raid1.c86
-rw-r--r--drivers/md/raid1.h1
-rw-r--r--drivers/md/raid10.c85
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c14
-rw-r--r--drivers/md/raid5.c72
-rw-r--r--drivers/nvme/host/ioctl.c1
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c3
-rw-r--r--drivers/scsi/scsi_lib.c12
-rw-r--r--drivers/target/target_core_iblock.c3
25 files changed, 854 insertions, 505 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 42e0159bb258..df1cd0f718b8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -2334,6 +2334,7 @@ static struct genl_family nbd_genl_family __ro_after_init = {
.mcgrps = nbd_mcast_grps,
.n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
+MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index dc43a63b3469..c2bc85826358 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1277,7 +1277,7 @@ static struct macio_driver swim3_driver =
};
-int swim3_init(void)
+static int swim3_init(void)
{
macio_register_driver(&swim3_driver);
return 0;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 21d2e71c5514..630ddfe6657b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -56,16 +56,21 @@
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
| UBLK_F_CMD_IOCTL_ENCODE \
- | UBLK_F_USER_COPY)
+ | UBLK_F_USER_COPY \
+ | UBLK_F_ZONED)
/* All UBLK_PARAM_TYPE_* should be included here */
-#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
- UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
+#define UBLK_PARAM_TYPE_ALL \
+ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
+ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
struct ublk_rq_data {
struct llist_node node;
struct kref ref;
+ __u64 sector;
+ __u32 operation;
+ __u32 nr_zones;
};
struct ublk_uring_cmd_pdu {
@@ -185,6 +190,266 @@ struct ublk_params_header {
__u32 types;
};
+static inline unsigned int ublk_req_build_flags(struct request *req);
+static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
+ int tag);
+
+static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
+static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_ZONED;
+}
+
+static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_ZONED;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int ublk_get_nr_zones(const struct ublk_device *ub)
+{
+ const struct ublk_param_basic *p = &ub->params.basic;
+
+ /* Zone size is a power of 2 */
+ return p->dev_sectors >> ilog2(p->chunk_sectors);
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+ return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+}
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+ const struct ublk_param_zoned *p = &ub->params.zoned;
+ int nr_zones;
+
+ if (!ublk_dev_is_zoned(ub))
+ return -EINVAL;
+
+ if (!p->max_zone_append_sectors)
+ return -EINVAL;
+
+ nr_zones = ublk_get_nr_zones(ub);
+
+ if (p->max_active_zones > nr_zones)
+ return -EINVAL;
+
+ if (p->max_open_zones > nr_zones)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+ const struct ublk_param_zoned *p = &ub->params.zoned;
+
+ disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
+ blk_queue_required_elevator_features(ub->ub_disk->queue,
+ ELEVATOR_F_ZBD_SEQ_WRITE);
+ disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
+ disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+ blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
+
+ ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
+
+ return 0;
+}
+
+/* Based on virtblk_alloc_report_buffer */
+static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
+ unsigned int nr_zones, size_t *buflen)
+{
+ struct request_queue *q = ublk->ub_disk->queue;
+ size_t bufsize;
+ void *buf;
+
+ nr_zones = min_t(unsigned int, nr_zones,
+ ublk->ub_disk->nr_zones);
+
+ bufsize = nr_zones * sizeof(struct blk_zone);
+ bufsize =
+ min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
+
+ while (bufsize >= sizeof(struct blk_zone)) {
+ buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+ if (buf) {
+ *buflen = bufsize;
+ return buf;
+ }
+ bufsize >>= 1;
+ }
+
+ *buflen = 0;
+ return NULL;
+}
+
+static int ublk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct ublk_device *ub = disk->private_data;
+ unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
+ unsigned int first_zone = sector >> ilog2(zone_size_sectors);
+ unsigned int done_zones = 0;
+ unsigned int max_zones_per_request;
+ int ret;
+ struct blk_zone *buffer;
+ size_t buffer_length;
+
+ nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
+ nr_zones);
+
+ buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
+ if (!buffer)
+ return -ENOMEM;
+
+ max_zones_per_request = buffer_length / sizeof(struct blk_zone);
+
+ while (done_zones < nr_zones) {
+ unsigned int remaining_zones = nr_zones - done_zones;
+ unsigned int zones_in_request =
+ min_t(unsigned int, remaining_zones, max_zones_per_request);
+ struct request *req;
+ struct ublk_rq_data *pdu;
+ blk_status_t status;
+
+ memset(buffer, 0, buffer_length);
+
+ req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ pdu = blk_mq_rq_to_pdu(req);
+ pdu->operation = UBLK_IO_OP_REPORT_ZONES;
+ pdu->sector = sector;
+ pdu->nr_zones = zones_in_request;
+
+ ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
+ GFP_KERNEL);
+ if (ret) {
+ blk_mq_free_request(req);
+ goto out;
+ }
+
+ status = blk_execute_rq(req, 0);
+ ret = blk_status_to_errno(status);
+ blk_mq_free_request(req);
+ if (ret)
+ goto out;
+
+ for (unsigned int i = 0; i < zones_in_request; i++) {
+ struct blk_zone *zone = buffer + i;
+
+ /* A zero length zone means no more zones in this response */
+ if (!zone->len)
+ break;
+
+ ret = cb(zone, i, data);
+ if (ret)
+ goto out;
+
+ done_zones++;
+ sector += zone_size_sectors;
+
+ }
+ }
+
+ ret = done_zones;
+
+out:
+ kvfree(buffer);
+ return ret;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+ struct request *req)
+{
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+ struct ublk_io *io = &ubq->ios[req->tag];
+ struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+ u32 ublk_op;
+
+ switch (req_op(req)) {
+ case REQ_OP_ZONE_OPEN:
+ ublk_op = UBLK_IO_OP_ZONE_OPEN;
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ ublk_op = UBLK_IO_OP_ZONE_CLOSE;
+ break;
+ case REQ_OP_ZONE_FINISH:
+ ublk_op = UBLK_IO_OP_ZONE_FINISH;
+ break;
+ case REQ_OP_ZONE_RESET:
+ ublk_op = UBLK_IO_OP_ZONE_RESET;
+ break;
+ case REQ_OP_ZONE_APPEND:
+ ublk_op = UBLK_IO_OP_ZONE_APPEND;
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
+ break;
+ case REQ_OP_DRV_IN:
+ ublk_op = pdu->operation;
+ switch (ublk_op) {
+ case UBLK_IO_OP_REPORT_ZONES:
+ iod->op_flags = ublk_op | ublk_req_build_flags(req);
+ iod->nr_zones = pdu->nr_zones;
+ iod->start_sector = pdu->sector;
+ return BLK_STS_OK;
+ default:
+ return BLK_STS_IOERR;
+ }
+ case REQ_OP_DRV_OUT:
+ /* We do not support drv_out */
+ return BLK_STS_NOTSUPP;
+ default:
+ return BLK_STS_IOERR;
+ }
+
+ iod->op_flags = ublk_op | ublk_req_build_flags(req);
+ iod->nr_sectors = blk_rq_sectors(req);
+ iod->start_sector = blk_rq_pos(req);
+ iod->addr = io->addr;
+
+ return BLK_STS_OK;
+}
+
+#else
+
+#define ublk_report_zones (NULL)
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+ return 0;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+ struct request *req)
+{
+ return BLK_STS_NOTSUPP;
+}
+
+#endif
+
static inline void __ublk_complete_rq(struct request *req);
static void ublk_complete_rq(struct kref *ref);
@@ -281,6 +546,9 @@ static int ublk_validate_params(const struct ublk_device *ub)
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
return -EINVAL;
+
+ if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
+ return -EINVAL;
} else
return -EINVAL;
@@ -299,6 +567,11 @@ static int ublk_validate_params(const struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
return -EINVAL;
+ if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+ return ublk_dev_param_zoned_validate(ub);
+ else if (ublk_dev_is_zoned(ub))
+ return -EINVAL;
+
return 0;
}
@@ -312,6 +585,9 @@ static int ublk_apply_params(struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
ublk_dev_param_discard_apply(ub);
+ if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+ return ublk_dev_param_zoned_apply(ub);
+
return 0;
}
@@ -482,6 +758,7 @@ static const struct block_device_operations ub_fops = {
.owner = THIS_MODULE,
.open = ublk_open,
.free_disk = ublk_free_disk,
+ .report_zones = ublk_report_zones,
};
#define UBLK_MAX_PIN_PAGES 32
@@ -596,7 +873,8 @@ static inline bool ublk_need_map_req(const struct request *req)
static inline bool ublk_need_unmap_req(const struct request *req)
{
- return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
+ return ublk_rq_has_data(req) &&
+ (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@@ -680,8 +958,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
+ enum req_op op = req_op(req);
u32 ublk_op;
+ if (!ublk_queue_is_zoned(ubq) &&
+ (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
+ return BLK_STS_IOERR;
+
switch (req_op(req)) {
case REQ_OP_READ:
ublk_op = UBLK_IO_OP_READ;
@@ -699,6 +982,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
break;
default:
+ if (ublk_queue_is_zoned(ubq))
+ return ublk_setup_iod_zoned(ubq, req);
return BLK_STS_IOERR;
}
@@ -751,7 +1036,8 @@ static inline void __ublk_complete_rq(struct request *req)
*
* Both the two needn't unmap.
*/
- if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
+ if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
+ req_op(req) != REQ_OP_DRV_IN)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
@@ -1114,8 +1400,13 @@ static void ublk_commit_completion(struct ublk_device *ub,
/* find the io request and complete */
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
+ if (WARN_ON_ONCE(unlikely(!req)))
+ return;
- if (req && likely(!blk_should_fake_timeout(req->q)))
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+
+ if (likely(!blk_should_fake_timeout(req->q)))
ublk_put_req_ref(ubq, req);
}
@@ -1414,11 +1705,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
- if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
- ret = -EINVAL;
- goto out;
- }
-
ret = ublk_check_cmd_op(cmd_op);
if (ret)
goto out;
@@ -1445,6 +1731,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
*/
if (!ub_cmd->addr && !ublk_need_get_data(ubq))
goto out;
+ } else if (ub_cmd->addr) {
+ /* User copy requires addr to be unset */
+ ret = -EINVAL;
+ goto out;
}
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
@@ -1464,7 +1754,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ))
goto out;
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+ /*
+ * User copy requires addr to be unset when command is
+ * not zone append
+ */
+ ret = -EINVAL;
+ goto out;
}
+
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
@@ -1537,11 +1835,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
int ubuf_dir)
{
/* copy ubuf to request pages */
- if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+ if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
+ ubuf_dir == ITER_SOURCE)
return true;
/* copy request pages to ubuf */
- if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+ if ((req_op(req) == REQ_OP_WRITE ||
+ req_op(req) == REQ_OP_ZONE_APPEND) &&
+ ubuf_dir == ITER_DEST)
return true;
return false;
@@ -1881,17 +2182,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
get_device(&ub->cdev_dev);
ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+ if (ublk_dev_is_zoned(ub)) {
+ ret = ublk_revalidate_disk_zones(ub);
+ if (ret)
+ goto out_put_cdev;
+ }
+
ret = add_disk(disk);
+ if (ret)
+ goto out_put_cdev;
+
+ set_bit(UB_STATE_USED, &ub->state);
+
+out_put_cdev:
if (ret) {
- /*
- * Has to drop the reference since ->free_disk won't be
- * called in case of add_disk failure.
- */
ub->dev_info.state = UBLK_S_DEV_DEAD;
ublk_put_device(ub);
- goto out_put_disk;
}
- set_bit(UB_STATE_USED, &ub->state);
out_put_disk:
if (ret)
put_disk(disk);
@@ -2038,9 +2346,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY */
- if (ub->dev_info.flags & UBLK_F_USER_COPY)
+ if (ublk_dev_is_user_copy(ub))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
+ /* Zoned storage support requires user copy feature */
+ if (ublk_dev_is_zoned(ub) &&
+ (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
+ ret = -EINVAL;
+ goto out_free_dev_number;
+ }
+
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
@@ -2433,14 +2748,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
if (header->len < header->dev_path_len)
return -EINVAL;
- dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL);
- if (!dev_path)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(dev_path, argp, header->dev_path_len))
- goto exit;
- dev_path[header->dev_path_len] = 0;
+ dev_path = memdup_user_nul(argp, header->dev_path_len);
+ if (IS_ERR(dev_path))
+ return PTR_ERR(dev_path);
ret = -EINVAL;
switch (_IOC_NR(cmd->cmd_op)) {
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b0a22e99bade..2a8b081bce7d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -15,6 +15,7 @@ if MD
config BLK_DEV_MD
tristate "RAID support"
select BLOCK_HOLDER_DEPRECATED if SYSFS
+ select BUFFER_HEAD
# BLOCK_LEGACY_AUTOLOAD requirement should be removed
# after relevant mdadm enhancements - to make "names=yes"
# the default - are widely available.
@@ -50,6 +51,16 @@ config MD_AUTODETECT
If unsure, say Y.
+config MD_BITMAP_FILE
+ bool "MD bitmap file support (deprecated)"
+ default y
+ help
+ If you say Y here, support for write intent bitmaps in files on an
+ external file system is enabled. This is an alternative to the internal
+ bitmaps near the MD superblock, and very problematic code that abuses
+ various kernel APIs and can only work with files on a file system not
+ actually sitting on the MD device.
+
config MD_LINEAR
tristate "Linear (append) mode (deprecated)"
depends on BLK_DEV_MD
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1dc6227d353e..f2662c21a6df 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
- bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index becdb689190e..5f9991765f27 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3723,7 +3723,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 1ff712889a3b..6f9ff14971f9 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -139,29 +139,26 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page
*/
/* IO operations when bitmap is stored near all superblocks */
+
+/* choose a good rdev and read the page from there */
static int read_sb_page(struct mddev *mddev, loff_t offset,
- struct page *page,
- unsigned long index, int size)
+ struct page *page, unsigned long index, int size)
{
- /* choose a good rdev and read the page from there */
+ sector_t sector = mddev->bitmap_info.offset + offset +
+ index * (PAGE_SIZE / SECTOR_SIZE);
struct md_rdev *rdev;
- sector_t target;
rdev_for_each(rdev, mddev) {
- if (! test_bit(In_sync, &rdev->flags)
- || test_bit(Faulty, &rdev->flags)
- || test_bit(Bitmap_sync, &rdev->flags))
- continue;
+ u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));
- target = offset + index * (PAGE_SIZE/512);
+ if (!test_bit(In_sync, &rdev->flags) ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Bitmap_sync, &rdev->flags))
+ continue;
- if (sync_page_io(rdev, target,
- roundup(size, bdev_logical_block_size(rdev->bdev)),
- page, REQ_OP_READ, true)) {
- page->index = index;
+ if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
return 0;
- }
}
return -EIO;
}
@@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
}
static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
- struct page *page)
+ unsigned long pg_index, struct page *page)
{
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
loff_t sboff, offset = mddev->bitmap_info.offset;
- sector_t ps, doff;
+ sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
unsigned int opt_size = PAGE_SIZE;
+ sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
- if (page->index == store->file_pages - 1) {
+ if (pg_index == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
opt_size = optimal_io_size(bdev, last_page_size, size);
}
- ps = page->index * PAGE_SIZE / SECTOR_SIZE;
sboff = rdev->sb_start + offset;
doff = rdev->data_offset;
@@ -279,55 +276,41 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
return 0;
}
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
+ struct page *page, bool wait)
{
- struct md_rdev *rdev;
struct mddev *mddev = bitmap->mddev;
- int ret;
do {
- rdev = NULL;
+ struct md_rdev *rdev = NULL;
+
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
- ret = __write_sb_page(rdev, bitmap, page);
- if (ret)
- return ret;
+ if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
+ set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+ return;
+ }
}
} while (wait && md_super_wait(mddev) < 0);
-
- return 0;
}
static void md_bitmap_file_kick(struct bitmap *bitmap);
-/*
- * write out a page to a file
- */
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
-{
- struct buffer_head *bh;
-
- if (bitmap->storage.file == NULL) {
- switch (write_sb_page(bitmap, page, wait)) {
- case -EINVAL:
- set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
- }
- } else {
-
- bh = page_buffers(page);
- while (bh && bh->b_blocknr) {
- atomic_inc(&bitmap->pending_writes);
- set_buffer_locked(bh);
- set_buffer_mapped(bh);
- submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
- bh = bh->b_this_page;
- }
+#ifdef CONFIG_MD_BITMAP_FILE
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+ struct buffer_head *bh = page_buffers(page);
- if (wait)
- wait_event(bitmap->write_wait,
- atomic_read(&bitmap->pending_writes)==0);
+ while (bh && bh->b_blocknr) {
+ atomic_inc(&bitmap->pending_writes);
+ set_buffer_locked(bh);
+ set_buffer_mapped(bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+ bh = bh->b_this_page;
}
- if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
- md_bitmap_file_kick(bitmap);
+
+ if (wait)
+ wait_event(bitmap->write_wait,
+ atomic_read(&bitmap->pending_writes) == 0);
}
static void end_bitmap_write(struct buffer_head *bh, int uptodate)
@@ -364,10 +347,8 @@ static void free_buffers(struct page *page)
* This usage is similar to how swap files are handled, and allows us
* to write to a file with no concerns of memory allocation failing.
*/
-static int read_page(struct file *file, unsigned long index,
- struct bitmap *bitmap,
- unsigned long count,
- struct page *page)
+static int read_file_page(struct file *file, unsigned long index,
+ struct bitmap *bitmap, unsigned long count, struct page *page)
{
int ret = 0;
struct inode *inode = file_inode(file);
@@ -415,7 +396,6 @@ static int read_page(struct file *file, unsigned long index,
blk_cur++;
bh = bh->b_this_page;
}
- page->index = index;
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
@@ -429,12 +409,46 @@ out:
ret);
return ret;
}
+#else /* CONFIG_MD_BITMAP_FILE */
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+}
+static int read_file_page(struct file *file, unsigned long index,
+ struct bitmap *bitmap, unsigned long count, struct page *page)
+{
+ return -EIO;
+}
+static void free_buffers(struct page *page)
+{
+ put_page(page);
+}
+#endif /* CONFIG_MD_BITMAP_FILE */
/*
* bitmap file superblock operations
*/
/*
+ * write out a page to a file
+ */
+static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
+ bool wait)
+{
+ struct bitmap_storage *store = &bitmap->storage;
+ struct page *page = store->filemap[pg_index];
+
+ if (mddev_is_clustered(bitmap->mddev)) {
+ pg_index += bitmap->cluster_slot *
+ DIV_ROUND_UP(store->bytes, PAGE_SIZE);
+ }
+
+ if (store->file)
+ write_file_page(bitmap, page, wait);
+ else
+ write_sb_page(bitmap, pg_index, page, wait);
+}
+
+/*
* md_bitmap_wait_writes() should be called before writing any bitmap
* blocks, to ensure previous writes, particularly from
* md_bitmap_daemon_work(), have completed.
@@ -488,7 +502,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb);
- write_page(bitmap, bitmap->storage.sb_page, 1);
+
+ if (bitmap->storage.file)
+ write_file_page(bitmap, bitmap->storage.sb_page, 1);
+ else
+ write_sb_page(bitmap, bitmap->storage.sb_index,
+ bitmap->storage.sb_page, 1);
}
EXPORT_SYMBOL(md_bitmap_update_sb);
@@ -540,7 +559,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL)
return -ENOMEM;
- bitmap->storage.sb_page->index = 0;
+ bitmap->storage.sb_index = 0;
sb = kmap_atomic(bitmap->storage.sb_page);
@@ -601,7 +620,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
unsigned long sectors_reserved = 0;
int err = -EINVAL;
struct page *sb_page;
- loff_t offset = bitmap->mddev->bitmap_info.offset;
+ loff_t offset = 0;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024;
@@ -628,7 +647,7 @@ re_read:
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
- offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
+ offset = bitmap->cluster_slot * (bm_blocks << 3);
pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
bitmap->cluster_slot, offset);
}
@@ -637,13 +656,11 @@ re_read:
loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
- err = read_page(bitmap->storage.file, 0,
+ err = read_file_page(bitmap->storage.file, 0,
bitmap, bytes, sb_page);
} else {
- err = read_sb_page(bitmap->mddev,
- offset,
- sb_page,
- 0, sizeof(bitmap_super_t));
+ err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
+ sizeof(bitmap_super_t));
}
if (err)
return err;
@@ -819,7 +836,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
- store->sb_page->index = offset;
+ store->sb_index = offset;
}
for ( ; pnum < num_pages; pnum++) {
@@ -828,7 +845,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
store->file_pages = pnum;
return -ENOMEM;
}
- store->filemap[pnum]->index = pnum + offset;
}
store->file_pages = pnum;
@@ -847,14 +863,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
static void md_bitmap_file_unmap(struct bitmap_storage *store)
{
- struct page **map, *sb_page;
- int pages;
- struct file *file;
-
- file = store->file;
- map = store->filemap;
- pages = store->file_pages;
- sb_page = store->sb_page;
+ struct file *file = store->file;
+ struct page *sb_page = store->sb_page;
+ struct page **map = store->filemap;
+ int pages = store->file_pages;
while (pages--)
if (map[pages] != sb_page) /* 0 is sb_page, release it below */
@@ -879,21 +891,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
*/
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
- char *path, *ptr = NULL;
-
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
md_bitmap_update_sb(bitmap);
if (bitmap->storage.file) {
- path = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (path)
- ptr = file_path(bitmap->storage.file,
- path, PAGE_SIZE);
+ pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
+ bmname(bitmap), bitmap->storage.file);
- pr_warn("%s: kicking failed bitmap file %s from array!\n",
- bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
-
- kfree(path);
} else
pr_warn("%s: disabling internal bitmap due to errors\n",
bmname(bitmap));
@@ -945,6 +949,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
void *kaddr;
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
+ unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev))
@@ -962,9 +967,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
else
set_bit_le(bit, kaddr);
kunmap_atomic(kaddr);
- pr_debug("set file bit %lu page %lu\n", bit, page->index);
+ pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}
static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -974,6 +979,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
void *paddr;
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
+ unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev))
@@ -989,8 +995,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
else
clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
- if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
- set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
+ if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
}
@@ -1042,7 +1048,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
- write_page(bitmap, bitmap->storage.filemap[i], 0);
+ filemap_write_page(bitmap, i, false);
writing = 1;
}
}
@@ -1084,33 +1090,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
EXPORT_SYMBOL(md_bitmap_unplug_async);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
-/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
- * the in-memory bitmap from the on-disk bitmap -- also, sets up the
- * memory mapping of the bitmap file
- * Special cases:
- * if there's no bitmap file, or if the bitmap file had been
- * previously kicked from the array, we mark all the bits as
- * 1's in order to cause a full resync.
+
+/*
+ * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
+ * mapping of the bitmap file.
+ *
+ * Special case: If there's no bitmap file, or if the bitmap file had been
+ * previously kicked from the array, we mark all the bits as 1's in order to
+ * cause a full resync.
*
* We ignore all bits for sectors that end earlier than 'start'.
- * This is used when reading an out-of-date bitmap...
+ * This is used when reading an out-of-date bitmap.
*/
static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
- unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
- struct page *page = NULL;
- unsigned long bit_cnt = 0;
- struct file *file;
- unsigned long offset;
- int outofdate;
- int ret = -ENOSPC;
- void *paddr;
+ bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
+ struct mddev *mddev = bitmap->mddev;
+ unsigned long chunks = bitmap->counts.chunks;
struct bitmap_storage *store = &bitmap->storage;
+ struct file *file = store->file;
+ unsigned long node_offset = 0;
+ unsigned long bit_cnt = 0;
+ unsigned long i;
+ int ret;
- chunks = bitmap->counts.chunks;
- file = store->file;
-
- if (!file && !bitmap->mddev->bitmap_info.offset) {
+ if (!file && !mddev->bitmap_info.offset) {
/* No permanent bitmap - fill with '1s'. */
store->filemap = NULL;
store->file_pages = 0;
@@ -1125,77 +1129,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return 0;
}
- outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
- if (outofdate)
- pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
-
if (file && i_size_read(file->f_mapping->host) < store->bytes) {
pr_warn("%s: bitmap file too short %lu < %lu\n",
bmname(bitmap),
(unsigned long) i_size_read(file->f_mapping->host),
store->bytes);
+ ret = -ENOSPC;
goto err;
}
- oldindex = ~0L;
- offset = 0;
- if (!bitmap->mddev->bitmap_info.external)
- offset = sizeof(bitmap_super_t);
-
- if (mddev_is_clustered(bitmap->mddev))
+ if (mddev_is_clustered(mddev))
node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
- for (i = 0; i < chunks; i++) {
- int b;
- index = file_page_index(&bitmap->storage, i);
- bit = file_page_offset(&bitmap->storage, i);
- if (index != oldindex) { /* this is a new page, read it in */
- int count;
- /* unmap the old page, we're done with it */
- if (index == store->file_pages-1)
- count = store->bytes - index * PAGE_SIZE;
- else
- count = PAGE_SIZE;
- page = store->filemap[index];
- if (file)
- ret = read_page(file, index, bitmap,
- count, page);
- else
- ret = read_sb_page(
- bitmap->mddev,
- bitmap->mddev->bitmap_info.offset,
- page,
- index + node_offset, count);
+ for (i = 0; i < store->file_pages; i++) {
+ struct page *page = store->filemap[i];
+ int count;
- if (ret)
- goto err;
+ /* unmap the old page, we're done with it */
+ if (i == store->file_pages - 1)
+ count = store->bytes - i * PAGE_SIZE;
+ else
+ count = PAGE_SIZE;
- oldindex = index;
+ if (file)
+ ret = read_file_page(file, i, bitmap, count, page);
+ else
+ ret = read_sb_page(mddev, 0, page, i + node_offset,
+ count);
+ if (ret)
+ goto err;
+ }
- if (outofdate) {
- /*
- * if bitmap is out of date, dirty the
- * whole page and write it out
- */
- paddr = kmap_atomic(page);
- memset(paddr + offset, 0xff,
- PAGE_SIZE - offset);
- kunmap_atomic(paddr);
- write_page(bitmap, page, 1);
+ if (outofdate) {
+ pr_warn("%s: bitmap file is out of date, doing full recovery\n",
+ bmname(bitmap));
+ for (i = 0; i < store->file_pages; i++) {
+ struct page *page = store->filemap[i];
+ unsigned long offset = 0;
+ void *paddr;
+
+ if (i == 0 && !mddev->bitmap_info.external)
+ offset = sizeof(bitmap_super_t);
+
+ /*
+ * If the bitmap is out of date, dirty the whole page
+ * and write it out
+ */
+ paddr = kmap_atomic(page);
+ memset(paddr + offset, 0xff, PAGE_SIZE - offset);
+ kunmap_atomic(paddr);
+
+ filemap_write_page(bitmap, i, true);
+ if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
ret = -EIO;
- if (test_bit(BITMAP_WRITE_ERROR,
- &bitmap->flags))
- goto err;
+ goto err;
}
}
+ }
+
+ for (i = 0; i < chunks; i++) {
+ struct page *page = filemap_get_page(&bitmap->storage, i);
+ unsigned long bit = file_page_offset(&bitmap->storage, i);
+ void *paddr;
+ bool was_set;
+
paddr = kmap_atomic(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
- b = test_bit(bit, paddr);
+ was_set = test_bit(bit, paddr);
else
- b = test_bit_le(bit, paddr);
+ was_set = test_bit_le(bit, paddr);
kunmap_atomic(paddr);
- if (b) {
+
+ if (was_set) {
/* if the disk bit is set, set the memory bit */
int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
>= start);
@@ -1204,7 +1210,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
needed);
bit_cnt++;
}
- offset = 0;
}
pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
@@ -1396,9 +1401,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
break;
if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j,
- BITMAP_PAGE_NEEDWRITE)) {
- write_page(bitmap, bitmap->storage.filemap[j], 0);
- }
+ BITMAP_PAGE_NEEDWRITE))
+ filemap_write_page(bitmap, j, false);
}
done:
@@ -2542,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
/*
* Without write mostly device, it doesn't make sense to set
* backlog for max_write_behind.
@@ -2555,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!has_write_mostly) {
pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
mdname(mddev));
+ mddev_unlock(mddev);
return -EINVAL;
}
@@ -2565,13 +2574,13 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_destroy_serial_pool(mddev, NULL, false);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
- struct md_rdev *rdev;
-
rdev_for_each(rdev, mddev)
mddev_create_serial_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
+
+ mddev_unlock(mddev);
return len;
}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 8a3788c9bfef..bb9eb418780a 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -201,6 +201,7 @@ struct bitmap {
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap
* file superblock */
+ unsigned long sb_index;
struct page **filemap; /* list of cache pages for
* the file */
unsigned long *filemap_attr; /* attributes associated
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 3d9fd74233df..1e26eb223349 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes)
return 0;
err:
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
- md_unregister_thread(&cinfo->recovery_thread);
- md_unregister_thread(&cinfo->recv_thread);
+ md_unregister_thread(mddev, &cinfo->recovery_thread);
+ md_unregister_thread(mddev, &cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
@@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev)
resync_bitmap(mddev);
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
- md_unregister_thread(&cinfo->recovery_thread);
- md_unregister_thread(&cinfo->recv_thread);
+ md_unregister_thread(mddev, &cinfo->recovery_thread);
+ md_unregister_thread(mddev, &cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 50ad818978a4..a039e8e20f55 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
failit = 1;
}
}
+
+ md_account_bio(mddev, &bio);
if (failit) {
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
&mddev->bio_set);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4eb72b9dd933..71ac99646827 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ md_account_bio(mddev, &bio);
bio_set_dev(bio, tmp_dev->rdev->bdev);
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
start_sector + data_offset;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 92c45be203d7..d22276870283 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
&& md_flush_request(mddev, bio))
return true;
+ md_account_bio(mddev, &bio);
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 78be7811a89f..0fe7ab6e8ab9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev)
mddev->pers->prepare_suspend(mddev);
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
- mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
@@ -465,14 +464,15 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- /* entred the memalloc scope from mddev_suspend() */
- memalloc_noio_restore(mddev->noio_flag);
lockdep_assert_held(&mddev->reconfig_mutex);
if (--mddev->suspended)
return;
+
+ /* entred the memalloc scope from mddev_suspend() */
+ memalloc_noio_restore(mddev->noio_flag);
+
percpu_ref_resurrect(&mddev->active_io);
wake_up(&mddev->sb_wait);
- mddev->pers->quiesce(mddev, 0);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
+ mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -650,6 +651,7 @@ void mddev_init(struct mddev *mddev)
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
+ atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
@@ -2304,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev)
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
(mddev->level != 1 && mddev->level != 10 &&
- bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+ bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
/*
* No need to handle the failure of bioset_integrity_create,
* because the function is called by md_run() -> pers->run(),
@@ -4747,6 +4749,62 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type);
}
+static void stop_sync_thread(struct mddev *mddev)
+{
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return;
+
+ if (mddev_lock(mddev))
+ return;
+
+ /*
+ * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
+ * held.
+ */
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ mddev_unlock(mddev);
+ return;
+ }
+
+ if (work_pending(&mddev->del_work))
+ flush_workqueue(md_misc_wq);
+
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ /*
+ * Thread might be blocked waiting for metadata update which will now
+ * never happen
+ */
+ md_wakeup_thread_directly(mddev->sync_thread);
+
+ mddev_unlock(mddev);
+}
+
+static void idle_sync_thread(struct mddev *mddev)
+{
+ int sync_seq = atomic_read(&mddev->sync_seq);
+
+ mutex_lock(&mddev->sync_mutex);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev);
+
+ wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
+ !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+
+ mutex_unlock(&mddev->sync_mutex);
+}
+
+static void frozen_sync_thread(struct mddev *mddev)
+{
+ mutex_lock(&mddev->sync_mutex);
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev);
+
+ wait_event(resync_wait, mddev->sync_thread == NULL &&
+ !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+
+ mutex_unlock(&mddev->sync_mutex);
+}
+
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
@@ -4754,35 +4812,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
- if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
- if (cmd_match(page, "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
- mddev_lock(mddev) == 0) {
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- sector_t save_rp = mddev->reshape_position;
-
- mddev_unlock(mddev);
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(&mddev->sync_thread);
- mddev_lock_nointr(mddev);
- /*
- * set RECOVERY_INTR again and restore reshape
- * position in case others changed them after
- * got lock, eg, reshape_position_store and
- * md_check_recovery.
- */
- mddev->reshape_position = save_rp;
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
- }
- mddev_unlock(mddev);
- }
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (cmd_match(page, "idle"))
+ idle_sync_thread(mddev);
+ else if (cmd_match(page, "frozen"))
+ frozen_sync_thread(mddev);
+ else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5842,6 +5876,13 @@ int md_run(struct mddev *mddev)
goto exit_bio_set;
}
+ if (!bioset_initialized(&mddev->io_clone_set)) {
+ err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_clone, bio_clone), 0);
+ if (err)
+ goto exit_sync_set;
+ }
+
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) {
@@ -6019,6 +6060,8 @@ bitmap_abort:
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
+ bioset_exit(&mddev->io_clone_set);
+exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
@@ -6176,7 +6219,6 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
@@ -6216,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- md_unregister_thread(&mddev->thread);
+ md_unregister_thread(mddev, &mddev->thread);
if (mddev->queue)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
}
@@ -6243,6 +6285,7 @@ static void __md_stop(struct mddev *mddev)
percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
+ bioset_exit(&mddev->io_clone_set);
}
void md_stop(struct mddev *mddev)
@@ -7012,6 +7055,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */
+
+ if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
+ pr_warn("%s: bitmap files not supported by this kernel\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ pr_warn("%s: using deprecated bitmap file support\n",
+ mdname(mddev));
+
f = fget(fd);
if (f == NULL) {
@@ -7940,9 +7992,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
}
EXPORT_SYMBOL(md_register_thread);
-void md_unregister_thread(struct md_thread __rcu **threadp)
+void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
{
- struct md_thread *thread = rcu_dereference_protected(*threadp, true);
+ struct md_thread *thread = rcu_dereference_protected(*threadp,
+ lockdep_is_held(&mddev->reconfig_mutex));
if (!thread)
return;
@@ -8601,62 +8654,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
-int acct_bioset_init(struct mddev *mddev)
+static void md_end_clone_io(struct bio *bio)
{
- int err = 0;
-
- if (!bioset_initialized(&mddev->io_acct_set))
- err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
- offsetof(struct md_io_acct, bio_clone), 0);
- return err;
-}
-EXPORT_SYMBOL_GPL(acct_bioset_init);
-
-void acct_bioset_exit(struct mddev *mddev)
-{
- bioset_exit(&mddev->io_acct_set);
-}
-EXPORT_SYMBOL_GPL(acct_bioset_exit);
-
-static void md_end_io_acct(struct bio *bio)
-{
- struct md_io_acct *md_io_acct = bio->bi_private;
- struct bio *orig_bio = md_io_acct->orig_bio;
- struct mddev *mddev = md_io_acct->mddev;
+ struct md_io_clone *md_io_clone = bio->bi_private;
+ struct bio *orig_bio = md_io_clone->orig_bio;
+ struct mddev *mddev = md_io_clone->mddev;
orig_bio->bi_status = bio->bi_status;
- bio_end_io_acct(orig_bio, md_io_acct->start_time);
+ if (md_io_clone->start_time)
+ bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
bio_put(bio);
bio_endio(orig_bio);
-
percpu_ref_put(&mddev->active_io);
}
-/*
- * Used by personalities that don't already clone the bio and thus can't
- * easily add the timestamp to their extended bio structure.
- */
-void md_account_bio(struct mddev *mddev, struct bio **bio)
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
{
struct block_device *bdev = (*bio)->bi_bdev;
- struct md_io_acct *md_io_acct;
- struct bio *clone;
-
- if (!blk_queue_io_stat(bdev->bd_disk->queue))
- return;
+ struct md_io_clone *md_io_clone;
+ struct bio *clone =
+ bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
+
+ md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+ md_io_clone->orig_bio = *bio;
+ md_io_clone->mddev = mddev;
+ if (blk_queue_io_stat(bdev->bd_disk->queue))
+ md_io_clone->start_time = bio_start_io_acct(*bio);
+
+ clone->bi_end_io = md_end_clone_io;
+ clone->bi_private = md_io_clone;
+ *bio = clone;
+}
+void md_account_bio(struct mddev *mddev, struct bio **bio)
+{
percpu_ref_get(&mddev->active_io);
-
- clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
- md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
- md_io_acct->orig_bio = *bio;
- md_io_acct->start_time = bio_start_io_acct(*bio);
- md_io_acct->mddev = mddev;
-
- clone->bi_end_io = md_end_io_acct;
- clone->bi_private = md_io_acct;
- *bio = clone;
+ md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio);
@@ -9329,7 +9364,6 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9358,17 +9392,24 @@ void md_check_recovery(struct mddev *mddev)
if (mddev->sb_flags)
md_update_sb(mddev, 0);
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
- /* resync/recovery still happening */
- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- goto unlock;
- }
- if (mddev->sync_thread) {
- md_unregister_thread(&mddev->sync_thread);
+ /*
+ * Never start a new sync thread if MD_RECOVERY_RUNNING is
+ * still set.
+ */
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+ /* resync/recovery still happening */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+
+ if (WARN_ON_ONCE(!mddev->sync_thread))
+ goto unlock;
+
md_reap_sync_thread(mddev);
goto unlock;
}
+
/* Set RUNNING before clearing NEEDED to avoid
* any transients in the value of "sync_action".
*/
@@ -9445,7 +9486,10 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
- /* sync_thread should be unregistered, collect result */
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev, &mddev->sync_thread);
+ atomic_inc(&mddev->sync_seq);
+
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
@@ -9490,7 +9534,6 @@ void md_reap_sync_thread(struct mddev *mddev)
if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
md_cluster_ops->update_size(mddev, old_dev_sectors);
- wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9498,6 +9541,7 @@ void md_reap_sync_thread(struct mddev *mddev)
md_new_event();
if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work);
+ wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1aef86bf3fc3..9bcb77bca963 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
- struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
+ struct bio_set io_clone_set;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -535,6 +535,11 @@ struct mddev {
*/
struct list_head deleting;
+ /* Used to synchronize idle and frozen for action_store() */
+ struct mutex sync_mutex;
+ /* The sequence number for sync thread */
+ atomic_t sync_seq;
+
bool has_superblocks:1;
bool fail_last_dev:1;
bool serialize_policy:1;
@@ -731,7 +736,7 @@ struct md_thread {
void *private;
};
-struct md_io_acct {
+struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
@@ -756,7 +761,7 @@ extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread),
struct mddev *mddev,
const char *name);
-extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
@@ -769,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
-int acct_bioset_init(struct mddev *mddev);
-void acct_bioset_exit(struct mddev *mddev);
void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d1ac73fcd852..c50a7abda744 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
struct r0conf *conf = priv;
free_conf(mddev, conf);
- acct_bioset_exit(mddev);
}
static int raid0_run(struct mddev *mddev)
@@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
- if (acct_bioset_init(mddev)) {
- pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
- return -ENOMEM;
- }
-
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
if (ret < 0)
- goto exit_acct_set;
+ return ret;
mddev->private = conf;
}
conf = mddev->private;
@@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
- goto free;
+ free_conf(mddev, conf);
return ret;
-
-free:
- free_conf(mddev, conf);
-exit_acct_set:
- acct_bioset_exit(mddev);
- return ret;
}
/*
@@ -557,54 +545,20 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
}
-static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
- sector_t bio_sector;
- sector_t sector;
- sector_t orig_sector;
- unsigned chunk_sects;
- unsigned sectors;
-
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)
- && md_flush_request(mddev, bio))
- return true;
-
- if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
- raid0_handle_discard(mddev, bio);
- return true;
- }
-
- bio_sector = bio->bi_iter.bi_sector;
- sector = bio_sector;
- chunk_sects = mddev->chunk_sectors;
-
- sectors = chunk_sects -
- (likely(is_power_of_2(chunk_sects))
- ? (sector & (chunk_sects-1))
- : sector_div(sector, chunk_sects));
-
- /* Restore due to sector_div */
- sector = bio_sector;
-
- if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO,
- &mddev->bio_set);
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
- }
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+ sector_t sector = bio_sector;
- if (bio->bi_pool != &mddev->bio_set)
- md_account_bio(mddev, &bio);
+ md_account_bio(mddev, &bio);
- orig_sector = sector;
zone = find_zone(mddev->private, &sector);
switch (conf->layout) {
case RAID0_ORIG_LAYOUT:
- tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
break;
case RAID0_ALT_MULTIZONE_LAYOUT:
tmp_dev = map_sector(mddev, zone, sector, &sector);
@@ -612,13 +566,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
default:
WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
bio_io_error(bio);
- return true;
+ return;
}
if (unlikely(is_rdev_broken(tmp_dev))) {
bio_io_error(bio);
md_error(mddev, tmp_dev);
- return true;
+ return;
}
bio_set_dev(bio, tmp_dev->bdev);
@@ -630,6 +584,40 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio_sector);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
+ sector_t sector;
+ unsigned chunk_sects;
+ unsigned sectors;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
+ return true;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+ raid0_handle_discard(mddev, bio);
+ return true;
+ }
+
+ sector = bio->bi_iter.bi_sector;
+ chunk_sects = mddev->chunk_sectors;
+
+ sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
+
+ if (sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ &mddev->bio_set);
+ bio_chain(split, bio);
+ raid0_map_submit_bio(mddev, bio);
+ bio = split;
+ }
+
+ raid0_map_submit_bio(mddev, bio);
return true;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index dd25832eb045..4b30a1742162 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- bio_end_io_acct(bio, r1_bio->start_time);
bio_endio(bio);
}
@@ -313,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
struct r1conf *conf = r1_bio->mddev->private;
+ sector_t sector = r1_bio->sector;
/* if nobody has done the final endio yet, do it now */
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -323,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
call_bio_endio(r1_bio);
}
+
+ free_r1bio(r1_bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle. All I/Os, even write-behind writes, are done.
*/
- allow_barrier(conf, r1_bio->sector);
-
- free_r1bio(r1_bio);
+ allow_barrier(conf, sector);
}
/*
@@ -791,11 +790,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
+static void wake_up_barrier(struct r1conf *conf)
+{
+ if (wq_has_sleeper(&conf->wait_barrier))
+ wake_up(&conf->wait_barrier);
+}
+
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
raid1_prepare_flush_writes(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -972,7 +977,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
* In case freeze_array() is waiting for
* get_unqueued_pending() == extra
*/
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
/* Wait for the barrier in same barrier unit bucket to drop. */
/* Return false when nowait flag is set */
@@ -1015,7 +1020,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa
* In case freeze_array() is waiting for
* get_unqueued_pending() == extra
*/
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
/* Wait for array to be unfrozen */
/* Return false when nowait flag is set */
@@ -1044,7 +1049,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
static void _allow_barrier(struct r1conf *conf, int idx)
{
atomic_dec(&conf->nr_pending[idx]);
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
}
static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
@@ -1173,7 +1178,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1303,10 +1308,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->read_disk = rdisk;
-
- if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r1_bio->start_time = bio_start_io_acct(bio);
-
+ if (!r1bio_existed) {
+ md_account_bio(mddev, &bio);
+ r1_bio->master_bio = bio;
+ }
read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
&mddev->bio_set);
@@ -1373,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
return;
}
+ retry_write:
r1_bio = alloc_r1bio(mddev, bio);
r1_bio->sectors = max_write_sectors;
@@ -1388,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
disks = conf->raid_disks * 2;
- retry_write:
blocked_rdev = NULL;
rcu_read_lock();
max_sectors = r1_bio->sectors;
@@ -1468,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
- r1_bio->state = 0;
+ free_r1bio(r1_bio);
allow_barrier(conf, bio->bi_iter.bi_sector);
if (bio->bi_opf & REQ_NOWAIT) {
@@ -1500,8 +1505,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sectors = max_sectors;
}
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r1_bio->start_time = bio_start_io_acct(bio);
+ md_account_bio(mddev, &bio);
+ r1_bio->master_bio = bio;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
@@ -1518,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
- if (bitmap &&
- test_bit(WriteMostly, &rdev->flags) &&
+ if (bitmap && write_behind &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) {
@@ -1576,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio_write_done(r1_bio);
/* In case raid1d snuck in to freeze_array */
- wake_up(&conf->wait_barrier);
+ wake_up_barrier(conf);
}
static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
@@ -1766,7 +1770,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
int err = -EEXIST;
- int mirror = 0;
+ int mirror = 0, repl_slot = -1;
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
@@ -1809,17 +1813,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
- p[conf->raid_disks].rdev == NULL) {
- /* Add this device as a replacement */
- clear_bit(In_sync, &rdev->flags);
- set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = mirror;
- err = 0;
- conf->fullsync = 1;
- rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
- break;
- }
+ p[conf->raid_disks].rdev == NULL && repl_slot < 0)
+ repl_slot = mirror;
}
+
+ if (err && repl_slot >= 0) {
+ /* Add this device as a replacement */
+ p = conf->mirrors + repl_slot;
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = repl_slot;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ }
+
print_conf(conf);
return err;
}
@@ -1829,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
+
+ if (unlikely(number >= conf->raid_disks))
+ goto abort;
+
struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev)
@@ -2299,7 +2311,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d++;
if (d == conf->raid_disks * 2)
d = 0;
- } while (!success && d != read_disk);
+ } while (d != read_disk);
if (!success) {
/* Cannot read from anywhere - mark it bad */
@@ -2498,6 +2510,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
struct mddev *mddev = conf->mddev;
struct bio *bio;
struct md_rdev *rdev;
+ sector_t sector;
clear_bit(R1BIO_ReadError, &r1_bio->state);
/* we got a read error. Maybe the drive is bad. Maybe just
@@ -2527,12 +2540,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
rdev_dec_pending(rdev, conf->mddev);
- allow_barrier(conf, r1_bio->sector);
+ sector = r1_bio->sector;
bio = r1_bio->master_bio;
/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
r1_bio->state = 0;
raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
+ allow_barrier(conf, sector);
}
static void raid1d(struct md_thread *thread)
@@ -3144,7 +3158,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active
*/
if (conf->raid_disks - mddev->degraded < 1) {
- md_unregister_thread(&conf->thread);
+ md_unregister_thread(mddev, &conf->thread);
ret = -EINVAL;
goto abort;
}
@@ -3171,7 +3185,7 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
- md_unregister_thread(&mddev->thread);
+ md_unregister_thread(mddev, &mddev->thread);
goto abort;
}
return 0;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 468f189da7a0..14d4211a123a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -157,7 +157,6 @@ struct r1bio {
sector_t sector;
int sectors;
unsigned long state;
- unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5051149e27bb..023413120851 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
- if (r10_bio->start_time)
- bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
@@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
}
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
- struct r10bio *r10_bio)
+ struct r10bio *r10_bio, bool io_accounting)
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
@@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- if (!r10_bio->start_time &&
- blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r10_bio->start_time = bio_start_io_acct(bio);
+ if (io_accounting) {
+ md_account_bio(mddev, &bio);
+ r10_bio->master_bio = bio;
+ }
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
@@ -1322,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
}
}
+static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
+ struct md_rdev **prrdev)
+{
+ struct md_rdev *rdev, *rrdev;
+
+ rrdev = rcu_dereference(mirror->replacement);
+ /*
+ * Read replacement first to prevent reading both rdev and
+ * replacement as NULL during replacement replace rdev.
+ */
+ smp_mb();
+ rdev = rcu_dereference(mirror->rdev);
+ if (rdev == rrdev)
+ rrdev = NULL;
+
+ *prrdev = rrdev;
+ return rdev;
+}
+
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
int i;
@@ -1332,11 +1350,9 @@ retry_wait:
blocked_rdev = NULL;
rcu_read_lock();
for (i = 0; i < conf->copies; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[i].replacement);
- if (rdev == rrdev)
- rrdev = NULL;
+ struct md_rdev *rdev, *rrdev;
+
+ rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
@@ -1465,15 +1481,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev;
- rrdev = rcu_dereference(conf->mirrors[d].replacement);
- /*
- * Read replacement first to prevent reading both rdev and
- * replacement as NULL during replacement replace rdev.
- */
- smp_mb();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev == rrdev)
- rrdev = NULL;
+ rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1543,8 +1551,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio;
}
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r10_bio->start_time = bio_start_io_acct(bio);
+ md_account_bio(mddev, &bio);
+ r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@@ -1571,12 +1579,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
- r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
if (bio_data_dir(bio) == READ)
- raid10_read_request(mddev, bio, r10_bio);
+ raid10_read_request(mddev, bio, r10_bio, true);
else
raid10_write_request(mddev, bio, r10_bio);
}
@@ -1780,10 +1787,9 @@ retry_discard:
*/
rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[disk].replacement);
+ struct md_rdev *rdev, *rrdev;
+ rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL;
@@ -2720,10 +2726,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
int sect = 0; /* Offset from r10_bio->sector */
- int sectors = r10_bio->sectors;
+ int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
- int d = r10_bio->devs[r10_bio->read_slot].devnum;
+ int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot
* have been cleared recently.
@@ -2744,13 +2750,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
pr_notice("md/raid10:%s: %pg: Failing raid device\n",
mdname(mddev), rdev->bdev);
md_error(mddev, rdev);
- r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
+ r10_bio->devs[slot].bio = IO_BLOCKED;
return;
}
while(sectors) {
int s = sectors;
- int sl = r10_bio->read_slot;
+ int sl = slot;
int success = 0;
int start;
@@ -2785,7 +2791,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl++;
if (sl == conf->copies)
sl = 0;
- } while (!success && sl != r10_bio->read_slot);
+ } while (sl != slot);
rcu_read_unlock();
if (!success) {
@@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
* as bad on the first device to discourage future
* reads.
*/
- int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+ int dn = r10_bio->devs[slot].devnum;
rdev = conf->mirrors[dn].rdev;
if (!rdev_set_badblocks(
rdev,
- r10_bio->devs[r10_bio->read_slot].addr
+ r10_bio->devs[slot].addr
+ sect,
s, 0)) {
md_error(mddev, rdev);
- r10_bio->devs[r10_bio->read_slot].bio
+ r10_bio->devs[slot].bio
= IO_BLOCKED;
}
break;
@@ -2811,7 +2817,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
start = sl;
/* write it back and re-read */
rcu_read_lock();
- while (sl != r10_bio->read_slot) {
+ while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
@@ -2845,7 +2851,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rcu_read_lock();
}
sl = start;
- while (sl != r10_bio->read_slot) {
+ while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
@@ -2985,7 +2991,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
rdev_dec_pending(rdev, mddev);
r10_bio->state = 0;
- raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
@@ -4314,7 +4320,7 @@ static int raid10_run(struct mddev *mddev)
return 0;
out_free_conf:
- md_unregister_thread(&mddev->thread);
+ md_unregister_thread(mddev, &mddev->thread);
raid10_free_conf(conf);
mddev->private = NULL;
out:
@@ -4411,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
rdev->new_raid_disk = rdev->raid_disk * 2;
rdev->sectors = size;
}
- WRITE_ONCE(conf->barrier, 1);
}
return conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..2e75e88d0802 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,6 @@ struct r10bio {
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state;
- unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 47ba7d9e81e1..518b7cfa78b9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio)
if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev);
+ bio_uninit(bio);
spin_lock_irqsave(&log->io_list_lock, flags);
list_for_each_entry(io, &log->flushing_ios, log_sibling)
r5l_io_run_stripes(io);
list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
spin_unlock_irqrestore(&log->io_list_lock, flags);
-
- bio_uninit(bio);
}
/*
@@ -3168,12 +3167,15 @@ void r5l_exit_log(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
- /* Ensure disable_writeback_work wakes up and exits */
- wake_up(&conf->mddev->sb_wait);
- flush_work(&log->disable_writeback_work);
- md_unregister_thread(&log->reclaim_thread);
+ md_unregister_thread(conf->mddev, &log->reclaim_thread);
+ /*
+ * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
+ * ensure disable_writeback_work wakes up and exits.
+ */
conf->log = NULL;
+ wake_up(&conf->mddev->sb_wait);
+ flush_work(&log->disable_writeback_work);
mempool_exit(&log->meta_pool);
bioset_exit(&log->bs);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 85b3004594e0..4cb9c608ee19 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/
static void raid5_align_endio(struct bio *bi)
{
- struct md_io_acct *md_io_acct = bi->bi_private;
- struct bio *raid_bi = md_io_acct->orig_bio;
- struct mddev *mddev;
- struct r5conf *conf;
- struct md_rdev *rdev;
+ struct bio *raid_bi = bi->bi_private;
+ struct md_rdev *rdev = (void *)raid_bi->bi_next;
+ struct mddev *mddev = rdev->mddev;
+ struct r5conf *conf = mddev->private;
blk_status_t error = bi->bi_status;
- unsigned long start_time = md_io_acct->start_time;
bio_put(bi);
-
- rdev = (void*)raid_bi->bi_next;
raid_bi->bi_next = NULL;
- mddev = rdev->mddev;
- conf = mddev->private;
-
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
- bio_end_io_acct(raid_bi, start_time);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
- struct md_io_acct *md_io_acct;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
- align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
- &mddev->io_acct_set);
- md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+ md_account_bio(mddev, &raid_bio);
raid_bio->bi_next = (void *)rdev;
- if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
- md_io_acct->start_time = bio_start_io_acct(raid_bio);
- md_io_acct->orig_bio = raid_bio;
+ align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+ &mddev->bio_set);
align_bio->bi_end_io = raid5_align_endio;
- align_bio->bi_private = md_io_acct;
+ align_bio->bi_private = raid_bio;
align_bio->bi_iter.bi_sector = sector;
/* No reshape active, so we can trust rdev->data_offset */
@@ -7787,19 +7774,12 @@ static int raid5_run(struct mddev *mddev)
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
- int i, ret = 0;
+ int i;
long long min_offset_diff = 0;
int first = 1;
- if (acct_bioset_init(mddev)) {
- pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
+ if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
- }
-
- if (mddev_init_writes_pending(mddev) < 0) {
- ret = -ENOMEM;
- goto exit_acct_set;
- }
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7830,8 +7810,7 @@ static int raid5_run(struct mddev *mddev)
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
if (mddev->reshape_position != MaxSector) {
@@ -7856,15 +7835,13 @@ static int raid5_run(struct mddev *mddev)
if (journal_dev) {
pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
if (mddev->new_level != mddev->level) {
pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
@@ -7880,8 +7857,7 @@ static int raid5_run(struct mddev *mddev)
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
@@ -7903,8 +7879,7 @@ static int raid5_run(struct mddev *mddev)
else if (mddev->ro == 0) {
pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
} else if (mddev->reshape_backwards
? (here_new * chunk_sectors + min_offset_diff <=
@@ -7914,8 +7889,7 @@ static int raid5_run(struct mddev *mddev)
/* Reading from the same stripe as writing to - bad */
pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */
@@ -7939,10 +7913,8 @@ static int raid5_run(struct mddev *mddev)
else
conf = mddev->private;
- if (IS_ERR(conf)) {
- ret = PTR_ERR(conf);
- goto exit_acct_set;
- }
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) {
@@ -8135,15 +8107,12 @@ static int raid5_run(struct mddev *mddev)
return 0;
abort:
- md_unregister_thread(&mddev->thread);
+ md_unregister_thread(mddev, &mddev->thread);
print_raid5_conf(conf);
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
- ret = -EIO;
-exit_acct_set:
- acct_bioset_exit(mddev);
- return ret;
+ return -EIO;
}
static void raid5_free(struct mddev *mddev, void *priv)
@@ -8151,7 +8120,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
struct r5conf *conf = priv;
free_conf(conf);
- acct_bioset_exit(mddev);
mddev->to_remove = &raid5_attrs_group;
}
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d39f3219358b..d8ff796fd5f2 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
goto out_free_meta;
}
- bip->bip_iter.bi_size = len;
bip->bip_iter.bi_sector = seed;
ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf));
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 2733e0158585..468833675cc9 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
return PTR_ERR(bip);
}
- bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
/* virtual start sector must be in integrity interval units */
bip_set_seed(bip, bio->bi_iter.bi_sector >>
(bi->interval_exp - SECTOR_SHIFT));
- resid = bip->bip_iter.bi_size;
+ resid = bio_integrity_bytes(bi, bio_sectors(bio));
while (resid > 0 && sg_miter_next(miter)) {
len = min_t(size_t, miter->length, resid);
rc = bio_integrity_add_page(bio, miter->page, len,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ad9afae49544..59176946ab56 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -300,11 +300,6 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
cmd->budget_token = -1;
}
-static void scsi_kick_queue(struct request_queue *q)
-{
- blk_mq_run_hw_queues(q, false);
-}
-
/*
* Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with
* interrupts disabled.
@@ -340,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
* but in most cases, we will be first. Ideally, each LU on the
* target would get some limited time or requests on the target.
*/
- scsi_kick_queue(current_sdev->request_queue);
+ blk_mq_run_hw_queues(current_sdev->request_queue,
+ shost->queuecommand_may_block);
spin_lock_irqsave(shost->host_lock, flags);
if (!starget->starget_sdev_user)
@@ -427,7 +423,7 @@ static void scsi_starved_list_run(struct Scsi_Host *shost)
continue;
spin_unlock_irqrestore(shost->host_lock, flags);
- scsi_kick_queue(slq);
+ blk_mq_run_hw_queues(slq, false);
blk_put_queue(slq);
spin_lock_irqsave(shost->host_lock, flags);
@@ -452,8 +448,8 @@ static void scsi_run_queue(struct request_queue *q)
if (!list_empty(&sdev->host->starved_list))
scsi_starved_list_run(sdev->host);
+ /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */
blk_mq_kick_requeue_list(q);
- blk_mq_run_hw_queues(q, false);
}
void scsi_requeue_run_queue(struct work_struct *work)
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 3d1b511ea284..a7050f63b7cc 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
return PTR_ERR(bip);
}
- bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
/* virtual start sector must be in integrity interval units */
bip_set_seed(bip, bio->bi_iter.bi_sector >>
(bi->interval_exp - SECTOR_SHIFT));
@@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size,
(unsigned long long)bip->bip_iter.bi_sector);
- resid = bip->bip_iter.bi_size;
+ resid = bio_integrity_bytes(bi, bio_sectors(bio));
while (resid > 0 && sg_miter_next(miter)) {
len = min_t(size_t, miter->length, resid);