summaryrefslogtreecommitdiff
path: root/drivers/block/ublk_drv.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/ublk_drv.c')
-rw-r--r--drivers/block/ublk_drv.c302
1 files changed, 285 insertions, 17 deletions
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6a4a94b4cdf4..2651bf41dde3 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -49,7 +49,9 @@
/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \
- | UBLK_F_NEED_GET_DATA)
+ | UBLK_F_NEED_GET_DATA \
+ | UBLK_F_USER_RECOVERY \
+ | UBLK_F_USER_RECOVERY_REISSUE)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
@@ -119,7 +121,7 @@ struct ublk_queue {
unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz;
- bool abort_work_pending;
+ bool force_abort;
unsigned short nr_io_ready; /* how many ios setup */
struct ublk_device *dev;
struct ublk_io ios[0];
@@ -161,6 +163,7 @@ struct ublk_device {
* monitor each queue's daemon periodically
*/
struct delayed_work monitor_work;
+ struct work_struct quiesce_work;
struct work_struct stop_work;
};
@@ -323,6 +326,30 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
PAGE_SIZE);
}
+static inline bool ublk_queue_can_use_recovery_reissue(
+ struct ublk_queue *ubq)
+{
+ if ((ubq->flags & UBLK_F_USER_RECOVERY) &&
+ (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
+ return true;
+ return false;
+}
+
+static inline bool ublk_queue_can_use_recovery(
+ struct ublk_queue *ubq)
+{
+ if (ubq->flags & UBLK_F_USER_RECOVERY)
+ return true;
+ return false;
+}
+
+static inline bool ublk_can_use_recovery(struct ublk_device *ub)
+{
+ if (ub->dev_info.flags & UBLK_F_USER_RECOVERY)
+ return true;
+ return false;
+}
+
static void ublk_free_disk(struct gendisk *disk)
{
struct ublk_device *ub = disk->private_data;
@@ -612,13 +639,17 @@ static void ublk_complete_rq(struct request *req)
* Also aborting may not be started yet, keep in mind that one failed
* request may be issued by block layer again.
*/
-static void __ublk_fail_req(struct ublk_io *io, struct request *req)
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+ struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
io->flags |= UBLK_IO_FLAG_ABORTED;
- blk_mq_end_request(req, BLK_STS_IOERR);
+ if (ublk_queue_can_use_recovery_reissue(ubq))
+ blk_mq_requeue_request(req, false);
+ else
+ blk_mq_end_request(req, BLK_STS_IOERR);
}
}
@@ -639,22 +670,40 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res)
#define UBLK_REQUEUE_DELAY_MS 3
+static inline void __ublk_abort_rq(struct ublk_queue *ubq,
+ struct request *rq)
+{
+ /* We cannot process this rq so just requeue it. */
+ if (ublk_queue_can_use_recovery(ubq))
+ blk_mq_requeue_request(rq, false);
+ else
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+
+ mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
+}
+
static inline void __ublk_rq_task_work(struct request *req)
{
struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_device *ub = ubq->dev;
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
- bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq);
unsigned int mapped_bytes;
pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
- if (unlikely(task_exiting)) {
- blk_mq_end_request(req, BLK_STS_IOERR);
- mod_delayed_work(system_wq, &ub->monitor_work, 0);
+ /*
+ * Task is exiting if either:
+ *
+ * (1) current != ubq_daemon.
+ * io_uring_cmd_complete_in_task() tries to run task_work
+ * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
+ *
+ * (2) current->flags & PF_EXITING.
+ */
+ if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+ __ublk_abort_rq(ubq, req);
return;
}
@@ -739,13 +788,24 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
res = ublk_setup_iod(ubq, rq);
if (unlikely(res != BLK_STS_OK))
return BLK_STS_IOERR;
+ /* With recovery feature enabled, force_abort is set in
+ * ublk_stop_dev() before calling del_gendisk(). We have to
+ * abort all requeued and new rqs here to let del_gendisk()
+ * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
+ * to avoid UAF on io_uring ctx.
+ *
+ * Note: force_abort is guaranteed to be seen because it is set
+ * before request queue is unqiuesced.
+ */
+ if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
+ return BLK_STS_IOERR;
blk_mq_start_request(bd->rq);
if (unlikely(ubq_daemon_is_dying(ubq))) {
fail:
- mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
- return BLK_STS_IOERR;
+ __ublk_abort_rq(ubq, rq);
+ return BLK_STS_OK;
}
if (ublk_can_use_task_work(ubq)) {
@@ -916,7 +976,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
if (rq)
- __ublk_fail_req(io, rq);
+ __ublk_fail_req(ubq, io, rq);
}
}
ublk_put_device(ub);
@@ -932,7 +992,10 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
struct ublk_queue *ubq = ublk_get_queue(ub, i);
if (ubq_daemon_is_dying(ubq)) {
- schedule_work(&ub->stop_work);
+ if (ublk_queue_can_use_recovery(ubq))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
/* abort queue is for making forward progress */
ublk_abort_queue(ub, ubq);
@@ -940,12 +1003,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
}
/*
- * We can't schedule monitor work after ublk_remove() is started.
+ * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
+ * after ublk_remove() or __ublk_quiesce_dev() is started.
*
* No need ub->mutex, monitor work are canceled after state is marked
- * as DEAD, so DEAD state is observed reliably.
+ * as not LIVE, so new state is observed reliably.
*/
- if (ub->dev_info.state != UBLK_S_DEV_DEAD)
+ if (ub->dev_info.state == UBLK_S_DEV_LIVE)
schedule_delayed_work(&ub->monitor_work,
UBLK_DAEMON_MONITOR_PERIOD);
}
@@ -982,12 +1046,97 @@ static void ublk_cancel_dev(struct ublk_device *ub)
ublk_cancel_queue(ublk_get_queue(ub, i));
}
-static void ublk_stop_dev(struct ublk_device *ub)
+static bool ublk_check_inflight_rq(struct request *rq, void *data)
+{
+ bool *idle = data;
+
+ if (blk_mq_request_started(rq)) {
+ *idle = false;
+ return false;
+ }
+ return true;
+}
+
+static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
{
+ bool idle;
+
+ WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
+ while (true) {
+ idle = true;
+ blk_mq_tagset_busy_iter(&ub->tag_set,
+ ublk_check_inflight_rq, &idle);
+ if (idle)
+ break;
+ msleep(UBLK_REQUEUE_DELAY_MS);
+ }
+}
+
+static void __ublk_quiesce_dev(struct ublk_device *ub)
+{
+ pr_devel("%s: quiesce ub: dev_id %d state %s\n",
+ __func__, ub->dev_info.dev_id,
+ ub->dev_info.state == UBLK_S_DEV_LIVE ?
+ "LIVE" : "QUIESCED");
+ blk_mq_quiesce_queue(ub->ub_disk->queue);
+ ublk_wait_tagset_rqs_idle(ub);
+ ub->dev_info.state = UBLK_S_DEV_QUIESCED;
+ ublk_cancel_dev(ub);
+ /* we are going to release task_struct of ubq_daemon and resets
+ * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
+ * Besides, monitor_work is not necessary in QUIESCED state since we have
+ * already scheduled quiesce_work and quiesced all ubqs.
+ *
+ * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
+ * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
+ */
+ cancel_delayed_work_sync(&ub->monitor_work);
+}
+
+static void ublk_quiesce_work_fn(struct work_struct *work)
+{
+ struct ublk_device *ub =
+ container_of(work, struct ublk_device, quiesce_work);
+
mutex_lock(&ub->mutex);
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
goto unlock;
+ __ublk_quiesce_dev(ub);
+ unlock:
+ mutex_unlock(&ub->mutex);
+}
+
+static void ublk_unquiesce_dev(struct ublk_device *ub)
+{
+ int i;
+
+ pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
+ __func__, ub->dev_info.dev_id,
+ ub->dev_info.state == UBLK_S_DEV_LIVE ?
+ "LIVE" : "QUIESCED");
+ /* quiesce_work has run. We let requeued rqs be aborted
+ * before running fallback_wq. "force_abort" must be seen
+ * after request queue is unqiuesced. Then del_gendisk()
+ * can move on.
+ */
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_get_queue(ub, i)->force_abort = true;
+
+ blk_mq_unquiesce_queue(ub->ub_disk->queue);
+ /* We may have requeued some rqs in ublk_quiesce_queue() */
+ blk_mq_kick_requeue_list(ub->ub_disk->queue);
+}
+static void ublk_stop_dev(struct ublk_device *ub)
+{
+ mutex_lock(&ub->mutex);
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto unlock;
+ if (ublk_can_use_recovery(ub)) {
+ if (ub->dev_info.state == UBLK_S_DEV_LIVE)
+ __ublk_quiesce_dev(ub);
+ ublk_unquiesce_dev(ub);
+ }
del_gendisk(ub->ub_disk);
ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1;
@@ -1311,6 +1460,7 @@ static void ublk_remove(struct ublk_device *ub)
{
ublk_stop_dev(ub);
cancel_work_sync(&ub->stop_work);
+ cancel_work_sync(&ub->quiesce_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
put_device(&ub->cdev_dev);
}
@@ -1487,6 +1637,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->mm_lock);
+ INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
@@ -1607,6 +1758,7 @@ static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
ublk_stop_dev(ub);
cancel_work_sync(&ub->stop_work);
+ cancel_work_sync(&ub->quiesce_work);
ublk_put_device(ub);
return 0;
@@ -1709,6 +1861,116 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
return ret;
}
+static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+ int i;
+
+ WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
+ /* All old ioucmds have to be completed */
+ WARN_ON_ONCE(ubq->nr_io_ready);
+ /* old daemon is PF_EXITING, put it now */
+ put_task_struct(ubq->ubq_daemon);
+ /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
+ ubq->ubq_daemon = NULL;
+
+ for (i = 0; i < ubq->q_depth; i++) {
+ struct ublk_io *io = &ubq->ios[i];
+
+ /* forget everything now and be ready for new FETCH_REQ */
+ io->flags = 0;
+ io->cmd = NULL;
+ io->addr = 0;
+ }
+}
+
+static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd)
+{
+ struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ struct ublk_device *ub;
+ int ret = -EINVAL;
+ int i;
+
+ ub = ublk_get_device_from_id(header->dev_id);
+ if (!ub)
+ return ret;
+
+ mutex_lock(&ub->mutex);
+ if (!ublk_can_use_recovery(ub))
+ goto out_unlock;
+ /*
+ * START_RECOVERY is only allowd after:
+ *
+ * (1) UB_STATE_OPEN is not set, which means the dying process is exited
+ * and related io_uring ctx is freed so file struct of /dev/ublkcX is
+ * released.
+ *
+ * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
+ * (a)has quiesced request queue
+ * (b)has requeued every inflight rqs whose io_flags is ACTIVE
+ * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
+ * (d)has completed/camceled all ioucmds owned by ther dying process
+ */
+ if (test_bit(UB_STATE_OPEN, &ub->state) ||
+ ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_queue_reinit(ub, ublk_get_queue(ub, i));
+ /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
+ ub->mm = NULL;
+ ub->nr_queues_ready = 0;
+ init_completion(&ub->completion);
+ ret = 0;
+ out_unlock:
+ mutex_unlock(&ub->mutex);
+ ublk_put_device(ub);
+ return ret;
+}
+
+static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
+{
+ struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ int ublksrv_pid = (int)header->data[0];
+ struct ublk_device *ub;
+ int ret = -EINVAL;
+
+ ub = ublk_get_device_from_id(header->dev_id);
+ if (!ub)
+ return ret;
+
+ pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
+ __func__, ub->dev_info.nr_hw_queues, header->dev_id);
+ /* wait until new ubq_daemon sending all FETCH_REQ */
+ wait_for_completion_interruptible(&ub->completion);
+ pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
+ __func__, ub->dev_info.nr_hw_queues, header->dev_id);
+
+ mutex_lock(&ub->mutex);
+ if (!ublk_can_use_recovery(ub))
+ goto out_unlock;
+
+ if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ ub->dev_info.ublksrv_pid = ublksrv_pid;
+ pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
+ __func__, ublksrv_pid, header->dev_id);
+ blk_mq_unquiesce_queue(ub->ub_disk->queue);
+ pr_devel("%s: queue unquiesced, dev id %d.\n",
+ __func__, header->dev_id);
+ blk_mq_kick_requeue_list(ub->ub_disk->queue);
+ ub->dev_info.state = UBLK_S_DEV_LIVE;
+ schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
+ ret = 0;
+ out_unlock:
+ mutex_unlock(&ub->mutex);
+ ublk_put_device(ub);
+ return ret;
+}
+
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -1750,6 +2012,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_SET_PARAMS:
ret = ublk_ctrl_set_params(cmd);
break;
+ case UBLK_CMD_START_USER_RECOVERY:
+ ret = ublk_ctrl_start_recovery(cmd);
+ break;
+ case UBLK_CMD_END_USER_RECOVERY:
+ ret = ublk_ctrl_end_recovery(cmd);
+ break;
default:
break;
}