From 202700e18acbed55970dbb9d4d518ac59b1172c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 12 Sep 2020 13:18:10 -0600
Subject: io_uring: grab any needed state during defer prep

Always grab work environment for deferred links. The assumption that we
will be running it always from the task in question is false, as exiting
tasks may mean that we're deferring this one to a thread helper. And at
that point it's too late to grab the work environment.

Fixes: debb85f496c9 ("io_uring: factor out grab_env() from defer_prep()")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 175fb647d099..be9d628e7854 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5449,6 +5449,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (unlikely(ret))
 		return ret;
 
+	io_prep_async_work(req);
+
 	switch (req->opcode) {
 	case IORING_OP_NOP:
 		break;
--
cgit v1.2.3

From 87ceb6a6b81eca000911403446d4c6043b4e4f82 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 14 Sep 2020 08:20:12 -0600
Subject: io_uring: drop 'ctx' ref on task work cancelation

If task_work ends up being marked for cancelation, we go through a
cancelation helper instead of the queue path. In converting task_work to
always hold a ctx reference, this path was missed. Make sure that
io_req_task_cancel() puts the reference that is being held against the
ctx.

Fixes: 6d816e088c35 ("io_uring: hold 'ctx' reference around task_work queue + execute")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index be9d628e7854..01756a131be6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1787,8 +1787,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error)
 static void io_req_task_cancel(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_ring_ctx *ctx = req->ctx;
 
 	__io_req_task_cancel(req, -ECANCELED);
+	percpu_ref_put(&ctx->refs);
 }
 
 static void __io_req_task_submit(struct io_kiocb *req)
--
cgit v1.2.3

From 6200b0ae4ea28a4bfd8eb434e33e6201b7a6a282 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 13 Sep 2020 14:38:30 -0600
Subject: io_uring: don't run task work on an exiting task

This isn't safe, and isn't needed either. We are guaranteed that any
work we queue is on a live task (and will be run), or it goes to our
backup io-wq threads if the task is exiting.

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 01756a131be6..a29c8913b1f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1753,6 +1753,9 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret, notify;
 
+	if (tsk->flags & PF_EXITING)
+		return -ESRCH;
+
 	/*
 	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
 	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
@@ -2012,6 +2015,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 
 static inline bool io_run_task_work(void)
 {
+	/*
+	 * Not safe to run on exiting task, and the task_work handling will
+	 * not add work to such a task.
+	 */
+	if (unlikely(current->flags & PF_EXITING))
+		return false;
 	if (current->task_works) {
 		__set_current_state(TASK_RUNNING);
 		task_work_run();
@@ -8184,6 +8193,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		/* cancel this request, or head link requests */
 		io_attempt_cancel(ctx, cancel_req);
 		io_put_req(cancel_req);
+		/* cancellations _may_ trigger task work */
+		io_run_task_work();
 		schedule();
 		finish_wait(&ctx->inflight_wait, &wait);
 	}
--
cgit v1.2.3

From 8f3d749685e48c44dbe877ac9781079d85f914c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 14 Sep 2020 09:28:14 -0600
Subject: io_uring: don't re-setup vecs/iter in io_resubmit_prep() if already there

If we already have mapped the necessary data for retry, then don't set
it up again. It's a pointless operation, and we leak the iovec if it's
a large (non-stack) vec.

Fixes: b63534c41e20 ("io_uring: re-issue block requests that failed because of resources")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a29c8913b1f0..b93355ccceeb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2294,13 +2294,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 		goto end_req;
 	}
 
-	ret = io_import_iovec(rw, req, &iovec, &iter, false);
-	if (ret < 0)
-		goto end_req;
-	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-	if (!ret)
+	if (!req->io) {
+		ret = io_import_iovec(rw, req, &iovec, &iter, false);
+		if (ret < 0)
+			goto end_req;
+		ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+		if (!ret)
+			return true;
+		kfree(iovec);
+	} else {
 		return true;
-	kfree(iovec);
+	}
 end_req:
 	req_set_fail_links(req);
 	io_req_complete(req, ret);
--
cgit v1.2.3

From f5cac8b156e8b7b67bb0fdfd19900855bf9569f3 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 14 Sep 2020 09:30:38 -0600
Subject: io_uring: don't use retry based buffered reads for non-async bdev

Some block devices, like dm, bubble back -EAGAIN through the completion
handler. We check for this in io_read(), but don't honor it for when we
have copied the iov. Return -EAGAIN for this case before retrying, to
force punt to io-wq.

Fixes: bcf5a06304d6 ("io_uring: support true async buffered reads, if file provides it")
Reported-by: Zorro Lang
Tested-by: Zorro Lang
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b93355ccceeb..10d742391544 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3130,6 +3130,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	ssize_t io_size, ret, ret2;
 	size_t iov_count;
+	bool no_async;
 
 	if (req->io)
 		iter = &req->io->rw.iter;
@@ -3147,7 +3148,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	kiocb->ki_flags &= ~IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	if (force_nonblock && !io_file_supports_async(req->file, READ))
+	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
+	if (no_async)
 		goto copy_iov;
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
@@ -3191,6 +3193,8 @@ copy_iov:
 			ret = ret2;
 			goto out_free;
 		}
+		if (no_async)
+			return -EAGAIN;
 		/* it's copied and will be cleaned with ->io */
 		iovec = NULL;
 		/* now use our persistent iterator, if we aren't already */
--
cgit v1.2.3

From 6ca56f845955e325033758f90a2cffe150f31bc8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 18 Sep 2020 16:51:19 -0600
Subject: io_uring: mark statx/files_update/epoll_ctl as non-SQPOLL

These will naturally fail when attempted through SQPOLL, but either with
-EFAULT or -EBADF. Make it explicit that these are not workable through
SQPOLL and return -EINVAL, just like other ops that need to use ->files.

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 10d742391544..f2b180731676 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3786,7 +3786,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #if defined(CONFIG_EPOLL)
 	if (sqe->ioprio || sqe->buf_index)
 		return -EINVAL;
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
 		return -EINVAL;
 
 	req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -3901,7 +3901,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
 
 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
 		return -EINVAL;
 	if (sqe->ioprio || sqe->buf_index)
 		return -EINVAL;
@@ -5418,6 +5418,8 @@ static int io_async_cancel(struct io_kiocb *req)
 static int io_files_update_prep(struct io_kiocb *req,
 				const struct io_uring_sqe *sqe)
 {
+	if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
+		return -EINVAL;
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
 	if (sqe->ioprio || sqe->rw_flags)
--
cgit v1.2.3

From 4eb8dded6b82e184c09bb963bea0335fa3f30b55 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 18 Sep 2020 19:36:24 -0600
Subject: io_uring: fix openat/openat2 unified prep handling

A previous commit unified how we handle prep for these two functions,
but this means that we check the allowed context (SQPOLL, specifically)
later than we should. Move the ring type checking into the two parent
functions, instead of doing it after we've done some setup work.

Fixes: ec65fea5a8d7 ("io_uring: deduplicate io_openat{,2}_prep()")
Reported-by: Andy Lutomirski
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f2b180731676..e6004b92e553 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3527,8 +3527,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	const char __user *fname;
 	int ret;
 
-	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
-		return -EINVAL;
 	if (unlikely(sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3555,6 +3553,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	u64 flags, mode;
 
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+		return -EINVAL;
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 	mode = READ_ONCE(sqe->len);
@@ -3569,6 +3569,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	size_t len;
 	int ret;
 
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+		return -EINVAL;
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
--
cgit v1.2.3
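
Note on the user-visible effect of the last three patches: statx, files_update,
epoll_ctl, openat and openat2 are now rejected at prep time on SQPOLL rings, so
an application sees -EINVAL in the CQE instead of a late -EFAULT/-EBADF. Below
is a minimal liburing sketch of that behavior; it is illustrative only and not
part of the series. The queue depth, path and error handling are arbitrary, and
creating an IORING_SETUP_SQPOLL ring on kernels of this vintage typically
requires root.

/*
 * Illustrative sketch: submit IORING_OP_OPENAT on an SQPOLL ring and
 * observe the -EINVAL completion introduced by the patches above.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	/* SQPOLL ring; setup may fail without sufficient privileges */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQPOLL);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* openat through an SQPOLL ring: prep now rejects it outright */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_openat(sqe, AT_FDCWD, "/etc/hostname", O_RDONLY, 0);

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		/* with these patches applied, cqe->res is -EINVAL */
		printf("openat cqe->res = %d (%s)\n", cqe->res,
		       cqe->res < 0 ? strerror(-cqe->res) : "ok");
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}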