author     Linus Torvalds <torvalds@linux-foundation.org>   2024-11-18 17:02:57 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-11-18 17:02:57 -0800
commit     8350142a4b4cedebfa76cd4cc6e5a7ba6a330629 (patch)
tree       14310648caff08366a28cd6d2b8dab44688a9995 /io_uring/rw.c
parent     77a0cfafa9af9c0d5b43534eb90d530c189edca1 (diff)
parent     a652958888fb1ada3e4f6b548576c2d2c1b60d66 (diff)
Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
- Cleanups of the eventfd handling code, making it fully private.
- Support for sending a sync message to another ring, without having a
ring available to send a normal async message.
- Get rid of the separate unlocked hash table, unify everything around
the single locked one.
- Add support for ring resizing. It can be hard to appropriately size
the CQ ring upfront if the application doesn't know how busy it will
be. This results in applications sizing rings for the busiest case,
which can be wasteful. With ring resizing, they can start small and
grow the ring later if needed (see the resize sketch after this list).
- Add support for fixed wait regions, rather than needing to copy the
same wait data tons of times for each wait operation.
- Rewrite the resource node handling, which was previously serialized
per ring. This caused issues particularly with fixed files, where one
file waiting on IO could hold up the putting and freeing of other,
unrelated files. Now each node is handled separately. The new code is
also much simpler, and amounts to a net 250-line reduction.
- Add support for just doing partial buffer clones, rather than always
cloning the entire buffer table.
- Series adding static NAPI support, where a specific NAPI instance is
used rather than maintaining a list of instances that needs to be
looked up.
- Add support for mapped regions, and also convert the fixed wait
support mentioned above to that concept. This avoids doing special
mappings for various planned features, and folds the existing
registered wait into that too.
- Add support for hybrid IO polling, which is a variant of strict
IOPOLL but with an initial sleep delay to avoid spinning too early and
wasting resources on devices whose latencies aren't necessarily in the
< 5 usec category (see the setup sketch after the diff at the bottom).
- Various cleanups and little fixes.
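As a rough illustration of the ring resizing item above, here is a hedged
userspace sketch. It assumes a liburing new enough (2.9+) to expose
io_uring_resize_rings() as the wrapper for the new resize registration
opcode, and that the new sizes are passed via struct io_uring_params;
neither detail is spelled out in this merge summary, so treat the wrapper
name, field usage, and flag handling below as assumptions.

/*
 * Hedged sketch: start with a small ring and grow the CQ ring later,
 * instead of over-provisioning upfront. Assumes liburing >= 2.9 with
 * io_uring_resize_rings(); the exact parameter layout is a best guess.
 */
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_params p;
	int ret;

	/* small initial ring for the common, mostly idle case */
	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* later, under load: ask the kernel for a bigger CQ ring */
	memset(&p, 0, sizeof(p));
	p.sq_entries = 32;
	p.cq_entries = 1024;
	p.flags = IORING_SETUP_CQSIZE;	/* assumed: set when cq_entries is given */

	ret = io_uring_resize_rings(&ring, &p);
	if (ret < 0)
		fprintf(stderr, "resize: %s (kernel or liburing too old?)\n",
			strerror(-ret));

	io_uring_queue_exit(&ring);
	return 0;
}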
* tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits)
io_uring/region: fix error codes after failed vmap
io_uring: restore back registered wait arguments
io_uring: add memory region registration
io_uring: introduce concept of memory regions
io_uring: temporarily disable registered waits
io_uring: disable ENTER_EXT_ARG_REG for IOPOLL
io_uring: fortify io_pin_pages with a warning
switch io_msg_ring() to CLASS(fd)
io_uring: fix invalid hybrid polling ctx leaks
io_uring/uring_cmd: fix buffer index retrieval
io_uring/rsrc: add & apply io_req_assign_buf_node()
io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node'
io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers
io_uring: avoid normal tw intermediate fallback
io_uring/napi: add static napi tracking strategy
io_uring/napi: clean up __io_napi_do_busy_loop
io_uring/napi: Use lock guards
io_uring/napi: improve __io_napi_add
io_uring/napi: fix io_napi_entry RCU accesses
io_uring/napi: protect concurrent io_napi_entry timeout accesses
...
Diffstat (limited to 'io_uring/rw.c')
-rw-r--r--   io_uring/rw.c   105
1 file changed, 87 insertions(+), 18 deletions(-)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 12444c0a8e3d..cce8bc2ecd3f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,22 +330,21 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_rsrc_node *node;
 	struct io_async_rw *io;
-	u16 index;
 	int ret;
 
 	ret = io_prep_rw(req, sqe, ddir, false);
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+	if (!node)
 		return -EFAULT;
-	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	req->imu = ctx->user_bufs[index];
-	io_req_set_rsrc_node(req, ctx, 0);
+	io_req_assign_buf_node(req, node);
 
 	io = req->async_data;
-	ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len);
+	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
 	iov_iter_save_state(&io->iter, &io->iter_state);
 	return ret;
 }
@@ -435,7 +434,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	 * Play it safe and assume not safe to re-import and reissue if we're
 	 * not in the original thread group (or in task context).
 	 */
-	if (!same_thread_group(req->task, current) || !in_task())
+	if (!same_thread_group(req->tctx->task, current) || !in_task())
 		return false;
 	return true;
 }
@@ -818,6 +817,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+			/* make sure every req only blocks once */
+			req->flags &= ~REQ_F_IOPOLL_STATE;
+			req->iopoll_start = ktime_get_ns();
+		}
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1135,6 +1139,78 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
+				 unsigned int poll_flags)
+{
+	struct file *file = req->file;
+
+	if (req->opcode == IORING_OP_URING_CMD) {
+		struct io_uring_cmd *ioucmd;
+
+		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
+	} else {
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+	}
+}
+
+static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct hrtimer_sleeper timer;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	u64 sleep_time;
+
+	if (req->flags & REQ_F_IOPOLL_STATE)
+		return 0;
+
+	if (ctx->hybrid_poll_time == LLONG_MAX)
+		return 0;
+
+	/* Using half the running time to do schedule */
+	sleep_time = ctx->hybrid_poll_time / 2;
+
+	kt = ktime_set(0, sleep_time);
+	req->flags |= REQ_F_IOPOLL_STATE;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_INTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+	return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 runtime, sleep_time;
+	int ret;
+
+	sleep_time = io_hybrid_iopoll_delay(ctx, req);
+	ret = io_uring_classic_poll(req, iob, poll_flags);
+	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+
+	/*
+	 * Use minimum sleep time if we're polling devices with different
+	 * latencies. We could get more completions from the faster ones.
+	 */
+	if (ctx->hybrid_poll_time > runtime)
+		ctx->hybrid_poll_time = runtime;
+
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1151,7 +1227,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb,
 						    comp_list);
-		struct file *file = req->file;
 		int ret;
 
 		/*
@@ -1162,17 +1237,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
-			struct io_uring_cmd *ioucmd;
-
-			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
-							   poll_flags);
-		} else {
-			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		else
+			ret = io_uring_classic_poll(req, &iob, poll_flags);
 
-			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
-		}
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
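For context on the io_prep_rw_fixed() hunk above (the kernel side now going
through io_rsrc_node_lookup() and io_req_assign_buf_node()): this is the kind
of userspace sequence that exercises that path. It uses only standard liburing
calls; the file path, buffer size, and buffer index are arbitrary example
values, and error handling is trimmed to keep the sketch short.

/*
 * Minimal liburing sketch of a fixed-buffer read, the userspace side of
 * the io_prep_rw_fixed() path changed above. Standard liburing API; the
 * file path and sizes are placeholder example values.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/uio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd, ret;

	fd = open("/tmp/data", O_RDONLY);
	if (fd < 0)
		return 1;

	io_uring_queue_init(8, &ring, 0);

	/* register one fixed buffer; the kernel pins it once, upfront */
	iov.iov_len = 4096;
	iov.iov_base = malloc(iov.iov_len);
	io_uring_register_buffers(&ring, &iov, 1);

	/* read into registered buffer index 0 -> IORING_OP_READ_FIXED */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	free(iov.iov_base);
	close(fd);
	return 0;
}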
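And for the hybrid polling code above: the feature is selected at ring setup
time via the IORING_SETUP_HYBRID_IOPOLL flag checked in io_rw_init_file() and
io_do_iopoll(). Below is a minimal setup sketch. The flag name comes from the
diff; the assumption that it is combined with IORING_SETUP_IOPOLL and used
against O_DIRECT files mirrors classic IOPOLL usage and is not spelled out in
this merge, and /dev/nvme0n1 is just a stand-in device.

/*
 * Sketch of opting into hybrid IOPOLL at setup time. Flag name taken
 * from the diff above; combining it with IORING_SETUP_IOPOLL and an
 * O_DIRECT file is an assumption based on classic IOPOLL usage.
 */
#define _GNU_SOURCE
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	unsigned flags = IORING_SETUP_IOPOLL | IORING_SETUP_HYBRID_IOPOLL;
	int fd, ret;

	ret = io_uring_queue_init(64, &ring, flags);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d (kernel without hybrid poll?)\n", ret);
		return 1;
	}

	/* polled IO needs O_DIRECT on a device/filesystem that supports it */
	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd < 0)
		perror("open");

	/*
	 * ... issue reads/writes and reap completions as usual; per the
	 * io_hybrid_iopoll_delay() code above, the kernel now sleeps for
	 * roughly half the previously observed completion time before it
	 * starts spinning ...
	 */

	if (fd >= 0)
		close(fd);
	io_uring_queue_exit(&ring);
	return 0;
}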