diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-12-06 10:08:59 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-12-06 10:08:59 -0800 |
commit | 9feb1af97e7366b512ecb9e4dd61d3252074cda3 (patch) | |
tree | b94821803bc3c5a69b3132e82ab6ddd1fcbcbcc0 /fs | |
parent | 0aecba6173216931c436a03183f4759a4fd4c2f2 (diff) | |
parent | 8539429917c48c994d2e2cafa02ab06587b3b42c (diff) |
Merge tag 'for-linus-20191205' of git://git.kernel.dk/linux-block
Pull more block and io_uring updates from Jens Axboe:
"I wasn't expecting this to be so big, and if I was, I would have used
separate branches for this. Going forward I'll be doing separate
branches for the current tree, just like for the next kernel version
tree. In any case, this contains:
- Series from Christoph that fixes an inherent race condition with
zoned devices and revalidation.
- null_blk zone size fix (Damien)
- Fix for a regression in this merge window that caused busy spins by
sending empty disk uevents (Eric)
- Fix for a regression in this merge window for bfq stats (Hou)
- Fix for io_uring creds allocation failure handling (me)
- io_uring -ERESTARTSYS send/recvmsg fix (me)
- Series that fixes the need for applications to retain state across
async request punts for io_uring. This one is a bit larger than I
would have hoped, but I think it's important we get this fixed for
5.5.
- connect(2) improvement for io_uring, handling EINPROGRESS instead
of having applications needing to poll for it (me)
- Have io_uring use a hash for poll requests instead of an rbtree.
This turned out to work much better in practice, so I think we
should make the switch now. For some workloads, even with a fair
amount of cancellations, the insertion sort is just too expensive.
(me)
- Various little io_uring fixes (me, Jackie, Pavel, LimingWu)
- Fix for brd unaligned IO, and a warning for the future (Ming)
- Fix for a bio integrity data leak (Justin)
- bvec_iter_advance() improvement (Pavel)
- Xen blkback page unmap fix (SeongJae)
The major items in here are all well tested, and on the liburing side
we continue to add regression and feature test cases. We're up to 50
topic cases now, each with anywhere from 1 to more than 10 cases in
each"
* tag 'for-linus-20191205' of git://git.kernel.dk/linux-block: (33 commits)
block: fix memleak of bio integrity data
io_uring: fix a typo in a comment
bfq-iosched: Ensure bio->bi_blkg is valid before using it
io_uring: hook all linked requests via link_list
io_uring: fix error handling in io_queue_link_head
io_uring: use hash table for poll command lookups
io-wq: clear node->next on list deletion
io_uring: ensure deferred timeouts copy necessary data
io_uring: allow IO_SQE_* flags on IORING_OP_TIMEOUT
null_blk: remove unused variable warning on !CONFIG_BLK_DEV_ZONED
brd: warn on un-aligned buffer
brd: remove max_hw_sectors queue limit
xen/blkback: Avoid unmapping unmapped grant pages
io_uring: handle connect -EINPROGRESS like -EAGAIN
block: set the zone size in blk_revalidate_disk_zones atomically
block: don't handle bio based drivers in blk_revalidate_disk_zones
block: allocate the zone bitmaps lazily
block: replace seq_zones_bitmap with conv_zones_bitmap
block: simplify blkdev_nr_zones
block: remove the empty line at the end of blk-zoned.c
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/block_dev.c | 2 | ||||
-rw-r--r-- | fs/io-wq.c | 2 | ||||
-rw-r--r-- | fs/io-wq.h | 11 | ||||
-rw-r--r-- | fs/io_uring.c | 694 |
4 files changed, 493 insertions, 216 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index ee63c2732fa2..69bf2fb6f7cd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1531,7 +1531,7 @@ rescan: ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; - } else { + } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. diff --git a/fs/io-wq.c b/fs/io-wq.c index 91b85df0861e..74b40506c5d9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -111,7 +111,7 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - struct cred *creds; + const struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; diff --git a/fs/io-wq.h b/fs/io-wq.h index 600e0158cba7..7c333a28e2a7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, list->last = prev; if (prev) prev->next = node->next; + node->next = NULL; } #define wq_list_for_each(pos, prv, head) \ @@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { struct mm_struct *mm; struct user_struct *user; - struct cred *creds; + const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; @@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk) { } -#endif +#endif /* CONFIG_IO_WQ */ -static inline bool io_wq_current_is_worker(void) -{ - return in_task() && (current->flags & PF_IO_WORKER); -} -#endif +#endif /* INTERNAL_IO_WQ_H */ diff --git a/fs/io_uring.c b/fs/io_uring.c index ec53aa7cdc94..405be10da73d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -145,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -238,7 +238,7 @@ struct io_ring_ctx { struct user_struct *user; - struct cred *creds; + const struct cred *creds; /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -275,7 +275,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; @@ -303,9 +304,32 @@ struct io_timeout_data { u32 seq_offset; }; -struct io_timeout { - struct file *file; - struct io_timeout_data *data; +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -319,10 +343,10 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; - struct io_timeout timeout; }; const struct io_uring_sqe *sqe; + struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; @@ -332,7 +356,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -353,7 +377,6 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ u64 user_data; u32 result; u32 sequence; @@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -459,6 +497,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; req->ring_file = NULL; req->file = NULL; req->ctx = ctx; @@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_FREE_SQE) - kfree(req->sqe); + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - if (req->flags & REQ_F_TIMEOUT) - kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; /* Already got next link */ @@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT)) { + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); wake_ev |= io_link_cancel_timeout(nxt); - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); req->flags &= ~REQ_F_LINK_TIMEOUT; continue; } - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; *nxtptr = nxt; break; } @@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && @@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } - kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1587,6 +1612,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req->ctx, rw, sqe, iter); } + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; + } + if (!req->has_user) return -EFAULT; @@ -1657,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } +static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + io_req_map_io(req, io_size, iovec, fast_iov, iter); + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return -ENOMEM; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1665,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; - - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; + ssize_t io_size, ret; - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; - - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1703,18 +1790,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, */ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < read_size) + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } +out_free: kfree(iovec); return ret; } +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1723,29 +1832,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + ssize_t ret, io_size; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; - - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; - + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; + req->result = io_size; - iov_count = iov_iter_count(&iter); + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } - ret = -EAGAIN; if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) - goto out_free; + goto copy_iov; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1769,10 +1885,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } out_free: kfree(iovec); @@ -1888,12 +2010,25 @@ static int io_sync_file_range(struct io_kiocb *req, return 0; } +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ #if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else + return 0; +#endif +} + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { +#if defined(CONFIG_NET) struct socket *sock; int ret; @@ -1902,7 +2037,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1911,30 +2048,59 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) + ret = __sys_sendmsg_sock(sock, kmsg, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(©->msg, &io.msg, sizeof(copy->msg)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } +out: io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_sendmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); #else - return -EOPNOTSUPP; + return 0; #endif } @@ -1942,8 +2108,63 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_recvmsg_sock); + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(copy, &io, sizeof(*copy)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif @@ -1985,11 +2206,26 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; + struct io_async_ctx __io, *io; unsigned file_flags; int addr_len, ret; @@ -1998,15 +2234,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); addr_len = READ_ONCE(sqe->addr2); file_flags = force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, addr, addr_len, file_flags); - if (ret == -EAGAIN && force_nonblock) + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { + ret = -ENOMEM; + goto out; + } + memcpy(&io->connect, &__io.connect, sizeof(io->connect)); + req->io = io; + memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &io->sqe; return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; +out: if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); @@ -2017,14 +2273,6 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } -static inline void io_poll_remove_req(struct io_kiocb *req) -{ - if (!RB_EMPTY_NODE(&req->rb_node)) { - rb_erase(&req->rb_node, &req->ctx->cancel_tree); - RB_CLEAR_NODE(&req->rb_node); - } -} - static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -2036,36 +2284,34 @@ static void io_poll_remove_one(struct io_kiocb *req) io_queue_async_work(req); } spin_unlock(&poll->head->lock); - io_poll_remove_req(req); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { - struct rb_node *node; + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { - req = rb_entry(node, struct io_kiocb, rb_node); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct rb_node *p, *parent = NULL; + struct hlist_head *list; struct io_kiocb *req; - p = ctx->cancel_tree.rb_node; - while (p) { - parent = p; - req = rb_entry(parent, struct io_kiocb, rb_node); - if (sqe_addr < req->user_data) { - p = p->rb_left; - } else if (sqe_addr > req->user_data) { - p = p->rb_right; - } else { + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { io_poll_remove_one(req); return 0; } @@ -2147,7 +2393,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) spin_unlock_irq(&ctx->completion_lock); return; } - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); @@ -2182,7 +2428,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2220,20 +2466,10 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct rb_node **p = &ctx->cancel_tree.rb_node; - struct rb_node *parent = NULL; - struct io_kiocb *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct io_kiocb, rb_node); - if (req->user_data < tmp->user_data) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&req->rb_node, parent, p); - rb_insert_color(&req->rb_node, &ctx->cancel_tree); + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -2257,11 +2493,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->wait) return -ENOMEM; - req->sqe = NULL; + req->io = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - RB_CLEAR_NODE(&req->rb_node); + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -2368,7 +2604,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret == -1) return -EALREADY; @@ -2410,7 +2646,8 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout_setup(struct io_kiocb *req) +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) { const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; @@ -2420,15 +2657,14 @@ static int io_timeout_setup(struct io_kiocb *req) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; + if (sqe->off && is_timeout_link) + return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); - if (!data) - return -ENOMEM; + data = &io->timeout; data->req = req; - req->timeout.data = data; req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -2440,6 +2676,7 @@ static int io_timeout_setup(struct io_kiocb *req) data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->io = io; return 0; } @@ -2448,16 +2685,24 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; + struct io_async_ctx *io; struct list_head *entry; unsigned span = 0; - int ret; - ret = io_timeout_setup(req); - /* common setup allows flags (like links) set, we don't */ - if (!ret && sqe->flags) - ret = -EINVAL; - if (ret) - return ret; + io = req->io; + if (!io) { + int ret; + + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + return -ENOMEM; + ret = io_timeout_prep(req, io, false); + if (ret) { + kfree(io); + return ret; + } + } + data = &req->io->timeout; /* * sqe->off holds how many events that need to occur for this @@ -2473,7 +2718,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - req->timeout.data->seq_offset = count; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2484,7 +2729,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; - u32 nxt_offset = nxt->timeout.data->seq_offset; + u32 nxt_offset = nxt->io->timeout.seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2517,7 +2762,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - data = req->timeout.data; data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -2598,30 +2842,76 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; + + memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); + req->sqe = &io->sqe; + + switch (io->sqe.opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + ret = io_read_prep(req, &iovec, &iter, true); + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + ret = io_write_prep(req, &iovec, &iter, true); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + return io_timeout_prep(req, io, false); + case IORING_OP_LINK_TIMEOUT: + return io_timeout_prep(req, io, true); + default: + req->io = io; + return 0; + } + + if (ret < 0) + return ret; + + req->io = io; + io_req_map_io(req, ret, iovec, inline_vecs, &iter); + return 0; +} + static int io_req_defer(struct io_kiocb *req) { - struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; + struct io_async_ctx *io; + int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) return -EAGAIN; + ret = io_req_defer_prep(req, io); + if (ret < 0) { + kfree(io); + return ret; + } + spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); - req->flags |= REQ_F_FREE_SQE; - req->sqe = sqe_copy; - trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2876,10 +3166,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->list)) { - prev = list_entry(req->list.prev, struct io_kiocb, link_list); + if (!list_empty(&req->link_list)) { + prev = list_entry(req->link_list.prev, struct io_kiocb, + link_list); if (refcount_inc_not_zero(&prev->refs)) { - list_del_init(&req->list); + list_del_init(&req->link_list); prev->flags &= ~REQ_F_LINK_TIMEOUT; } else prev = NULL; @@ -2909,8 +3200,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ spin_lock_irq(&ctx->completion_lock); - if (!list_empty(&req->list)) { - struct io_timeout_data *data = req->timeout.data; + if (!list_empty(&req->link_list)) { + struct io_timeout_data *data = &req->io->timeout; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -2929,7 +3220,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; @@ -2953,15 +3245,6 @@ static void __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct io_uring_sqe *sqe_copy; - - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) - goto err; - - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(req); if (ret) @@ -3030,7 +3313,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, +static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; @@ -3049,7 +3332,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - return; + return false; } /* @@ -3061,32 +3344,25 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; - struct io_uring_sqe *sqe_copy; + struct io_async_ctx *io; if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - ret = io_timeout_setup(req); - /* common setup allows offset being set, we don't */ - if (!ret && req->sqe->off) - ret = -EINVAL; - if (ret) { - prev->flags |= REQ_F_FAIL_LINK; - goto err_req; - } - } - - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { ret = -EAGAIN; goto err_req; } - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; + ret = io_req_defer_prep(req, io); + if (ret) { + kfree(io); + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->list, &prev->link_list); + list_add_tail(&req->link_list, &prev->link_list); } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; @@ -3095,6 +3371,8 @@ err_req: } else { io_queue_sqe(req); } + + return true; } /* @@ -3113,7 +3391,7 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned max_ios) + unsigned int max_ios) { blk_start_plug(&state->plug); state->free_reqs = 0; @@ -3197,7 +3475,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&state, nr); statep = &state; } @@ -3224,6 +3502,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } + submitted++; sqe_flags = req->sqe->flags; req->ring_file = ring_file; @@ -3233,9 +3512,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->sqe->user_data, true, async); - io_submit_sqe(req, statep, &link); - submitted++; - + if (!io_submit_sqe(req, statep, &link)) + break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. @@ -4363,6 +4641,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->completions); + kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -4759,7 +5038,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; - ctx->creds = prepare_creds(); + ctx->creds = get_current_cred(); ret = io_allocate_scq_urings(ctx, p); if (ret) @@ -4794,7 +5073,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: |