diff options
Diffstat (limited to 'net')
42 files changed, 530 insertions, 451 deletions
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 12ecacf0c55f..c0762a302162 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -950,7 +950,7 @@ static int p9_bind_privport(struct socket *sock) memset(&cl, 0, sizeof(cl)); cl.sin_family = AF_INET; - cl.sin_addr.s_addr = INADDR_ANY; + cl.sin_addr.s_addr = htonl(INADDR_ANY); for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) { cl.sin_port = htons((ushort)port); err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl)); diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index d7bec7adc267..f36f9a3a4e20 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -13,7 +13,7 @@ config CEPH_LIB common functionality to both the Ceph filesystem and to the rados block device (rbd). - More information at http://ceph.newdream.net/. + More information at https://ceph.io/. If unsure, say N. diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 9a5850f264ed..81e1e006c540 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -4,7 +4,7 @@ /* * Robert Jenkin's hash function. - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * This is in the public domain. */ #define mix(a, b, c) \ diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c index e5cc603cdb17..fe79f6d2d0db 100644 --- a/net/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -7,7 +7,7 @@ /* * Robert Jenkins' function for mixing 32-bit values - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * a, b = random bits, c = input and output */ #define crush_hashmix(a, b, c) do { \ diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 3f323ed9df52..07e5614eb3f1 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -298,7 +298,7 @@ static __u64 crush_ln(unsigned int xin) * * for reference, see: * - * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables * */ diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 409d505ff320..2110439f8a24 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -223,6 +223,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) if (op->op == CEPH_OSD_OP_WATCH) seq_printf(s, "-%s", ceph_osd_watch_op_name(op->watch.op)); + else if (op->op == CEPH_OSD_OP_CALL) + seq_printf(s, "-%s/%s", op->cls.class_name, + op->cls.method_name); } seq_putc(s, '\n'); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2db8b44e70c2..e4fbcad6e7d8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(ceph_osdc_put_request); static void request_init(struct ceph_osd_request *req) { - /* req only, each op is zeroed in _osd_req_op_init() */ + /* req only, each op is zeroed in osd_req_op_init() */ memset(req, 0, sizeof(*req)); kref_init(&req->r_kref); @@ -746,8 +746,8 @@ EXPORT_SYMBOL(ceph_osdc_alloc_messages); * other information associated with them. It also serves as a * common init routine for all the other init functions, below. */ -static struct ceph_osd_req_op * -_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, +struct ceph_osd_req_op * +osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -762,12 +762,6 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, return op; } - -void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, u32 flags) -{ - (void)_osd_req_op_init(osd_req, which, opcode, flags); -} EXPORT_SYMBOL(osd_req_op_init); void osd_req_op_extent_init(struct ceph_osd_request *osd_req, @@ -775,8 +769,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && @@ -822,7 +816,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, BUG_ON(which + 1 >= osd_req->r_num_ops); prev_op = &osd_req->r_ops[which]; - op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); /* dup previous one */ op->indata_len = prev_op->indata_len; op->outdata_len = prev_op->outdata_len; @@ -845,7 +839,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size_t size; int ret; - op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) @@ -883,8 +877,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; int ret; @@ -928,7 +922,7 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, { struct ceph_osd_req_op *op; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; op->watch.gen = 0; @@ -943,10 +937,9 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_write_size, u32 flags) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - CEPH_OSD_OP_SETALLOCHINT, - 0); + struct ceph_osd_req_op *op; + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; op->alloc_hint.flags = flags; @@ -3076,9 +3069,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq) cancel_linger_request(req); request_reinit(req); - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = lreq->t.flags; + target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; mutex_lock(&lreq->lock); @@ -4801,7 +4792,7 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, struct ceph_pagelist *pl; int ret; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) @@ -4870,7 +4861,7 @@ static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, struct ceph_pagelist *pl; int ret; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); op->notify.cookie = cookie; pl = ceph_pagelist_alloc(GFP_NOIO); @@ -5334,8 +5325,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, if (IS_ERR(pages)) return PTR_ERR(pages); - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, - dst_fadvise_flags); + op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, + dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d3377c90a291..b988f48153a4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, return 0; } -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { - struct bpf_map *map = aux->map; + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -EINVAL; + goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) - return -EACCES; + if (prog->aux->max_rdonly_access > map->value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { @@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9de33b594ff2..efec66fa78b7 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -757,11 +757,13 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, return err; } - hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; - cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; + if (!cpumask_empty(mask)) { + hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; + cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); + if (cpumask_empty(mask)) { + free_cpumask_var(mask); + return -EINVAL; + } } map = kzalloc(max_t(unsigned int, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2828f6d5ba89..7e2e502ef519 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4853,7 +4853,7 @@ static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) if (err < 0) goto out; - if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + if (ip_is_fragment(ip_hdr(skb))) fragment = true; off = ip_hdrlen(skb); diff --git a/net/core/sock.c b/net/core/sock.c index a2044b4b606b..e4f40b175acb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3414,6 +3414,16 @@ static void sock_inuse_add(struct net *net, int val) } #endif +static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) +{ + if (!twsk_prot) + return; + kfree(twsk_prot->twsk_slab_name); + twsk_prot->twsk_slab_name = NULL; + kmem_cache_destroy(twsk_prot->twsk_slab); + twsk_prot->twsk_slab = NULL; +} + static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) @@ -3484,7 +3494,7 @@ int proto_register(struct proto *prot, int alloc_slab) prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab_name; + goto out_free_timewait_sock_slab; } } @@ -3492,15 +3502,15 @@ int proto_register(struct proto *prot, int alloc_slab) ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); - goto out_free_timewait_sock_slab_name; + goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; -out_free_timewait_sock_slab_name: +out_free_timewait_sock_slab: if (alloc_slab && prot->twsk_prot) - kfree(prot->twsk_prot->twsk_slab_name); + tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); @@ -3524,12 +3534,7 @@ void proto_unregister(struct proto *prot) prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); - - if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(prot->twsk_prot->twsk_slab_name); - prot->twsk_prot->twsk_slab = NULL; - } + tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 545b2640f019..1b34cb9a7708 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -57,18 +57,16 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, return bpfilter_mbox_request(sk, optname, optval, optlen, true); } -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, - char __user *user_optval, int __user *optlen) +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) { - sockptr_t optval; - int err, len; + int len; if (get_user(len, optlen)) return -EFAULT; - err = init_user_sockptr(&optval, user_optval, len); - if (err) - return err; - return bpfilter_mbox_request(sk, optname, optval, len, false); + + return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, + false); } static int __init bpfilter_sockopt_init(void) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1a3913eebe0..b457dd2d6c75 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -296,6 +296,57 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } +void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + + if (hlist_empty(&tb->owners)) { + tb->fastreuse = reuse; + if (sk->sk_reuseport) { + tb->fastreuseport = FASTREUSEPORT_ANY; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } else { + tb->fastreuseport = 0; + } + } else { + if (!reuse) + tb->fastreuse = 0; + if (sk->sk_reuseport) { + /* We didn't match or we don't have fastreuseport set on + * the tb, but we have sk_reuseport set on this socket + * and we know that there are no bind conflicts with + * this socket in this tb, so reset our tb's reuseport + * settings so that any subsequent sockets that match + * our current socket will be put on the fast path. + * + * If we reset we need to set FASTREUSEPORT_STRICT so we + * do extra checking for all subsequent sk_reuseport + * socks. + */ + if (!sk_reuseport_match(tb, sk)) { + tb->fastreuseport = FASTREUSEPORT_STRICT; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } + } else { + tb->fastreuseport = 0; + } + } +} + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) @@ -308,7 +359,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); struct inet_bind_bucket *tb = NULL; - kuid_t uid = sock_i_uid(sk); int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); @@ -345,49 +395,8 @@ tb_found: goto fail_unlock; } success: - if (hlist_empty(&tb->owners)) { - tb->fastreuse = reuse; - if (sk->sk_reuseport) { - tb->fastreuseport = FASTREUSEPORT_ANY; - tb->fastuid = uid; - tb->fast_rcv_saddr = sk->sk_rcv_saddr; - tb->fast_ipv6_only = ipv6_only_sock(sk); - tb->fast_sk_family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; -#endif - } else { - tb->fastreuseport = 0; - } - } else { - if (!reuse) - tb->fastreuse = 0; - if (sk->sk_reuseport) { - /* We didn't match or we don't have fastreuseport set on - * the tb, but we have sk_reuseport set on this socket - * and we know that there are no bind conflicts with - * this socket in this tb, so reset our tb's reuseport - * settings so that any subsequent sockets that match - * our current socket will be put on the fast path. - * - * If we reset we need to set FASTREUSEPORT_STRICT so we - * do extra checking for all subsequent sk_reuseport - * socks. - */ - if (!sk_reuseport_match(tb, sk)) { - tb->fastreuseport = FASTREUSEPORT_STRICT; - tb->fastuid = uid; - tb->fast_rcv_saddr = sk->sk_rcv_saddr; - tb->fast_ipv6_only = ipv6_only_sock(sk); - tb->fast_sk_family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; -#endif - } - } else { - tb->fastreuseport = 0; - } - } + inet_csk_update_fastreuse(tb, sk); + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4eb4cd8d20dd..239e54474b65 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -163,6 +163,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOMEM; } } + inet_csk_update_fastreuse(tb, child); } inet_bind_hash(child, tb, port); spin_unlock(&head->lock); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5653e3b011bf..54023a46db04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -301,24 +301,16 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2 * TCP_FASTOPEN_KEY_MAX) + (TCP_FASTOPEN_KEY_MAX * 5)) }; - struct tcp_fastopen_context *ctx; - u32 user_key[TCP_FASTOPEN_KEY_MAX * 4]; - __le32 key[TCP_FASTOPEN_KEY_MAX * 4]; + u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)]; + __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)]; char *backup_data; - int ret, i = 0, off = 0, n_keys = 0; + int ret, i = 0, off = 0, n_keys; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) return -ENOMEM; - rcu_read_lock(); - ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); - if (ctx) { - n_keys = tcp_fastopen_context_len(ctx); - memcpy(&key[0], &ctx->key[0], TCP_FASTOPEN_KEY_LENGTH * n_keys); - } - rcu_read_unlock(); - + n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key); if (!n_keys) { memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH); n_keys = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c06d2bfd2ec4..31f3b858db81 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3685,22 +3685,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; case TCP_FASTOPEN_KEY: { - __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; - struct tcp_fastopen_context *ctx; - unsigned int key_len = 0; + u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; + unsigned int key_len; if (get_user(len, optlen)) return -EFAULT; - rcu_read_lock(); - ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); - if (ctx) { - key_len = tcp_fastopen_context_len(ctx) * - TCP_FASTOPEN_KEY_LENGTH; - memcpy(&key[0], &ctx->key[0], key_len); - } - rcu_read_unlock(); - + key_len = tcp_fastopen_get_cipher(net, icsk, key) * + TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (put_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index c1a54f3d58f5..09b62de04eea 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -108,6 +108,29 @@ out: return err; } +int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, + u64 *key) +{ + struct tcp_fastopen_context *ctx; + int n_keys = 0, i; + + rcu_read_lock(); + if (icsk) + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + else + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctx) { + n_keys = tcp_fastopen_context_len(ctx); + for (i = 0; i < n_keys; i++) { + put_unaligned_le64(ctx->key[i].key[0], key + (i * 2)); + put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1); + } + } + rcu_read_unlock(); + + return n_keys; +} + static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, const siphash_key_t *key, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 96f4f2fe50ad..e8cac2655c82 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -423,12 +423,12 @@ static void mptcp_sock_destruct(struct sock *sk) * also remove the mptcp socket, via * sock_put(ctx->conn). * - * Problem is that the mptcp socket will not be in - * SYN_RECV state and doesn't have SOCK_DEAD flag. + * Problem is that the mptcp socket will be in + * ESTABLISHED state and will not have the SOCK_DEAD flag. * Both result in warnings from inet_sock_destruct. */ - if (sk->sk_state == TCP_SYN_RECV) { + if (sk->sk_state == TCP_ESTABLISHED) { sk->sk_state = TCP_CLOSE; WARN_ON_ONCE(sk->sk_socket); sock_orphan(sk); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e38b60fc183e..5b97d233f89b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -seqcount_t nf_conntrack_generation __read_mostly; +seqcount_spinlock_t nf_conntrack_generation __read_mostly; static unsigned int nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, @@ -2588,7 +2588,8 @@ int nf_conntrack_init_start(void) /* struct nf_ct_ext uses u8 to store offsets/size */ BUILD_BUG_ON(total_extension_size() > 255u); - seqcount_init(&nf_conntrack_generation); + seqcount_spinlock_init(&nf_conntrack_generation, + &nf_conntrack_locks_all_lock); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b6aad3fc46c3..4b2834fd17b2 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -18,7 +18,7 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_t count; + seqcount_rwlock_t count; struct delayed_work gc_work; }; @@ -523,7 +523,7 @@ static int nft_rbtree_init(const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); rwlock_init(&priv->lock); - seqcount_init(&priv->count); + seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index b2061b6746ea..955c195ae14b 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -328,10 +328,13 @@ static int rawsock_create(struct net *net, struct socket *sock, if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; - if (sock->type == SOCK_RAW) + if (sock->type == SOCK_RAW) { + if (!capable(CAP_NET_RAW)) + return -EPERM; sock->ops = &rawsock_raw_ops; - else + } else { sock->ops = &rawsock_ops; + } sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 42f8cc70bb2c..6e47ef7ef036 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1756,6 +1756,7 @@ err: /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { + struct flow_table *table = &dp->table; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { @@ -1774,7 +1775,14 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - /* RCU destroy the flow table */ + /* Flush sw_flow in the tables. RCU cb only releases resource + * such as dp, ports and tables. That may avoid some issues + * such as RCU usage warning. + */ + table_instance_flow_flush(table, ovsl_dereference(table->ti), + ovsl_dereference(table->ufid_ti)); + + /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 8c12675cbb67..e2235849a57e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -473,19 +473,15 @@ static void table_instance_flow_free(struct flow_table *table, flow_mask_remove(table, flow->mask); } -static void table_instance_destroy(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - bool deferred) +/* Must be called with OVS mutex held. */ +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti) { int i; - if (!ti) - return; - - BUG_ON(!ufid_ti); if (ti->keep_flows) - goto skip_flows; + return; for (i = 0; i < ti->n_buckets; i++) { struct sw_flow *flow; @@ -497,18 +493,16 @@ static void table_instance_destroy(struct flow_table *table, table_instance_flow_free(table, ti, ufid_ti, flow, false); - ovs_flow_free(flow, deferred); + ovs_flow_free(flow, true); } } +} -skip_flows: - if (deferred) { - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); - } else { - __table_instance_destroy(ti); - __table_instance_destroy(ufid_ti); - } +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti) +{ + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); } /* No need for locking this function is called from RCU callback or @@ -523,7 +517,7 @@ void ovs_flow_tbl_destroy(struct flow_table *table) call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); - table_instance_destroy(table, ti, ufid_ti, false); + table_instance_destroy(ti, ufid_ti); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -641,7 +635,8 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table) flow_table->count = 0; flow_table->ufid_count = 0; - table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); + table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); + table_instance_destroy(old_ti, old_ufid_ti); return 0; err_free_ti: diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 74ce48fecba9..6e7d4ac59353 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -105,5 +105,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); void ovs_flow_masks_rebalance(struct flow_table *table); +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti); #endif /* flow_table.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0b8160d1a6e0..479c257ded73 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -941,6 +941,7 @@ static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) + __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); @@ -989,6 +990,7 @@ static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) + __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; @@ -2286,8 +2288,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (do_vnet && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le(), true, 0)) + vio_le(), true, 0)) { + if (po->tp_version == TPACKET_V3) + prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; + } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); @@ -2393,7 +2398,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); - } else { + } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } diff --git a/net/socket.c b/net/socket.c index aff52e81653c..dbbe8ea7d395 100644 --- a/net/socket.c +++ b/net/socket.c @@ -500,7 +500,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) if (f.file) { sock = sock_from_file(f.file, err); if (likely(sock)) { - *fput_needed = f.flags; + *fput_needed = f.flags & FDPUT_FPUT; return sock; } fdput(f); @@ -1325,7 +1325,7 @@ int sock_wake_async(struct socket_wq *wq, int how, int band) case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; - /* fall through */ + fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); @@ -1804,8 +1804,7 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, upeer_addrlen, flags, rlimit(RLIMIT_NOFILE)); - if (f.flags) - fput(f.file); + fdput(f); } return ret; @@ -1868,8 +1867,7 @@ int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (!ret) ret = __sys_connect_file(f.file, &address, addrlen, 0); - if (f.flags) - fput(f.file); + fdput(f); } return ret; @@ -2097,7 +2095,7 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { - sockptr_t optval; + sockptr_t optval = USER_SOCKPTR(user_optval); char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2105,10 +2103,6 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, if (optlen < 0) return -EINVAL; - err = init_user_sockptr(&optval, user_optval, optlen); - if (err) - return err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; @@ -3065,7 +3059,7 @@ static int __init sock_init(void) err = register_filesystem(&sock_fs_type); if (err) - goto out_fs; + goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); @@ -3088,7 +3082,6 @@ out: out_mount: unregister_filesystem(&sock_fs_type); -out_fs: goto out; } @@ -3161,13 +3154,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) return -ENOMEM; buf_size += rule_cnt * sizeof(u32); - /* fall through */ + fallthrough; case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_SRXCLSRLINS: convert_out = true; - /* fall through */ + fallthrough; case ETHTOOL_SRXCLSRLDEL: buf_size += sizeof(struct ethtool_rxnfc); convert_in = true; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index cf0fd170ac18..90b8329fef82 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -584,7 +584,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, buf->head[0].iov_len); memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; - buf->len = len - GSS_KRB5_TOK_HDR_LEN + headskip; + buf->len = len - (GSS_KRB5_TOK_HDR_LEN + headskip); /* Trim off the trailing "extra count" and checksum blob */ xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 46027d0c903f..258b04372f85 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -332,7 +332,7 @@ static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct r struct gss_svc_seq_data { /* highest seq number seen so far: */ - int sd_max; + u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; @@ -613,16 +613,29 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) return found; } -/* Implements sequence number algorithm as specified in RFC 2203. */ -static int -gss_check_seq_num(struct rsc *rsci, int seq_num) +/** + * gss_check_seq_num - GSS sequence number window check + * @rqstp: RPC Call to use when reporting errors + * @rsci: cached GSS context state (updated on return) + * @seq_num: sequence number to check + * + * Implements sequence number algorithm as specified in + * RFC 2203, Section 5.3.3.1. "Context Management". + * + * Return values: + * %true: @rqstp's GSS sequence number is inside the window + * %false: @rqstp's GSS sequence number is outside the window + */ +static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, + u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; + bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { - memset(sd->sd_win,0,sizeof(sd->sd_win)); + memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; @@ -631,17 +644,25 @@ gss_check_seq_num(struct rsc *rsci, int seq_num) __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { - goto drop; + goto toolow; } - /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) - goto drop; + goto alreadyseen; + ok: + result = true; +out: spin_unlock(&sd->sd_lock); - return 1; -drop: - spin_unlock(&sd->sd_lock); - return 0; + return result; + +toolow: + trace_rpcgss_svc_seqno_low(rqstp, seq_num, + sd->sd_max - GSS_SEQ_WIN, + sd->sd_max); + goto out; +alreadyseen: + trace_rpcgss_svc_seqno_seen(rqstp, seq_num); + goto out; } static inline u32 round_up_to_quad(u32 i) @@ -721,14 +742,12 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, } if (gc->gc_seq > MAXSEQ) { - trace_rpcgss_svc_large_seqno(rqstp->rq_xid, gc->gc_seq); + trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); *authp = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } - if (!gss_check_seq_num(rsci, gc->gc_seq)) { - trace_rpcgss_svc_old_seqno(rqstp->rq_xid, gc->gc_seq); + if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; - } return SVC_OK; } @@ -866,11 +885,13 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) static int unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { + u32 integ_len, rseqno, maj_stat; int stat = -EINVAL; - u32 integ_len, maj_stat; struct xdr_netobj mic; struct xdr_buf integ_buf; + mic.data = NULL; + /* NFS READ normally uses splice to send data in-place. However * the data in cache can change after the reply's MIC is computed * but before the RPC reply is sent. To prevent the client from @@ -885,34 +906,44 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g integ_len = svc_getnl(&buf->head[0]); if (integ_len & 3) - return stat; + goto unwrap_failed; if (integ_len > buf->len) - return stat; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) { - WARN_ON_ONCE(1); - return stat; - } + goto unwrap_failed; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + goto unwrap_failed; + /* copy out mic... */ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) - return stat; + goto unwrap_failed; if (mic.len > RPC_MAX_AUTH_SIZE) - return stat; + goto unwrap_failed; mic.data = kmalloc(mic.len, GFP_KERNEL); if (!mic.data) - return stat; + goto unwrap_failed; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) - goto out; + goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) - goto out; - if (svc_getnl(&buf->head[0]) != seq) - goto out; + goto bad_mic; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; /* trim off the mic and padding at the end before returning */ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); stat = 0; out: kfree(mic.data); return stat; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + goto out; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + goto out; +bad_mic: + trace_rpcgss_svc_mic(rqstp, maj_stat); + goto out; } static inline int @@ -937,6 +968,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs { u32 priv_len, maj_stat; int pad, remaining_len, offset; + u32 rseqno; clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); @@ -951,14 +983,13 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs * not yet read from the head, so these two values are different: */ remaining_len = total_buf_len(buf); if (priv_len > remaining_len) - return -EINVAL; + goto unwrap_failed; pad = remaining_len - priv_len; buf->len -= pad; fix_priv_head(buf, pad); maj_stat = gss_unwrap(ctx, 0, priv_len, buf); pad = priv_len - buf->len; - buf->len -= pad; /* The upper layers assume the buffer is aligned on 4-byte boundaries. * In the krb5p case, at least, the data ends up offset, so we need to * move it around. */ @@ -972,11 +1003,22 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs fix_priv_head(buf, pad); } if (maj_stat != GSS_S_COMPLETE) - return -EINVAL; + goto bad_unwrap; out_seq: - if (svc_getnl(&buf->head[0]) != seq) - return -EINVAL; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; return 0; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + return -EINVAL; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + return -EINVAL; +bad_unwrap: + trace_rpcgss_svc_unwrap(rqstp, maj_stat); + return -EINVAL; } struct gss_svc_data { @@ -1314,8 +1356,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - trace_rpcgss_svc_accept_upcall(rqstp->rq_xid, ud.major_status, - ud.minor_status); + trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: @@ -1490,8 +1531,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - trace_rpcgss_svc_accept(rqstp->rq_xid, argv->iov_len); - *authp = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); @@ -1608,6 +1647,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; + trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c index 49fa583d7f91..d26036a57443 100644 --- a/net/sunrpc/auth_gss/trace.c +++ b/net/sunrpc/auth_gss/trace.c @@ -5,6 +5,9 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/sunrpc/auth_gss.h> diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index e9d0953522f0..eadc0ede928c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1510,6 +1510,6 @@ err_notifier: void unregister_rpc_pipefs(void) { rpc_clients_notifier_unregister(); - kmem_cache_destroy(rpc_inode_cachep); unregister_filesystem(&rpc_pipe_fs_type); + kmem_cache_destroy(rpc_inode_cachep); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d5cc5db9dbf3..6ba9d5842629 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -607,6 +607,11 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req) req->rq_majortimeo += xprt_calc_majortimeo(req); } +static void xprt_reset_minortimeo(struct rpc_rqst *req) +{ + req->rq_minortimeo += req->rq_timeout; +} + static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) { unsigned long time_init; @@ -618,6 +623,7 @@ static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) time_init = xprt_abs_ktime_to_jiffies(task->tk_start); req->rq_timeout = task->tk_client->cl_timeout->to_initval; req->rq_majortimeo = time_init + xprt_calc_majortimeo(req); + req->rq_minortimeo = time_init + req->rq_timeout; } /** @@ -631,6 +637,8 @@ int xprt_adjust_timeout(struct rpc_rqst *req) const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; int status = 0; + if (time_before(jiffies, req->rq_minortimeo)) + return status; if (time_before(jiffies, req->rq_majortimeo)) { if (to->to_exponential) req->rq_timeout <<= 1; @@ -649,6 +657,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) spin_unlock(&xprt->transport_lock); status = -ETIMEDOUT; } + xprt_reset_minortimeo(req); if (req->rq_timeout == 0) { printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index b647562a26dd..7f94c9a19fd3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -40,7 +40,6 @@ * New MRs are created on demand. */ -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 453bacc99907..0f5120c7668f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -275,14 +275,6 @@ out: return n; } -static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) -{ - *iptr++ = cpu_to_be32(mr->mr_handle); - *iptr++ = cpu_to_be32(mr->mr_length); - xdr_encode_hyper(iptr, mr->mr_offset); -} - static int encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { @@ -292,7 +284,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mr); + xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); return 0; } @@ -307,8 +299,8 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return -EMSGSIZE; *p++ = xdr_one; /* Item present */ - *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mr); + xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, + mr->mr_offset); return 0; } @@ -1133,11 +1125,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) p = xdr_inline_decode(xdr, 0); /* Chunk lists */ - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; /* RPC header */ @@ -1176,10 +1168,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) if (unlikely(!p)) return -EIO; - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - + xdr_decode_rdma_segment(p, &handle, length, &offset); trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1215,7 +1204,7 @@ static int decode_read_list(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (unlikely(*p != xdr_zero)) + if (unlikely(xdr_item_is_present(p))) return -EIO; return 0; } @@ -1234,7 +1223,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (*p == xdr_zero) + if (xdr_item_is_absent(p)) break; if (!first) return -EIO; @@ -1256,7 +1245,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) return -EIO; *length = 0; - if (*p != xdr_zero) + if (xdr_item_is_present(p)) if (decode_write_chunk(xdr, length)) return -EIO; return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 1ee73f7cf931..5e7c4ba9e147 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -87,7 +87,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + return svc_rdma_send(rdma, ctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e426fedb9524..c6ea2903c21a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -117,6 +117,13 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } +static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -135,6 +142,8 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; @@ -248,16 +257,15 @@ static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, { int ret; - svc_xprt_get(&rdma->sc_xprt); + trace_svcrdma_post_recv(ctxt); ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); - trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); if (ret) goto err_post; return 0; err_post: + trace_svcrdma_rq_post_err(rdma, ret); svc_rdma_recv_ctxt_put(rdma, ctxt); - svc_xprt_put(&rdma->sc_xprt); return ret; } @@ -265,6 +273,8 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return 0; ctxt = svc_rdma_recv_ctxt_get(rdma); if (!ctxt) return -ENOMEM; @@ -309,11 +319,10 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; - trace_svcrdma_wc_receive(wc); - /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); + trace_svcrdma_wc_receive(wc, &ctxt->rc_cid); if (wc->status != IB_WC_SUCCESS) goto flushed; @@ -333,15 +342,13 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); - goto out; + return; flushed: post_err: svc_rdma_recv_ctxt_put(rdma, ctxt); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); -out: - svc_xprt_put(&rdma->sc_xprt); } /** @@ -419,7 +426,7 @@ static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) len = 0; first = true; - while (*p != xdr_zero) { + while (xdr_item_is_present(p)) { p = xdr_inline_decode(&rctxt->rc_stream, rpcrdma_readseg_maxsz * sizeof(*p)); if (!p) @@ -466,9 +473,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) if (!p) return false; - handle = be32_to_cpup(p++); - length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); + xdr_decode_rdma_segment(p, &handle, &length, &offset); trace_svcrdma_decode_wseg(handle, length, offset); total += length; @@ -500,7 +505,7 @@ static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) if (!p) return false; rctxt->rc_write_list = p; - while (*p != xdr_zero) { + while (xdr_item_is_present(p)) { if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) return false; ++chcount; @@ -532,12 +537,11 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - rctxt->rc_reply_chunk = p; - if (*p != xdr_zero) { + rctxt->rc_reply_chunk = NULL; + if (xdr_item_is_present(p)) { if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) return false; - } else { - rctxt->rc_reply_chunk = NULL; + rctxt->rc_reply_chunk = p; } return true; } @@ -568,7 +572,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, p += rpcrdma_fixed_maxsz; /* Read list */ - while (*p++ != xdr_zero) { + while (xdr_item_is_present(p++)) { p++; /* position */ if (inv_rkey == xdr_zero) inv_rkey = *p; @@ -578,7 +582,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, } /* Write list */ - while (*p++ != xdr_zero) { + while (xdr_item_is_present(p++)) { segcount = be32_to_cpup(p++); for (i = 0; i < segcount; i++) { if (inv_rkey == xdr_zero) @@ -590,7 +594,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, } /* Reply chunk */ - if (*p++ != xdr_zero) { + if (xdr_item_is_present(p++)) { segcount = be32_to_cpup(p++); for (i = 0; i < segcount; i++) { if (inv_rkey == xdr_zero) @@ -661,27 +665,27 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; - trace_svcrdma_decode_rqst(rdma_argp, hdr_len); + trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len); return hdr_len; out_short: - trace_svcrdma_decode_short_err(rq_arg->len); + trace_svcrdma_decode_short_err(rctxt, rq_arg->len); return -EINVAL; out_version: - trace_svcrdma_decode_badvers_err(rdma_argp); + trace_svcrdma_decode_badvers_err(rctxt, rdma_argp); return -EPROTONOSUPPORT; out_drop: - trace_svcrdma_decode_drop_err(rdma_argp); + trace_svcrdma_decode_drop_err(rctxt, rdma_argp); return 0; out_proc: - trace_svcrdma_decode_badproc_err(rdma_argp); + trace_svcrdma_decode_badproc_err(rctxt, rdma_argp); return -EINVAL; out_inval: - trace_svcrdma_decode_parse_err(rdma_argp); + trace_svcrdma_decode_parse_err(rctxt, rdma_argp); return -EINVAL; } @@ -714,57 +718,16 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->rc_arg.buflen; } -static void svc_rdma_send_error(struct svcxprt_rdma *xprt, - __be32 *rdma_argp, int status) +static void svc_rdma_send_error(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *rctxt, + int status) { - struct svc_rdma_send_ctxt *ctxt; - __be32 *p; - int ret; + struct svc_rdma_send_ctxt *sctxt; - ctxt = svc_rdma_send_ctxt_get(xprt); - if (!ctxt) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) return; - - p = xdr_reserve_space(&ctxt->sc_stream, - rpcrdma_fixed_maxsz * sizeof(*p)); - if (!p) - goto put_ctxt; - - *p++ = *rdma_argp; - *p++ = *(rdma_argp + 1); - *p++ = xprt->sc_fc_credits; - *p = rdma_error; - - switch (status) { - case -EPROTONOSUPPORT: - p = xdr_reserve_space(&ctxt->sc_stream, 3 * sizeof(*p)); - if (!p) - goto put_ctxt; - - *p++ = err_vers; - *p++ = rpcrdma_version; - *p = rpcrdma_version; - trace_svcrdma_err_vers(*rdma_argp); - break; - default: - p = xdr_reserve_space(&ctxt->sc_stream, sizeof(*p)); - if (!p) - goto put_ctxt; - - *p = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); - } - - ctxt->sc_send_wr.num_sge = 1; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; - ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); - if (ret) - goto put_ctxt; - return; - -put_ctxt: - svc_rdma_send_ctxt_put(xprt, ctxt); + svc_rdma_send_error_msg(rdma, sctxt, rctxt, status); } /* By convention, backchannel calls arrive via rdma_msg type @@ -900,13 +863,13 @@ out_readchunk: return 0; out_err: - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; out_postfail: if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 5eb35309ecef..fe54cbe97a46 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -7,6 +7,7 @@ #include <rdma/rw.h> +#include <linux/sunrpc/xdr.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> @@ -144,17 +145,25 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, * demand, and not cached. */ struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; struct ib_cqe cc_cqe; struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; int cc_sqecount; }; +static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { + svc_rdma_cc_cid_init(rdma, &cc->cc_cid); cc->cc_rdma = rdma; - svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; @@ -174,7 +183,6 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, ctxt->rw_nents, dir); svc_rdma_put_rw_ctxt(rdma, ctxt); } - svc_xprt_put(&rdma->sc_xprt); } /* State for sending a Write or Reply chunk. @@ -236,7 +244,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); - trace_svcrdma_wc_write(wc); + trace_svcrdma_wc_write(wc, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -294,7 +302,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_read_info *info = container_of(cc, struct svc_rdma_read_info, ri_cc); - trace_svcrdma_wc_read(wc); + trace_svcrdma_wc_read(wc, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -350,6 +358,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { + trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); if (ret) break; @@ -441,34 +450,32 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 seg_length, seg_handle; - u64 seg_offset; + u32 handle, length; + u64 offset; if (info->wi_seg_no >= info->wi_nsegs) goto out_overflow; - seg_handle = be32_to_cpup(seg); - seg_length = be32_to_cpup(seg + 1); - xdr_decode_hyper(seg + 2, &seg_offset); - seg_offset += info->wi_seg_off; + xdr_decode_rdma_segment(seg, &handle, &length, &offset); + offset += info->wi_seg_off; - write_len = min(remaining, seg_length - info->wi_seg_off); + write_len = min(remaining, length - info->wi_seg_off); ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) return -ENOMEM; constructor(info, write_len, ctxt); - ret = svc_rdma_rw_ctx_init(rdma, ctxt, seg_offset, seg_handle, + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, DMA_TO_DEVICE); if (ret < 0) return -EIO; - trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset); + trace_svcrdma_send_wseg(handle, write_len, offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == seg_length - info->wi_seg_off) { + if (write_len == length - info->wi_seg_off) { seg += 4; info->wi_seg_no++; info->wi_seg_off = 0; @@ -684,35 +691,24 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - unsigned int i; int ret; ret = -EINVAL; info->ri_chunklen = 0; while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { - u32 rs_handle, rs_length; - u64 rs_offset; - - rs_handle = be32_to_cpup(p++); - rs_length = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &rs_offset); + u32 handle, length; + u64 offset; - ret = svc_rdma_build_read_segment(info, rqstp, - rs_handle, rs_length, - rs_offset); + p = xdr_decode_rdma_segment(p, &handle, &length, &offset); + ret = svc_rdma_build_read_segment(info, rqstp, handle, length, + offset); if (ret < 0) break; - trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset); - info->ri_chunklen += rs_length; + trace_svcrdma_send_rseg(handle, length, offset); + info->ri_chunklen += length; } - /* Pages under I/O have been copied to head->rc_pages. - * Prevent their premature release by svc_xprt_release() . - */ - for (i = 0; i < info->ri_readctxt->rc_page_count; i++) - rqstp->rq_pages[i] = NULL; - return ret; } @@ -807,6 +803,26 @@ out: return ret; } +/* Pages under I/O have been copied to head->rc_pages. Ensure they + * are not released by svc_xprt_release() until the I/O is complete. + * + * This has to be done after all Read WRs are constructed to properly + * handle a page that is part of I/O on behalf of two different RDMA + * segments. + * + * Do this only if I/O has been posted. Otherwise, we do indeed want + * svc_xprt_release() to clean things up properly. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + const unsigned int start, + const unsigned int num_pages) +{ + unsigned int i; + + for (i = start; i < num_pages + start; i++) + rqstp->rq_pages[i] = NULL; +} + /** * svc_rdma_recv_read_chunk - Pull a Read chunk from the client * @rdma: controlling RDMA transport @@ -860,6 +876,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); if (ret < 0) goto out_err; + svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); return 0; out_err: diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 38e7c3c8c4a9..7b94d971feb3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -106,7 +106,6 @@ #include <rdma/rdma_cm.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" @@ -123,6 +122,13 @@ svc_rdma_next_send_ctxt(struct list_head *list) sc_list); } +static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -145,6 +151,8 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; @@ -269,34 +277,33 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_send_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt = + container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - trace_svcrdma_wc_send(wc); + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); atomic_inc(&rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); - ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); svc_rdma_send_ctxt_put(rdma, ctxt); if (unlikely(wc->status != IB_WC_SUCCESS)) { set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } - - svc_xprt_put(&rdma->sc_xprt); } /** * svc_rdma_send - Post a single Send WR * @rdma: transport on which to post the WR - * @wr: prepared Send WR to post + * @ctxt: send ctxt with a Send WR ready to post * * Returns zero the Send WR was posted successfully. Otherwise, a * negative errno is returned. */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { + struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; might_sleep(); @@ -321,8 +328,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) continue; } - svc_xprt_get(&rdma->sc_xprt); - trace_svcrdma_post_send(wr); + trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); if (ret) break; @@ -331,7 +337,6 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) trace_svcrdma_sq_post_err(rdma, ret); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_put(&rdma->sc_xprt); wake_up(&rdma->sc_send_wait); return ret; } @@ -375,11 +380,8 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, if (!p) return -EMSGSIZE; - handle = be32_to_cpup(src++); - length = be32_to_cpup(src++); - xdr_decode_hyper(src, &offset); + xdr_decode_rdma_segment(src, &handle, &length, &offset); - *p++ = cpu_to_be32(handle); if (*remaining < length) { /* segment only partly filled */ length = *remaining; @@ -388,8 +390,7 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, /* entire segment was consumed */ *remaining -= length; } - *p++ = cpu_to_be32(length); - xdr_encode_hyper(p, offset); + xdr_encode_rdma_segment(p, handle, length, offset); trace_svcrdma_encode_wseg(handle, length, offset); return len; @@ -801,45 +802,76 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } else { sctxt->sc_send_wr.opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, &sctxt->sc_send_wr); + return svc_rdma_send(rdma, sctxt); } -/* Given the client-provided Write and Reply chunks, the server was not - * able to form a complete reply. Return an RDMA_ERROR message so the - * client can retire this RPC transaction. As above, the Send completion - * routine releases payload pages that were part of a previous RDMA Write. - * - * Remote Invalidation is skipped for simplicity. +/** + * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response + * @rdma: controlling transport context + * @sctxt: Send context for the response + * @rctxt: Receive context for incoming bad message + * @status: negative errno indicating error that occurred + * + * Given the client-provided Read, Write, and Reply chunks, the + * server was not able to parse the Call or form a complete Reply. + * Return an RDMA_ERROR message so the client can retire the RPC + * transaction. + * + * The caller does not have to release @sctxt. It is released by + * Send completion, or by this function on error. */ -static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct svc_rqst *rqstp) +void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status) { - struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; - rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); - xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, - NULL); + rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0); + xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, + sctxt->sc_xprt_buf, NULL); - p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_ERR); + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) - return -ENOMSG; + goto put_ctxt; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rdma_error; - *p = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); + *p = rdma_error; + + switch (status) { + case -EPROTONOSUPPORT: + p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p)); + if (!p) + goto put_ctxt; + + *p++ = err_vers; + *p++ = rpcrdma_version; + *p = rpcrdma_version; + trace_svcrdma_err_vers(*rdma_argp); + break; + default: + p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); + if (!p) + goto put_ctxt; + + *p = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); + } - svc_rdma_save_io_pages(rqstp, ctxt); + /* Remote Invalidation is skipped for simplicity. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_send_wr.opcode = IB_WR_SEND; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + if (svc_rdma_send(rdma, sctxt)) + goto put_ctxt; + return; - ctxt->sc_send_wr.num_sge = 1; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); +put_ctxt: + svc_rdma_send_ctxt_put(rdma, sctxt); } /** @@ -930,15 +962,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret != -E2BIG && ret != -EINVAL) goto err1; - ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); - if (ret < 0) - goto err1; + /* Send completion releases payload pages that were part + * of previously posted RDMA Writes. + */ + svc_rdma_save_io_pages(rqstp, sctxt); + svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; err1: svc_rdma_send_ctxt_put(rdma, sctxt); err0: - trace_svcrdma_send_failed(rqstp, ret); + trace_svcrdma_send_err(rqstp, ret); set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d38be57b00ed..fb044792b571 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -55,7 +55,6 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> @@ -238,65 +237,56 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, svc_xprt_enqueue(&listen_xprt->sc_xprt); } -/* - * Handles events generated on the listening endpoint. These events will be - * either be incoming connect requests or adapter removal events. +/** + * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + * + * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. */ -static int rdma_listen_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, cma_id->context, - rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, &event->param.conn); break; default: - /* NB: No device removal upcall for INADDR_ANY listeners */ - dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } - return 0; } -static int rdma_cma_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +/** + * svc_rdma_cma_handler - Handle CM events on client connections + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + */ +static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: - /* Accept complete */ - svc_xprt_get(xprt); - dprintk("svcrdma: Connection completed on DTO xprt=%p, " - "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", - xprt, cma_id); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, xprt, - rdma_event_msg(event->event), event->event); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); break; default: - dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } return 0; @@ -322,7 +312,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, + listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); @@ -486,7 +476,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; /* Swap out the handler */ - newxprt->sc_cm_id->event_handler = rdma_cma_handler; + newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler; /* Construct RDMA-CM private message */ pmsg.cp_magic = rpcrdma_cmp_magic; @@ -540,24 +530,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -/* - * When connected, an svc_xprt has at least two references: - * - * - A reference held by the cm_id between the ESTABLISHED and - * DISCONNECTED events. If the remote peer disconnected first, this - * reference could be gone. - * - * - A reference held by the svc_recv code that called this function - * as part of close processing. - * - * At a minimum one references should still be held. - */ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); } @@ -567,6 +544,7 @@ static void __svc_rdma_free(struct work_struct *work) container_of(work, struct svcxprt_rdma, sc_work); struct svc_xprt *xprt = &rdma->sc_xprt; + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 18fa6067bb7f..b74e2741f74f 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -561,7 +561,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter msg_iter; - char *kaddr = kmap(page); + char *kaddr; struct kvec iov; int rc; @@ -576,6 +576,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 710bd44eaa49..9a3d9fedd7aa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -935,7 +935,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret = 0; int pending; - if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_CMSG_COMPAT)) return -EOPNOTSUPP; mutex_lock(&tls_ctx->tx_lock); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 27bbcfad9c17..9e93bc201cc0 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1032,7 +1032,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. */ - if (sk->sk_state == TCP_ESTABLISHED) { + if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 042ea9b40c7b..d5280fd6f9c1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -122,7 +122,7 @@ struct xfrm_pol_inexact_bin { /* list containing '*:*' policies */ struct hlist_head hhead; - seqcount_t count; + seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; @@ -155,7 +155,7 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; -static __read_mostly seqcount_t xfrm_policy_hash_generation; +static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; @@ -719,7 +719,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; - seqcount_init(&bin->count); + seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, @@ -1899,7 +1899,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, - seqcount_t *count, + seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; @@ -4157,7 +4157,7 @@ void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); - seqcount_init(&xfrm_policy_hash_generation); + seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP |