diff options
Diffstat (limited to 'net')
80 files changed, 932 insertions, 539 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 496b27588493..e2ed69850489 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp) skb->pkt_type = PACKET_HOST; } - if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && + !netif_is_macvlan_port(vlan_dev) && + !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index a3bffd1ec2b4..70306cc9d814 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) if (signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); @@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c91353841e40..ffed8a1d4f27 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3027,8 +3027,13 @@ static void smp_ready_cb(struct l2cap_chan *chan) BT_DBG("chan %p", chan); + /* No need to call l2cap_chan_hold() here since we already own + * the reference taken in smp_new_conn_cb(). This is just the + * first time that we tie it to a specific pointer. The code in + * l2cap_core.c ensures that there's no risk this function wont + * get called if smp_new_conn_cb was previously called. + */ conn->smp = chan; - l2cap_chan_hold(chan); if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index f7e8dee64fc8..5f3f64553179 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -48,7 +48,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) p->state = state; err = switchdev_port_attr_set(p->dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); } diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index fa53d7a89f48..5396ff08af32 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -50,7 +50,7 @@ void br_init_port(struct net_bridge_port *p) p->config_pending = 0; err = switchdev_port_attr_set(p->dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) netdev_err(p->dev, "failed to set HW ageing time\n"); } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index cc858919108e..aa209b1066c9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -323,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); @@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index ba6eb17226da..10d87753ed87 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -8,6 +8,7 @@ #include <linux/ceph/decode.h> #include <linux/ceph/auth.h> +#include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include "crypto.h" @@ -279,6 +280,15 @@ bad: return -EINVAL; } +static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au) +{ + ceph_crypto_key_destroy(&au->session_key); + if (au->buf) { + ceph_buffer_put(au->buf); + au->buf = NULL; + } +} + static int ceph_x_build_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th, struct ceph_x_authorizer *au) @@ -297,7 +307,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, ceph_crypto_key_destroy(&au->session_key); ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); if (ret) - return ret; + goto out_au; maxlen = sizeof(*msg_a) + sizeof(msg_b) + ceph_x_encrypt_buflen(ticket_blob_len); @@ -309,8 +319,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, if (!au->buf) { au->buf = ceph_buffer_new(maxlen, GFP_NOFS); if (!au->buf) { - ceph_crypto_key_destroy(&au->session_key); - return -ENOMEM; + ret = -ENOMEM; + goto out_au; } } au->service = th->service; @@ -340,7 +350,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), p, end - p); if (ret < 0) - goto out_buf; + goto out_au; p += ret; au->buf->vec.iov_len = p - au->buf->vec.iov_base; dout(" built authorizer nonce %llx len %d\n", au->nonce, @@ -348,9 +358,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, BUG_ON(au->buf->vec.iov_len > maxlen); return 0; -out_buf: - ceph_buffer_put(au->buf); - au->buf = NULL; +out_au: + ceph_x_authorizer_cleanup(au); return ret; } @@ -624,8 +633,7 @@ static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, { struct ceph_x_authorizer *au = (void *)a; - ceph_crypto_key_destroy(&au->session_key); - ceph_buffer_put(au->buf); + ceph_x_authorizer_cleanup(au); kfree(au); } @@ -653,8 +661,7 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } - if (xi->auth_authorizer.buf) - ceph_buffer_put(xi->auth_authorizer.buf); + ceph_x_authorizer_cleanup(&xi->auth_authorizer); kfree(ac->private); ac->private = NULL; @@ -691,8 +698,10 @@ static int ceph_x_sign_message(struct ceph_auth_handshake *auth, struct ceph_msg *msg) { int ret; - if (!auth->authorizer) + + if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, msg, &msg->footer.sig); if (ret < 0) @@ -707,8 +716,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, __le64 sig_check; int ret; - if (!auth->authorizer) + if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN)) return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, msg, &sig_check); if (ret < 0) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 78f098a20796..bcbec33c6a14 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -245,6 +245,8 @@ enum { Opt_nocrc, Opt_cephx_require_signatures, Opt_nocephx_require_signatures, + Opt_cephx_sign_messages, + Opt_nocephx_sign_messages, Opt_tcp_nodelay, Opt_notcp_nodelay, }; @@ -267,6 +269,8 @@ static match_table_t opt_tokens = { {Opt_nocrc, "nocrc"}, {Opt_cephx_require_signatures, "cephx_require_signatures"}, {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_cephx_sign_messages, "cephx_sign_messages"}, + {Opt_nocephx_sign_messages, "nocephx_sign_messages"}, {Opt_tcp_nodelay, "tcp_nodelay"}, {Opt_notcp_nodelay, "notcp_nodelay"}, {-1, NULL} @@ -491,6 +495,12 @@ ceph_parse_options(char *options, const char *dev_name, case Opt_nocephx_require_signatures: opt->flags |= CEPH_OPT_NOMSGAUTH; break; + case Opt_cephx_sign_messages: + opt->flags &= ~CEPH_OPT_NOMSGSIGN; + break; + case Opt_nocephx_sign_messages: + opt->flags |= CEPH_OPT_NOMSGSIGN; + break; case Opt_tcp_nodelay: opt->flags |= CEPH_OPT_TCP_NODELAY; @@ -534,6 +544,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) seq_puts(m, "nocrc,"); if (opt->flags & CEPH_OPT_NOMSGAUTH) seq_puts(m, "nocephx_require_signatures,"); + if (opt->flags & CEPH_OPT_NOMSGSIGN) + seq_puts(m, "nocephx_sign_messages,"); if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) seq_puts(m, "notcp_nodelay,"); @@ -596,11 +608,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; - ceph_messenger_init(&client->msgr, myaddr, - client->supported_features, - client->required_features, - ceph_test_opt(client, NOCRC), - ceph_test_opt(client, TCP_NODELAY)); + ceph_messenger_init(&client->msgr, myaddr); /* subsystems */ err = ceph_monc_init(&client->monc, client); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index d1498224c49d..2e9cab09f37b 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -16,8 +16,10 @@ struct ceph_crypto_key { static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - if (key) + if (key) { kfree(key->key); + key->key = NULL; + } } int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b9b0e3b5da49..9981039ef4ff 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -509,7 +509,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (con->msgr->tcp_nodelay) { + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { int optval = 1; ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, @@ -637,9 +637,6 @@ static int con_close_socket(struct ceph_connection *con) static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); - BUG_ON(msg->con == NULL); - msg->con->ops->put(msg->con); - msg->con = NULL; ceph_msg_put(msg); } @@ -662,15 +659,14 @@ static void reset_connection(struct ceph_connection *con) if (con->in_msg) { BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->ops->put(con); } con->connect_seq = 0; con->out_seq = 0; if (con->out_msg) { + BUG_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } @@ -1205,7 +1201,7 @@ static void prepare_write_message_footer(struct ceph_connection *con) con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) - con->ops->sign_message(con, m); + con->ops->sign_message(m); else m->footer.sig = 0; con->out_kvec[v].iov_len = sizeof(m->footer); @@ -1432,7 +1428,8 @@ static int prepare_write_connect(struct ceph_connection *con) dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(con->msgr->supported_features); + con->out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1527,7 +1524,7 @@ static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; dout("%s %p msg %p\n", __func__, con, msg); @@ -1552,8 +1549,8 @@ static int write_partial_message_data(struct ceph_connection *con) bool need_crc; int ret; - page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, - &last_piece); + page = ceph_msg_data_next(cursor, &page_offset, &length, + &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, !last_piece); if (ret <= 0) { @@ -1564,7 +1561,7 @@ static int write_partial_message_data(struct ceph_connection *con) } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); - need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); + need_crc = ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); @@ -2005,8 +2002,8 @@ static int process_banner(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = con->msgr->supported_features; - u64 req_feat = con->msgr->required_features; + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; u64 server_feat = ceph_sanitize_features( le64_to_cpu(con->in_reply.features)); int ret; @@ -2232,7 +2229,7 @@ static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; - const bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; size_t length; @@ -2246,8 +2243,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; while (cursor->resid) { - page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, - NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { if (do_datacrc) @@ -2258,7 +2254,7 @@ static int read_partial_msg_data(struct ceph_connection *con) if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); - (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); + (void) ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; @@ -2278,7 +2274,7 @@ static int read_partial_message(struct ceph_connection *con) int end; int ret; unsigned int front_len, middle_len, data_len; - bool do_datacrc = !con->msgr->nocrc; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); u64 seq; u32 crc; @@ -2423,7 +2419,7 @@ static int read_partial_message(struct ceph_connection *con) } if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(con, m)) { + con->ops->check_message_signature(m)) { pr_err("read_partial_message %p signature check failed\n", m); return -EBADMSG; } @@ -2438,13 +2434,10 @@ static int read_partial_message(struct ceph_connection *con) */ static void process_message(struct ceph_connection *con) { - struct ceph_msg *msg; + struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; - msg = con->in_msg; con->in_msg = NULL; - con->ops->put(con); /* if first message, set peer_name */ if (con->peer_name.type == 0) @@ -2677,7 +2670,7 @@ more: if (ret <= 0) { switch (ret) { case -EBADMSG: - con->error_msg = "bad crc"; + con->error_msg = "bad crc/signature"; /* fall through */ case -EBADE: ret = -EIO; @@ -2918,10 +2911,8 @@ static void con_fault(struct ceph_connection *con) if (con->in_msg) { BUG_ON(con->in_msg->con != con); - con->in_msg->con = NULL; ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->ops->put(con); } /* Requeue anything that hasn't been acked */ @@ -2952,15 +2943,8 @@ static void con_fault(struct ceph_connection *con) * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, - struct ceph_entity_addr *myaddr, - u64 supported_features, - u64 required_features, - bool nocrc, - bool tcp_nodelay) + struct ceph_entity_addr *myaddr) { - msgr->supported_features = supported_features; - msgr->required_features = required_features; - spin_lock_init(&msgr->global_seq_lock); if (myaddr) @@ -2970,8 +2954,6 @@ void ceph_messenger_init(struct ceph_messenger *msgr, msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); - msgr->nocrc = nocrc; - msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); @@ -2986,6 +2968,15 @@ void ceph_messenger_fini(struct ceph_messenger *msgr) } EXPORT_SYMBOL(ceph_messenger_fini); +static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) +{ + if (msg->con) + msg->con->ops->put(msg->con); + + msg->con = con ? con->ops->get(con) : NULL; + BUG_ON(msg->con != con); +} + static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ @@ -3017,9 +3008,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) return; } - BUG_ON(msg->con != NULL); - msg->con = con->ops->get(con); - BUG_ON(msg->con == NULL); + msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); @@ -3047,16 +3036,15 @@ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; - if (!con) + if (!con) { + dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ + } mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); - BUG_ON(msg->con == NULL); - msg->con->ops->put(msg->con); - msg->con = NULL; msg->hdr.seq = 0; ceph_msg_put(msg); @@ -3080,16 +3068,13 @@ void ceph_msg_revoke(struct ceph_msg *msg) */ void ceph_msg_revoke_incoming(struct ceph_msg *msg) { - struct ceph_connection *con; + struct ceph_connection *con = msg->con; - BUG_ON(msg == NULL); - if (!msg->con) { + if (!con) { dout("%s msg %p null con\n", __func__, msg); - return; /* Message not in our possession */ } - con = msg->con; mutex_lock(&con->mutex); if (con->in_msg == msg) { unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); @@ -3335,9 +3320,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) } if (msg) { BUG_ON(*skip); + msg_con_set(msg, con); con->in_msg = msg; - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); } else { /* * Null message pointer means either we should skip @@ -3384,6 +3368,8 @@ static void ceph_msg_release(struct kref *kref) dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); + msg_con_set(m, NULL); + /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f79ccac6699f..f8f235930d88 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -120,11 +120,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, } #endif /* CONFIG_BLOCK */ -#define osd_req_op_data(oreq, whch, typ, fld) \ - ({ \ - BUG_ON(whch >= (oreq)->r_num_ops); \ - &(oreq)->r_ops[whch].typ.fld; \ - }) +#define osd_req_op_data(oreq, whch, typ, fld) \ +({ \ + struct ceph_osd_request *__oreq = (oreq); \ + unsigned int __whch = (whch); \ + BUG_ON(__whch >= __oreq->r_num_ops); \ + &__oreq->r_ops[__whch].typ.fld; \ +}) static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) @@ -1750,8 +1752,7 @@ static void complete_request(struct ceph_osd_request *req) * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, - struct ceph_connection *con) +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p, *end; struct ceph_osd_request *req; @@ -2807,7 +2808,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg, con); + handle_reply(osdc, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -2849,9 +2850,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } - if (req->r_reply->con) - dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); if (front_len > req->r_reply->front_alloc_len) { @@ -2978,17 +2976,19 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +static int osd_sign_message(struct ceph_msg *msg) { - struct ceph_osd *o = con->private; + struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_sign_message(auth, msg); } -static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +static int osd_check_message_signature(struct ceph_msg *msg) { - struct ceph_osd *o = con->private; + struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_check_message_signature(auth, msg); } @@ -3000,7 +3000,7 @@ static const struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, - .sign_message = sign_message, - .check_message_signature = check_message_signature, + .sign_message = osd_sign_message, + .check_message_signature = osd_check_message_signature, .fault = osd_reset, }; diff --git a/net/core/datagram.c b/net/core/datagram.c index 617088aee21d..d62af69ad844 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/core/dev.c b/net/core/dev.c index ab9b8d0d115e..ae00b894e675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2403,17 +2403,20 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); @@ -6426,11 +6429,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ return -1; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1aa8437ed6c4..f18ae91b652e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -857,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh) struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) - skb = skb_copy(skb, GFP_ATOMIC); + skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); @@ -2215,7 +2215,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; - ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) @@ -2333,7 +2333,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + if (pneigh_net(n) != net) continue; if (idx < s_idx) goto next; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..2e4df84c34a1 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -56,7 +56,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } -static int update_classid(const void *v, struct file *file, unsigned n) +static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); @@ -67,18 +67,25 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void update_classid(struct cgroup_subsys_state *css, void *v) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; + struct css_task_iter it; struct task_struct *p; - cgroup_taskset_for_each(p, tset) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { task_lock(p); - iterate_fd(p->files, 0, update_classid, v); + iterate_fd(p->files, 0, update_classid_sock, v); task_unlock(p); } + css_task_iter_end(&it); +} + +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + update_classid(css, + (void *)(unsigned long)css_cls_state(css)->classid); } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -89,8 +96,11 @@ static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { - css_cls_state(css)->classid = (u32) value; + struct cgroup_cls_state *cs = css_cls_state(css); + + cs->classid = (u32)value; + update_classid(css, (void *)(unsigned long)cs->classid); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 504bd17b7456..34ba7a08876d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1045,15 +1045,156 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + const struct rtnl_link_stats64 *stats; + struct rtnl_link_stats64 temp; + struct nlattr *attr; + + stats = dev_get_stats(dev, &temp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats64(nla_data(attr), stats); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + struct nlattr *vfinfo) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct nlattr *vf, *vfstats; + struct ifla_vf_mac vf_mac; + struct ifla_vf_info ivi; + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. + */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + vf_mac.vf = + vf_vlan.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + return -EMSGSIZE; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + return -EMSGSIZE; + nla_nest_end(skb, vfstats); + nla_nest_end(skb, vf); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +{ + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats; - struct nlattr *attr, *af_spec; + struct nlattr *af_spec; struct rtnl_af_ops *af_ops; struct net_device *upper_dev = netdev_master_upper_dev_get(dev); @@ -1096,18 +1237,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) - goto nla_put_failure; - } + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || @@ -1124,128 +1255,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) - goto nla_put_failure; - - stats = dev_get_stats(dev, &temp); - copy_rtnl_link_stats(nla_data(attr), stats); - - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); - if (attr == NULL) + if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - copy_rtnl_link_stats64(nla_data(attr), stats); if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent - && (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && + ext_filter_mask & RTEXT_FILTER_VF) { int i; - - struct nlattr *vfinfo, *vf, *vfstats; + struct nlattr *vfinfo; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); if (!vfinfo) goto nla_put_failure; for (i = 0; i < num_vfs; i++) { - struct ifla_vf_info ivi; - struct ifla_vf_mac vf_mac; - struct ifla_vf_vlan vf_vlan; - struct ifla_vf_rate vf_rate; - struct ifla_vf_tx_rate vf_tx_rate; - struct ifla_vf_spoofchk vf_spoofchk; - struct ifla_vf_link_state vf_linkstate; - struct ifla_vf_rss_query_en vf_rss_query_en; - struct ifla_vf_stats vf_stats; - struct ifla_vf_trust vf_trust; - - /* - * Not all SR-IOV capable drivers support the - * spoofcheck and "RSS query enable" query. Preset to - * -1 so the user space tool can detect that the driver - * didn't report anything. - */ - ivi.spoofchk = -1; - ivi.rss_query_en = -1; - ivi.trusted = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); - /* The default value for VF link state is "auto" - * IFLA_VF_LINK_STATE_AUTO which equals zero - */ - ivi.linkstate = 0; - if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) - break; - vf_mac.vf = - vf_vlan.vf = - vf_rate.vf = - vf_tx_rate.vf = - vf_spoofchk.vf = - vf_linkstate.vf = - vf_rss_query_en.vf = - vf_trust.vf = ivi.vf; - - memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); - vf_vlan.vlan = ivi.vlan; - vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.max_tx_rate; - vf_rate.min_tx_rate = ivi.min_tx_rate; - vf_rate.max_tx_rate = ivi.max_tx_rate; - vf_spoofchk.setting = ivi.spoofchk; - vf_linkstate.link_state = ivi.linkstate; - vf_rss_query_en.setting = ivi.rss_query_en; - vf_trust.setting = ivi.trusted; - vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || - nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || - nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), - &vf_rate) || - nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), - &vf_tx_rate) || - nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk) || - nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate) || - nla_put(skb, IFLA_VF_RSS_QUERY_EN, - sizeof(vf_rss_query_en), - &vf_rss_query_en) || - nla_put(skb, IFLA_VF_TRUST, - sizeof(vf_trust), &vf_trust)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) goto nla_put_failure; - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, i, - &vf_stats); - vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast) || - nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast)) - goto nla_put_failure; - nla_nest_end(skb, vfstats); - nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } diff --git a/net/core/scm.c b/net/core/scm.c index 3b6899b7d810..8a1741b14302 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -305,6 +305,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) err = put_user(cmlen, &cm->cmsg_len); if (!err) { cmlen = CMSG_SPACE(i*sizeof(int)); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index aa41e6dd6429..152b9c70e252 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4268,7 +4268,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len, + 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..e31dfcee1729 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1530,7 +1530,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1607,7 +1606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - __sk_dst_set(sk, dst); + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1815,7 +1814,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1861,7 +1860,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -2048,9 +2047,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } @@ -2388,7 +2387,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..b96f7a79e544 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } @@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) } if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index db5fc2440a23..9c6d0508e63a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -202,7 +202,9 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -219,7 +221,10 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -387,6 +392,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -453,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * comment in that function for the gory details. -acme */ - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; @@ -488,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. */ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -757,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -856,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -873,12 +883,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b5cf13a28009..41e65804ddf5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 675cf94e04f8..eebf5ac8ce18 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1747,9 +1747,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); } @@ -2004,10 +2004,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, !dn_queue_too_long(scp, queue, flags)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); continue; } diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 4677b6fa6dda..ecc28cff08ab 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -67,7 +67,7 @@ * Returns the size of the result on success, -ve error code otherwise. */ int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry) + const char *options, char **_result, time64_t *_expiry) { struct key *rkey; const struct user_key_payload *upayload; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 35a9788bb3ae..c7d1adca30d8 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -312,7 +312,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) return; out: - WARN_ON_ONCE("HSR: Could not send supervision frame\n"); + WARN_ONCE(1, "HSR: Could not send supervision frame\n"); kfree_skb(skb); } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6baf36e11808..05e4cba14162 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2126,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); - if (!in_dev) { + if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } @@ -2147,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - ip_mc_dec_group(in_dev, group); + if (in_dev) + ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 92dd4b74d513..c3a38353f5dc 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -134,7 +134,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void mroute_clean_tables(struct mr_table *mrt); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -350,7 +350,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -441,10 +441,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -540,10 +536,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1208,7 +1200,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt) +static void mroute_clean_tables(struct mr_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1217,8 +1209,9 @@ static void mroute_clean_tables(struct mr_table *mrt) /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags & VIFF_STATIC)) - vif_delete(mrt, i, 0, &list); + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) + continue; + vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); @@ -1226,7 +1219,7 @@ static void mroute_clean_tables(struct mr_table *mrt) for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); @@ -1261,7 +1254,7 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); } } rtnl_unlock(); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8c0d0bdc2a7c..63e5be0abd86 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -406,10 +406,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->transport_header += iphlen; + if (iph->protocol == IPPROTO_ICMP && + length >= iphlen + sizeof(struct icmphdr)) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); } - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c1728771cf89..c82cca18c90f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -517,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -906,7 +905,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -1134,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd88c3803a6..2d656eef7f8e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4481,19 +4481,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; + int err = -ENOMEM; + int data_len = 0; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size, sk->sk_allocation); + if (size > PAGE_SIZE) { + int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); + + data_len = npages << PAGE_SHIFT; + size = data_len + (size & ~PAGE_MASK); + } + skb = alloc_skb_with_frags(size - data_len, data_len, + PAGE_ALLOC_COSTLY_ORDER, + &err, sk->sk_allocation); if (!skb) goto err; + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_from_msg(skb_put(skb, size), msg, size)) + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4509,7 +4524,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) err_free: kfree_skb(skb); err: - return -ENOMEM; + return err; + } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5667,6 +5683,7 @@ discard: } tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ba09016d1bfd..db003438aaf5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -921,7 +921,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c9c716a483e4..193ba1fa8a9a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -168,7 +168,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data) + if (tp->syn_data && icsk->icsk_retransmits == 1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -176,6 +176,18 @@ static int tcp_write_timeout(struct sock *sk) syn_set = true; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + /* Some middle-boxes may black-hole Fast Open _after_ + * the handshake. Therefore we conservatively disable + * Fast Open on this path on recurring timeouts with + * few or zero bytes acked after Fast Open. + */ + if (tp->syn_data_acked && + tp->bytes_acked <= tp->rx_opt.mss_clamp) { + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); + if (icsk->icsk_retransmits == sysctl_tcp_retries1) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } /* Black hole detection */ tcp_mtu_probing(icsk, sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24ec14f9825c..0c7b0e61b917 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -100,7 +100,6 @@ #include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d84742f003a9..61f26851655c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3642,7 +3642,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, NULL); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); rtnl_unlock(); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 44bb66bde0e2..8ec0df75f1c4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -428,9 +428,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -659,7 +661,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -668,7 +673,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d70b0238f468..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,8 +167,10 @@ ipv4_connected: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ce203b0402be..ea7c4d64a00a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36c5a98b0472..0a37ddc7af51 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -/* - * Special lock-class for __icmpv6_sk: - */ -static struct lock_class_key icmpv6_socket_sk_dst_lock_key; - static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; @@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk[i] = sk; - /* - * Split off their lock-class, because sk->sk_dst_lock - * gets used from softirqs, which is safe for - * __icmpv6_sk (because those never get directly used - * via userspace syscalls), but unsafe for normal sockets. - */ - lockdep_set_class(&sk->sk_dst_lock, - &icmpv6_socket_sk_dst_lock_key); - /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5d1c7cee2cb2..a7ca2cde2ecb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -78,7 +78,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -109,14 +111,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline -void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - __ip6_dst_store(sk, dst, daddr, saddr); -} - -static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { return __sk_dst_check(sk, cookie); @@ -142,14 +136,16 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_dport = inet->inet_dport; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) - __inet6_csk_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return dst; } @@ -175,7 +171,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index eabffbb89795..137fca42aaa6 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -177,7 +177,7 @@ void ip6_tnl_dst_reset(struct ip6_tnl *t) int i; for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); + ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index ad19136086dd..a10e77103c88 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -118,7 +118,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, int cmd); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr6_table *mrt); +static void mroute_clean_tables(struct mr6_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES @@ -334,7 +334,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr6_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -765,10 +765,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1542,7 +1538,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr6_table *mrt) +static void mroute_clean_tables(struct mr6_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1552,8 +1548,9 @@ static void mroute_clean_tables(struct mr6_table *mrt) * Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) - mif6_delete(mrt, i, &list); + if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) + continue; + mif6_delete(mrt, i, &list); } unregister_netdevice_many(&list); @@ -1562,7 +1559,7 @@ static void mroute_clean_tables(struct mr6_table *mrt) */ for (i = 0; i < MFC6_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; write_lock_bh(&mrt_lock); list_del(&c->list); @@ -1625,7 +1622,7 @@ int ip6mr_sk_done(struct sock *sk) net->ipv6.devconf_all); write_unlock_bh(&mrt_lock); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); err = 0; break; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 63e6956917c9..4449ad1f8114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); sk_dst_reset(sk); return opt; @@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -486,6 +493,7 @@ sticky_done: break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_user(opt+1, optval, optlen)) @@ -502,8 +510,10 @@ update: retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 124338a39e29..5ee56d0a8699 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1651,7 +1651,6 @@ out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } @@ -2015,7 +2014,6 @@ out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3e0f855e1bea..d6161e1c48c8 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -556,8 +556,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - struct sk_buff *oskb) + const struct in6_addr *daddr, const struct in6_addr *saddr) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -593,9 +592,6 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) - skb_dst_copy(skb, oskb); - ndisc_send_skb(skb, daddr, saddr); } @@ -682,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, target, target, saddr, skb); + ndisc_send_ns(dev, target, target, saddr); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, target, &mcaddr, saddr, skb); + ndisc_send_ns(dev, target, &mcaddr, saddr); } } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d5efeb87350e..bab4441ed4e4 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -190,7 +190,7 @@ static void nf_ct_frag6_expire(unsigned long data) /* Creation primitives. */ static inline struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, - struct in6_addr *dst, u8 ecn) + struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -200,6 +200,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.user = user; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; local_bh_disable(); @@ -601,7 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use fhdr = (struct frag_hdr *)skb_transport_header(clone); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dc65ec198f7c..99140986e887 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -733,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -839,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -906,6 +909,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: dst_confirm(dst); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 44e21a03cfc3..45f5ae51de65 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) return fq->id == arg->id && fq->user == arg->user && ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst); + ipv6_addr_equal(&fq->daddr, arg->dst) && + (arg->iif == fq->iif || + !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))); } EXPORT_SYMBOL(ip6_frag_match); @@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data) static struct frag_queue * fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, u8 ecn) + const struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; hash = inet6_hash_frag(id, src, dst); @@ -551,7 +555,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq) { int ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6f01fe122abd..826e6aa44f8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -523,7 +523,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); dev_put(work->dev); kfree(work); } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb8f2fa1c7fb..eaf7ac496d50 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -222,7 +222,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = ireq->ir_mark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c5429a636f1a..e7aab561b7b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -120,6 +120,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -235,7 +236,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -255,7 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && @@ -263,9 +265,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -461,7 +463,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); err = net_xmit_eval(err); } @@ -972,6 +975,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1056,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; @@ -1098,13 +1102,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 01bcb49619ee..9da3287a3923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1110,6 +1110,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1263,8 +1264,10 @@ do_udp_sendmsg: opt = NULL; connected = 0; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1373,6 +1376,7 @@ release_dst: out: dst_release(dst); fl6_sock_release(flowlabel); + txopt_put(opt_to_free); if (!err) return len; /* diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fcb2752419c6..435608c4306d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1483,7 +1483,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && iucv_below_msglim(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index aca38d8aed8e..a2c8747d2936 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -631,6 +634,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index b7de0da46acd..ecf0a0196f18 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index a7a80a6b77b0..653d073bae45 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -58,7 +58,7 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index efb736bb6855..e41cd12d9b2d 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -117,7 +117,6 @@ static struct vport_ops ovs_geneve_vport_ops = { .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, .send = dev_queue_xmit, - .owner = THIS_MODULE, }; static int __init ovs_geneve_tnl_init(void) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index c3257d78d3d2..7f8897f33a67 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -89,7 +89,6 @@ static struct vport_ops ovs_gre_vport_ops = { .create = gre_create, .send = dev_queue_xmit, .destroy = ovs_netdev_tunnel_destroy, - .owner = THIS_MODULE, }; static int __init ovs_gre_tnl_init(void) diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index b327368a3848..6b0190b987ec 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -180,9 +180,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); - /* Early release so we can unregister the device */ + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. + */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev); dev_put(vport->dev); - rtnl_delete_link(vport->dev); vport->dev = NULL; rtnl_unlock(); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 0ac0fd004d7e..31cbc8c5c7db 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -71,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name) return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } -int ovs_vport_ops_register(struct vport_ops *ops) +int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; @@ -87,7 +87,7 @@ errout: ovs_unlock(); return err; } -EXPORT_SYMBOL_GPL(ovs_vport_ops_register); +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { @@ -256,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * * @vport: vport to delete. * - * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. ovs_mutex must be held. + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. */ void ovs_vport_del(struct vport *vport) { diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bdfd82a7c064..8ea3a96980ac 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -196,7 +196,13 @@ static inline const char *ovs_vport_name(struct vport *vport) return vport->dev->name; } -int ovs_vport_ops_register(struct vport_ops *ops); +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) + void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 242bce1cf0f3..992396aa635c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2329,8 +2329,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static bool ll_header_truncated(const struct net_device *dev, int len) { /* net device doesn't like empty head */ - if (unlikely(len <= dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n", + if (unlikely(len < dev->hard_header_len)) { + net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", current->comm, len, dev->hard_header_len); return true; } @@ -4109,7 +4109,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; - if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) + if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; if (po->tp_version >= TPACKET_V3 && (int)(req->tp_block_size - @@ -4121,8 +4121,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; - rb->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (unlikely(rb->frames_per_block <= 0)) + rb->frames_per_block = req->tp_block_size / req->tp_frame_size; + if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) diff --git a/net/rds/connection.c b/net/rds/connection.c index d4564036a339..e3b118cae81d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -186,12 +186,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, } } - if (trans == NULL) { - kmem_cache_free(rds_conn_slab, conn); - conn = ERR_PTR(-ENODEV); - goto out; - } - conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); diff --git a/net/rds/send.c b/net/rds/send.c index 827155c2ead1..c9cdb358ea88 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1013,11 +1013,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) release_sock(sk); } - /* racing with another thread binding seems ok here */ + lock_sock(sk); if (daddr == 0 || rs->rs_bound_addr == 0) { + release_sock(sk); ret = -ENOTCONN; /* XXX not a great errno */ goto out; } + release_sock(sk); if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index e0547f521f20..adc555e0323d 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -723,8 +723,10 @@ process_further: if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) + hard > tx) { + call->acks_hard = tx; goto all_acked; + } smp_rmb(); rxrpc_rotate_tx_window(call, hard - 1); diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index a40d3afe93b7..14c4e12c47b0 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f43c8f33f09e..7ec667dd4ce1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -253,7 +253,8 @@ int qdisc_set_default(const char *name) } /* We know handle. Find qdisc among all qdisc's attached to device - (root qdisc, all its children, children of children etc.) + * (root qdisc, all its children, children of children etc.) + * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) @@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry(q, &root->list, list) { + list_for_each_entry_rcu(q, &root->list, list) { if (q->handle == handle) return q; } @@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q) struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); - list_add_tail(&q->list, &root->list); + ASSERT_RTNL(); + list_add_tail_rcu(&q->list, &root->list); } } EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_del(&q->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + ASSERT_RTNL(); + list_del_rcu(&q->list); + } } EXPORT_SYMBOL(qdisc_list_del); @@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; drops = max_t(int, n, 0); + rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) - return; + break; + if (sch->flags & TCQ_F_NOPARENT) + break; + /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { - WARN_ON(parentid != TC_H_ROOT); - return; + WARN_ON_ONCE(parentid != TC_H_ROOT); + break; } cops = sch->ops->cl_ops; if (cops->qlen_notify) { @@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) sch->q.qlen -= n; __qdisc_qstats_drop(sch, drops); } + rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -941,7 +950,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, } lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); if (!netif_is_multiqueue(dev)) - sch->flags |= TCQ_F_ONETXQUEUE; + sch->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->handle = handle; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cb5d4ad32946..e82a1ad80aa5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -737,7 +737,7 @@ static void attach_one_default_qdisc(struct net_device *dev, return; } if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f3cbaecd283a..3e82f047caaf 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; @@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 3811a745452c..ad70ecf57ce7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own @@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e917d27328ea..acb45b8c2a9d 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,6 +209,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); @@ -220,7 +221,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - return ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + rcu_read_lock(); + res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + rcu_read_unlock(); + return res; } /* Returns the dst cache entry for the given source and destination ip @@ -262,7 +266,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, pr_debug("src=%pI6 - ", &fl6->saddr); } - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -321,7 +328,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (baddr) { fl6->saddr = baddr->v6.sin6_addr; fl6->fl6_sport = baddr->v6.sin6_port; - final_p = fl6_update_dst(fl6, np->opt, &final); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); dst = ip6_dst_lookup_flow(sk, fl6, final_p); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 897c01c029ca..03c8256063ec 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,7 +972,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN); if (unlikely(!kaddrs)) return -ENOMEM; @@ -4928,7 +4928,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); - addrs = kmalloc(space_left, GFP_KERNEL); + addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN); if (!addrs) return -ENOMEM; @@ -6458,7 +6458,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sctp_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and @@ -6801,26 +6801,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; + + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (sctp_writeable(sk)) { + struct socket_wq *wq; - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } @@ -7375,6 +7379,13 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) +#include <net/transp_v6.h> +static void sctp_v6_destroy_sock(struct sock *sk) +{ + sctp_destroy_sock(sk); + inet6_destroy_sock(sk); +} + struct proto sctpv6_prot = { .name = "SCTPv6", .owner = THIS_MODULE, @@ -7384,7 +7395,7 @@ struct proto sctpv6_prot = { .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_init_sock, - .destroy = sctp_destroy_sock, + .destroy = sctp_v6_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --git a/net/socket.c b/net/socket.c index dd2c247c99e3..456fadb3d819 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) - return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); + if (!wq || !wq->fasync_list) return -1; - } + switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1086,7 +1079,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dace13d7638e..799e65b944b9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; - unsigned long now = jiffies; - unsigned long expire; + unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); + int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); - if (ctx) - expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ); + if (!ctx || time_after(timeout, ctx->gc_expiry)) + ret = -EACCES; rcu_read_unlock(); - if (!ctx || time_after(now, expire)) - return -EACCES; - return 0; + + return ret; } static int diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 4a2340a54401..5e4f815c2b34 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -41,13 +41,16 @@ static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -static void cache_init(struct cache_head *h) +static void cache_init(struct cache_head *h, struct cache_detail *detail) { time_t now = seconds_since_boot(); INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; + if (now <= detail->flush_time) + /* ensure it isn't already expired */ + now = detail->flush_time + 1; h->last_refresh = now; } @@ -81,7 +84,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, * we might get lose if we need to * cache_put it soon. */ - cache_init(new); + cache_init(new, detail); detail->init(new, key); write_lock(&detail->hash_lock); @@ -116,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); -static void cache_fresh_locked(struct cache_head *head, time_t expiry) +static void cache_fresh_locked(struct cache_head *head, time_t expiry, + struct cache_detail *detail) { + time_t now = seconds_since_boot(); + if (now <= detail->flush_time) + /* ensure it isn't immediately treated as expired */ + now = detail->flush_time + 1; head->expiry_time = expiry; - head->last_refresh = seconds_since_boot(); + head->last_refresh = now; smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */ set_bit(CACHE_VALID, &head->flags); } @@ -149,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - cache_fresh_locked(old, new->expiry_time); + cache_fresh_locked(old, new->expiry_time, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; @@ -162,7 +170,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_put(old, detail); return NULL; } - cache_init(tmp); + cache_init(tmp, detail); detail->init(tmp, old); write_lock(&detail->hash_lock); @@ -173,8 +181,8 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); - cache_fresh_locked(tmp, new->expiry_time); - cache_fresh_locked(old, 0); + cache_fresh_locked(tmp, new->expiry_time, detail); + cache_fresh_locked(old, 0, detail); write_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); @@ -219,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY); + cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, + detail); rv = -ENOENT; } write_unlock(&detail->hash_lock); @@ -487,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - detail->flush_time = LONG_MAX; + time_t now = seconds_since_boot(); + if (detail->flush_time >= now) + now = detail->flush_time + 1; + /* 'now' is the maximum value any 'last_refresh' can have */ + detail->flush_time = now; detail->nextcheck = seconds_since_boot(); cache_flush(); - detail->flush_time = 1; } EXPORT_SYMBOL_GPL(cache_purge); @@ -1436,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, { char tbuf[20]; char *bp, *ep; + time_t then, now; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; @@ -1447,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf, return -EINVAL; bp = tbuf; - cd->flush_time = get_expiry(&bp); - cd->nextcheck = seconds_since_boot(); + then = get_expiry(&bp); + now = seconds_since_boot(); + cd->nextcheck = now; + /* Can only set flush_time to 1 second beyond "now", or + * possibly 1 second beyond flushtime. This is because + * flush_time never goes backwards so it mustn't get too far + * ahead of time. + */ + if (then >= now) { + /* Want to flush everything, so behave like cache_purge() */ + if (cd->flush_time >= now) + now = cd->flush_time + 1; + then = now; + } + + cd->flush_time = then; cache_flush(); *ppos += count; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0c8120229a03..1413cdcc131c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page **ppage = xdr->pages; size_t base = xdr->page_base; unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE; + unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; int slen; int len = 0; @@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } +static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) +{ + if (!wq) + return false; + /* + * There should normally be a memory * barrier here--see + * wq_has_sleeper(). + * + * It appears that isn't currently necessary, though, basically + * because callers all appear to have sufficient memory barriers + * between the time the relevant change is made and the + * time they call these callbacks. + * + * The nfsd code itself doesn't actually explicitly wait on + * these waitqueues, but it may wait on them for example in + * sendpage() or sendmsg() calls. (And those may be the only + * places, since it it uses nonblocking reads.) + * + * Maybe we should add the memory barriers anyway, but these are + * hot paths so we'd need to be convinced there's no sigificant + * penalty. + */ + return waitqueue_active(wq); +} + /* * INET callback when data has been received on the socket. */ @@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk) svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) { + if (sunrpc_waitqueue_active(wq)) { dprintk("RPC svc_write_space: someone sleeping on %p\n", svsk); wake_up_interruptible(wq); @@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible_all(wq); } @@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); } - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } @@ -1367,7 +1392,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs); /* * Initialize socket for RPC use and create svc_sock struct - * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. */ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, @@ -1594,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt) sk->sk_write_space = svsk->sk_owspace; wq = sk_sleep(sk); - if (wq && waitqueue_active(wq)) + if (sunrpc_waitqueue_active(wq)) wake_up_interruptible(wq); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1d1a70498910..2ffaf6a79499 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +442,7 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,7 +467,7 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { /* * Notify TCP that we're limited by the application * window size @@ -478,7 +478,7 @@ static int xs_nospace(struct rpc_task *task) xprt_wait_for_buffer_space(task, xs_nospace_callback); } } else { - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); ret = -ENOTCONN; } @@ -626,7 +626,7 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -715,7 +715,7 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -1618,7 +1618,7 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) + if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) return; xprt_write_space(xprt); diff --git a/net/tipc/link.c b/net/tipc/link.c index 9efbdbde2b08..91aea071ab27 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -191,6 +191,7 @@ void tipc_link_add_bc_peer(struct tipc_link *snd_l, snd_l->ackers++; rcv_l->acked = snd_l->snd_nxt - 1; + snd_l->state = LINK_ESTABLISHED; tipc_link_build_bc_init_msg(uc_l, xmitq); } @@ -206,6 +207,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, rcv_l->state = LINK_RESET; if (!snd_l->ackers) { tipc_link_reset(snd_l); + snd_l->state = LINK_RESET; __skb_queue_purge(xmitq); } } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 552dbaba9cf3..b53246fb0412 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -105,6 +105,7 @@ struct tipc_sock { static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); +static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); @@ -381,6 +382,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; + sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); @@ -470,9 +472,6 @@ static int tipc_release(struct socket *sock) tipc_node_remove_conn(net, dnode, tsk->portid); } - /* Discard any remaining (connection-based) messages in receive queue */ - __skb_queue_purge(&sk->sk_receive_queue); - /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; release_sock(sk); @@ -1515,6 +1514,11 @@ static void tipc_data_ready(struct sock *sk) rcu_read_unlock(); } +static void tipc_sock_destruct(struct sock *sk) +{ + __skb_queue_purge(&sk->sk_receive_queue); +} + /** * filter_connect - Handle all incoming messages for a connection-based socket * @tsk: TIPC socket diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ad2719ad4c1b..70c03271b798 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -158,8 +158,11 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct rtable *rt; - if (skb_headroom(skb) < UDP_MIN_HEADROOM) - pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (skb_headroom(skb) < UDP_MIN_HEADROOM) { + err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (err) + goto tx_error; + } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference_rtnl(b->media_ptr); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 12b886f07982..45aebd966978 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -326,6 +326,118 @@ found: return s; } +/* Support code for asymmetrically connected dgram sockets + * + * If a datagram socket is connected to a socket not itself connected + * to the first socket (eg, /dev/log), clients may only enqueue more + * messages if the present receive queue of the server socket is not + * "too large". This means there's a second writeability condition + * poll and sendmsg need to test. The dgram recv code will do a wake + * up on the peer_wait wait queue of a socket upon reception of a + * datagram which needs to be propagated to sleeping would-be writers + * since these might not have sent anything so far. This can't be + * accomplished via poll_wait because the lifetime of the server + * socket might be less than that of its clients if these break their + * association with it or if the server socket is closed while clients + * are still connected to it and there's no way to inform "a polling + * implementation" that it should let go of a certain wait queue + * + * In order to propagate a wake up, a wait_queue_t of the client + * socket is enqueued on the peer_wait queue of the server socket + * whose wake function does a wake_up on the ordinary client socket + * wait queue. This connection is established whenever a write (or + * poll for write) hit the flow control condition and broken when the + * association to the server socket is dissolved or after a wake up + * was relayed. + */ + +static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, + void *key) +{ + struct unix_sock *u; + wait_queue_head_t *u_sleep; + + u = container_of(q, struct unix_sock, peer_wake); + + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, + q); + u->peer_wake.private = NULL; + + /* relaying can only happen while the wq still exists */ + u_sleep = sk_sleep(&u->sk); + if (u_sleep) + wake_up_interruptible_poll(u_sleep, key); + + return 0; +} + +static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) +{ + struct unix_sock *u, *u_other; + int rc; + + u = unix_sk(sk); + u_other = unix_sk(other); + rc = 0; + spin_lock(&u_other->peer_wait.lock); + + if (!u->peer_wake.private) { + u->peer_wake.private = other; + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); + + rc = 1; + } + + spin_unlock(&u_other->peer_wait.lock); + return rc; +} + +static void unix_dgram_peer_wake_disconnect(struct sock *sk, + struct sock *other) +{ + struct unix_sock *u, *u_other; + + u = unix_sk(sk); + u_other = unix_sk(other); + spin_lock(&u_other->peer_wait.lock); + + if (u->peer_wake.private == other) { + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); + u->peer_wake.private = NULL; + } + + spin_unlock(&u_other->peer_wait.lock); +} + +static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, + struct sock *other) +{ + unix_dgram_peer_wake_disconnect(sk, other); + wake_up_interruptible_poll(sk_sleep(sk), + POLLOUT | + POLLWRNORM | + POLLWRBAND); +} + +/* preconditions: + * - unix_peer(sk) == other + * - association is stable + */ +static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) +{ + int connected; + + connected = unix_dgram_peer_wake_connect(sk, other); + + if (unix_recvq_full(other)) + return 1; + + if (connected) + unix_dgram_peer_wake_disconnect(sk, other); + + return 0; +} + static int unix_writable(const struct sock *sk) { return sk->sk_state != TCP_LISTEN && @@ -431,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion) skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } + + unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; } @@ -666,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1033,6 +1148,8 @@ restart: if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); + unix_state_double_unlock(sk, other); if (other != old_peer) @@ -1434,6 +1551,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +static bool unix_passcred_enabled(const struct socket *sock, + const struct sock *other) +{ + return test_bit(SOCK_PASSCRED, &sock->flags) || + !other->sk_socket || + test_bit(SOCK_PASSCRED, &other->sk_socket->flags); +} + /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket @@ -1444,14 +1569,41 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, { if (UNIXCB(skb).pid) return; - if (test_bit(SOCK_PASSCRED, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } +static int maybe_init_creds(struct scm_cookie *scm, + struct socket *socket, + const struct sock *other) +{ + int err; + struct msghdr msg = { .msg_controllen = 0 }; + + err = scm_send(socket, &msg, scm, false); + if (err) + return err; + + if (unix_passcred_enabled(socket, other)) { + scm->pid = get_pid(task_tgid(current)); + current_uid_gid(&scm->creds.uid, &scm->creds.gid); + } + return err; +} + +static bool unix_skb_scm_eq(struct sk_buff *skb, + struct scm_cookie *scm) +{ + const struct unix_skb_parms *u = &UNIXCB(skb); + + return u->pid == scm->pid && + uid_eq(u->uid, scm->creds.uid) && + gid_eq(u->gid, scm->creds.gid) && + unix_secdata_eq(scm, skb); +} + /* * Send AF_UNIX data. */ @@ -1472,6 +1624,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; int max_level; int data_len = 0; + int sk_locked; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1550,12 +1703,14 @@ restart: goto out_free; } + sk_locked = 0; unix_state_lock(other); +restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; - if (sock_flag(other, SOCK_DEAD)) { + if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error @@ -1563,10 +1718,14 @@ restart: unix_state_unlock(other); sock_put(other); + if (!sk_locked) + unix_state_lock(sk); + err = 0; - unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; + unix_dgram_peer_wake_disconnect_wakeup(sk, other); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -1592,21 +1751,38 @@ restart: goto out_unlock; } - if (unix_peer(other) != sk && unix_recvq_full(other)) { - if (!timeo) { - err = -EAGAIN; - goto out_unlock; + if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + if (timeo) { + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; } - timeo = unix_wait_for_peer(other, timeo); + if (!sk_locked) { + unix_state_unlock(other); + unix_state_double_lock(sk, other); + } - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out_free; + if (unix_peer(sk) != other || + unix_dgram_peer_wake_me(sk, other)) { + err = -EAGAIN; + sk_locked = 1; + goto out_unlock; + } - goto restart; + if (!sk_locked) { + sk_locked = 1; + goto restart_locked; + } } + if (unlikely(sk_locked)) + unix_state_unlock(sk); + if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); @@ -1620,6 +1796,8 @@ restart: return len; out_unlock: + if (sk_locked) + unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); @@ -1741,8 +1919,10 @@ out_err: static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, int offset, size_t size, int flags) { - int err = 0; - bool send_sigpipe = true; + int err; + bool send_sigpipe = false; + bool init_scm = true; + struct scm_cookie scm; struct sock *other, *sk = socket->sk; struct sk_buff *skb, *newskb = NULL, *tail = NULL; @@ -1760,7 +1940,7 @@ alloc_skb: newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) - return err; + goto err; } /* we must acquire readlock as we modify already present @@ -1769,12 +1949,12 @@ alloc_skb: err = mutex_lock_interruptible(&unix_sk(other)->readlock); if (err) { err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; - send_sigpipe = false; goto err; } if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + send_sigpipe = true; goto err_unlock; } @@ -1783,23 +1963,34 @@ alloc_skb: if (sock_flag(other, SOCK_DEAD) || other->sk_shutdown & RCV_SHUTDOWN) { err = -EPIPE; + send_sigpipe = true; goto err_state_unlock; } + if (init_scm) { + err = maybe_init_creds(&scm, socket, other); + if (err) + goto err_state_unlock; + init_scm = false; + } + skb = skb_peek_tail(&other->sk_receive_queue); if (tail && tail == skb) { skb = newskb; - } else if (!skb) { - if (newskb) + } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { + if (newskb) { skb = newskb; - else + } else { + tail = skb; goto alloc_skb; + } } else if (newskb) { /* this is fast path, we don't necessarily need to * call to kfree_skb even though with newskb == NULL * this - does no harm */ consume_skb(newskb); + newskb = NULL; } if (skb_append_pagefrags(skb, page, offset, size)) { @@ -1812,14 +2003,20 @@ alloc_skb: skb->truesize += size; atomic_add(size, &sk->sk_wmem_alloc); - if (newskb) + if (newskb) { + err = unix_scm_to_skb(&scm, skb, false); + if (err) + goto err_state_unlock; + spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, newskb); + spin_unlock(&other->sk_receive_queue.lock); + } unix_state_unlock(other); mutex_unlock(&unix_sk(other)->readlock); other->sk_data_ready(other); - + scm_destroy(&scm); return size; err_state_unlock: @@ -1830,6 +2027,8 @@ err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); + if (!init_scm) + scm_destroy(&scm); return err; } @@ -1992,7 +2191,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); @@ -2000,7 +2199,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -2133,10 +2332,7 @@ unlock: if (check_creds) { /* Never glue messages from different writers */ - if ((UNIXCB(skb).pid != scm.pid) || - !uid_eq(UNIXCB(skb).uid, scm.creds.uid) || - !gid_eq(UNIXCB(skb).gid, scm.creds.gid) || - !unix_secdata_eq(&scm, skb)) + if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ @@ -2472,20 +2668,22 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; writable = unix_writable(sk); - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); - if (unix_recvq_full(other)) - writable = 0; - } - sock_put(other); + if (writable) { + unix_state_lock(sk); + + other = unix_peer(sk); + if (other && unix_peer(other) != sk && + unix_recvq_full(other) && + unix_dgram_peer_wake_me(sk, other)) + writable = 0; + + unix_state_unlock(sk); } if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } |