diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-03 17:49:17 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-03 17:49:17 -0700 |
commit | aa9d4648c2fbb455df7750ade1b73dd9ad9b3690 (patch) | |
tree | bc4590c27e6f30ec0612b28f3f38a539535b9930 /drivers/infiniband/core | |
parent | 906dde0f355bd97c080c215811ae7db1137c4af8 (diff) | |
parent | 8eb19e8e7c8658226d8b7e75728e6dfa2ef32717 (diff) |
Merge tag 'for-linus-ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull rdma updates from Doug Ledford:
"This is a big pull request.
Of note is that I'm sending you the new ioctl API for the rdma
subsystem. We put it up on linux-api@, but didn't get much response.
The API is complex, but it solves two different problems in one go:
1) The bi-directional nature of the RDMA file write calls, which
created the security hole we had to handle (and for which the fix
is now causing problems for systems in production, we were a bit
over zealous in the fix and the ability to open a device, then
fork, then create new queue pairs on the device and use them is
broken).
2) The bloat caused by different vendors implementing extensions to
the base verbs API. Each vendor's hardware is slightly different,
and the hardware might be suitable for one extension but not
another.
By the time we add generic extensions for all the different ways
that the different hardware can offload things, the API becomes
bloated. Things like our completion structs have started to exceed
a cache line in size because of all the elements needed to support
this. That in turn shows up heavily in the performance graphs with
a noticable drop in performance on 100Gigabit links as our
completion structs go from occupying one cache line to 1+.
This API makes things like the completion structs modular in a
very similar way to netlink so that your structs can only include
the items needed for the offloads/features you are actually using
on a given queue pair. In that way we support everything, but only
use what we need, and our structs stay smaller.
The ioctl API is better explained by the posting on linux-api@ than I
can explain it here, so I'll just leave it at that.
The rest of the pull request is typical stuff.
Updates for 4.14 kernel merge window
- Lots of hfi1 driver updates (mixed with a few qib and core updates
as well)
- rxe updates
- various mlx updates
- Set default roce type to RoCEv2
- Several larger fixes for bnxt_re that were too big for -rc
- Several larger fixes for qedr that, likewise, were too big for -rc
- Misc core changes
- Make the hns_roce driver compilable on arches other than aarch64 so
we can more easily debug build issues related to it
- Add rdma-netlink infrastructure updates
- Add automatic IRQ affinity infrastructure
- Add 32bit lid support
- Lots of misc fixes across the subsystem from random people
- Autoloading of RDMA netlink modules
- PCI pool cleanups from Romain Perier
- mlx5 driver feature additions and fixes
- Hardware tag matchine feature
- Fix sleeping in atomic when resolving roce ah
- Add experimental ioctl interface as posted to linux-api@"
* tag 'for-linus-ioctl' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (328 commits)
IB/core: Expose ioctl interface through experimental Kconfig
IB/core: Assign root to all drivers
IB/core: Add completion queue (cq) object actions
IB/core: Add legacy driver's user-data
IB/core: Export ioctl enum types to user-space
IB/core: Explicitly destroy an object while keeping uobject
IB/core: Add macros for declaring methods and attributes
IB/core: Add uverbs merge trees functionality
IB/core: Add DEVICE object and root tree structure
IB/core: Declare an object instead of declaring only type attributes
IB/core: Add new ioctl interface
RDMA/vmw_pvrdma: Fix a signedness
RDMA/vmw_pvrdma: Report network header type in WC
IB/core: Add might_sleep() annotation to ib_init_ah_from_wc()
IB/cm: Fix sleeping in atomic when RoCE is used
IB/core: Add support to finalize objects in one transaction
IB/core: Add a generic way to execute an operation on a uobject
Documentation: Hardware tag matching
IB/mlx5: Support IB_SRQT_TM
net/mlx5: Add XRQ support
...
Diffstat (limited to 'drivers/infiniband/core')
29 files changed, 2718 insertions, 578 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index e3cdafff8ece..b4df164f71a6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,7 +11,8 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o + security.o nldev.o + ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o @@ -31,4 +32,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 437522ca97b4..12523f630b61 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -130,13 +130,11 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) } int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; - if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) @@ -186,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. @@ -326,7 +324,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; /* We fill in what we can, the response will fill the rest */ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index efc94304dee3..77515638c55c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1199,30 +1199,23 @@ int ib_cache_setup_one(struct ib_device *device) device->cache.ports = kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - if (!device->cache.ports) { - err = -ENOMEM; - goto out; - } + if (!device->cache.ports) + return -ENOMEM; err = gid_table_setup_one(device); - if (err) - goto out; + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - err = ib_register_event_handler(&device->cache.event_handler); - if (err) - goto err; - + ib_register_event_handler(&device->cache.event_handler); return 0; - -err: - gid_table_cleanup_one(device); -out: - return err; } void ib_cache_release_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613a3474..4c4b46586af2 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -373,11 +373,19 @@ out: return ret; } -static int cm_alloc_response_msg(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc, - struct ib_mad_send_buf **msg) +static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); +} + +static int cm_create_response_msg_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf *msg) { - struct ib_mad_send_buf *m; struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, @@ -385,27 +393,40 @@ static int cm_alloc_response_msg(struct cm_port *port, if (IS_ERR(ah)) return PTR_ERR(ah); - m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); - if (IS_ERR(m)) { - rdma_destroy_ah(ah); - return PTR_ERR(m); - } - m->ah = ah; - *msg = m; + msg->ah = ah; return 0; } static void cm_free_msg(struct ib_mad_send_buf *msg) { - rdma_destroy_ah(msg->ah); + if (msg->ah) + rdma_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); } +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + int ret; + + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + if (IS_ERR(m)) + return PTR_ERR(m); + + ret = cm_create_response_msg_ah(port, mad_recv_wc, m); + if (ret) { + cm_free_msg(m); + return ret; + } + + *msg = m; + return 0; +} + static void * cm_copy_private_data(const void *private_data, u8 private_data_len) { @@ -1175,6 +1196,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1228,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1257,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -1405,16 +1449,63 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); } +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((req_msg->alt_local_lid) || + (ib_is_opa_gid(&req_msg->alt_local_gid))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, + htonl(ntohs(req_msg->primary_local_lid))); + sa_path_set_slid(primary_path, + htonl(ntohs(req_msg->primary_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + sa_path_set_dlid(primary_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + sa_path_set_slid(primary_path, cpu_to_be32(lid)); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, + htonl(ntohs(req_msg->alt_local_lid))); + sa_path_set_slid(alt_path, + htonl(ntohs(req_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + sa_path_set_dlid(alt_path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + sa_path_set_slid(alt_path, cpu_to_be32(lid)); + } +} + static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, struct sa_path_rec *alt_path) { primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; - sa_path_set_dlid(primary_path, - htonl(ntohs(req_msg->primary_local_lid))); - sa_path_set_slid(primary_path, - htonl(ntohs(req_msg->primary_remote_lid))); primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; @@ -1431,13 +1522,9 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; - sa_path_set_dlid(alt_path, - htonl(ntohs(req_msg->alt_local_lid))); - sa_path_set_slid(alt_path, - htonl(ntohs(req_msg->alt_remote_lid))); alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; @@ -1454,6 +1541,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path); } static u16 cm_get_bth_pkey(struct cm_work *work) @@ -1703,7 +1791,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_lid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1801,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_lid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } @@ -1784,9 +1872,12 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); @@ -1811,16 +1902,19 @@ static int cm_req_handler(struct cm_work *work) dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); } else { - work->path[0].rec_type = SA_PATH_REC_TYPE_IB; + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } - if (req_msg->alt_local_lid) + if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } - if (req_msg->alt_local_lid) { + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { @@ -2424,7 +2518,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, @@ -2432,7 +2527,8 @@ static int cm_dreq_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: @@ -2843,6 +2939,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2957,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; @@ -2924,16 +3031,29 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_lap); +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); + sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); + } else { + lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + sa_path_set_dlid(path, cpu_to_be32(lid)); + + lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + sa_path_set_slid(path, cpu_to_be32(lid)); + } +} + static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { - memset(path, 0, sizeof *path); - path->rec_type = SA_PATH_REC_TYPE_IB; path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; - sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); - sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); @@ -2947,6 +3067,7 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); } static int cm_lap_handler(struct cm_work *work) @@ -2965,6 +3086,11 @@ static int cm_lap_handler(struct cm_work *work) return -EINVAL; param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &lap_msg->alt_local_gid); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; @@ -2980,7 +3106,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); - if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, @@ -2990,7 +3117,8 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); - if (ib_post_send_mad(msg, NULL)) + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: @@ -4201,7 +4329,7 @@ static int __init ib_cm_init(void) goto error1; } - cm.wq = create_workqueue("ib_cm"); + cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0eb393237ba2..852c8fec8088 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -72,6 +72,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", @@ -3998,7 +3999,8 @@ static void iboe_mcast_work_handler(struct work_struct *work) kfree(mw); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4008,8 +4010,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; + mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4050,7 +4052,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, goto out1; } - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) @@ -4066,8 +4070,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; @@ -4280,8 +4282,12 @@ static void cma_add_one(struct ib_device *device) for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); - cma_dev->default_gid_type[i - rdma_start_port(device)] = - find_first_bit(&supported_gids, BITS_PER_LONG); + if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } @@ -4452,9 +4458,8 @@ out: return skb->len; } -static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, +static const struct rdma_nl_cbs cma_cb_table[] = { + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; static int cma_init_net(struct net *net) @@ -4506,9 +4511,7 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4525,7 +4528,7 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); + rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); @@ -4534,5 +4537,7 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..a1d687a664f8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include <linux/cgroup_rdma.h> #include <rdma/ib_verbs.h> +#include <rdma/opa_addr.h> #include <rdma/ib_mad.h> #include "mad_priv.h" @@ -102,6 +103,14 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, roce_netdev_callback cb, void *cookie); +typedef int (*nldev_callback)(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx); + +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE @@ -179,8 +188,8 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int ibnl_init(void); -void ibnl_cleanup(void); +int rdma_nl_init(void); +void rdma_nl_exit(void); /** * Check if there are any listeners to the netlink group @@ -190,11 +199,14 @@ void ibnl_cleanup(void); int ibnl_chk_listeners(unsigned int group); int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_nl_handle_ip_res_resp(struct sk_buff *skb, - struct netlink_callback *cb); + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int ib_get_cached_subnet_prefix(struct ib_device *device, u8 port_num, @@ -301,4 +313,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, return 0; } #endif + +struct ib_device *__ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 221468f77128..84fc32a2c8b3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } +struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; @@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } - static int alloc_name(char *name) { unsigned long *inuse; @@ -326,10 +336,10 @@ static int read_port_immutable(struct ib_device *device) return 0; } -void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->get_dev_fw_str) - dev->get_dev_fw_str(dev, str, str_len); + dev->get_dev_fw_str(dev, str); else str[0] = '\0'; } @@ -395,6 +405,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, } /** + * __dev_new_index - allocate an device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that there are less than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. + */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + +/** * ib_register_device - Register an IB device with IB core * @device:Device to register * @@ -489,9 +523,10 @@ int ib_register_device(struct ib_device *device, device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); + device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); @@ -578,7 +613,7 @@ int ib_register_client(struct ib_client *client) mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) - if (client->add && !add_client_context(device, client)) + if (!add_client_context(device, client) && client->add) client->add(device); down_write(&lists_rwsem); @@ -712,7 +747,7 @@ EXPORT_SYMBOL(ib_set_client_data); * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ -int ib_register_event_handler (struct ib_event_handler *event_handler) +void ib_register_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; @@ -720,8 +755,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler) list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_register_event_handler); @@ -732,15 +765,13 @@ EXPORT_SYMBOL(ib_register_event_handler); * Unregister an event handler registered with * ib_register_event_handler(). */ -int ib_unregister_event_handler(struct ib_event_handler *event_handler) +void ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); @@ -894,6 +925,31 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, } /** + * ib_enum_all_devs - enumerate all ib_devices + * @cb: Callback to call for each found ib_device + * + * Enumerates all ib_devices and calls callback() on each device. + */ +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ib_device *dev; + unsigned int idx = 0; + int ret = 0; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) { + ret = nldev_cb(dev, skb, cb, idx); + if (ret) + break; + idx++; + } + + up_read(&lists_rwsem); + return ret; +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -945,14 +1001,17 @@ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { - if (!device->modify_port) - return -ENOSYS; + int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - return device->modify_port(device, port_num, port_modify_mask, - port_modify); + if (device->modify_port) + rc = device->modify_port(device, port_num, port_modify_mask, + port_modify); + else + rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS; + return rc; } EXPORT_SYMBOL(ib_modify_port); @@ -1087,29 +1146,21 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, + .doit = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp, - .module = THIS_MODULE }, + .doit = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, }; -static int ib_add_ibnl_clients(void) -{ - return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), - ibnl_ls_cb_table); -} - -static void ib_remove_ibnl_clients(void) -{ - ibnl_remove_client(RDMA_NL_LS); -} - static int __init ib_core_init(void) { int ret; @@ -1131,9 +1182,9 @@ static int __init ib_core_init(void) goto err_comp; } - ret = ibnl_init(); + ret = rdma_nl_init(); if (ret) { - pr_warn("Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface: err %d\n", ret); goto err_sysfs; } @@ -1155,24 +1206,18 @@ static int __init ib_core_init(void) goto err_mad; } - ret = ib_add_ibnl_clients(); - if (ret) { - pr_warn("Couldn't register ibnl clients\n"); - goto err_sa; - } - ret = register_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); - goto err_ibnl_clients; + goto err_sa; } + nldev_init(); + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ib_cache_setup(); return 0; -err_ibnl_clients: - ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1180,7 +1225,7 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - ibnl_cleanup(); + rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -1192,18 +1237,21 @@ err: static void __exit ib_core_cleanup(void) { - unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); - ib_remove_ibnl_clients(); + nldev_exit(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); - ibnl_cleanup(); + rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + module_init(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 31661b5c1743..fcf42f6bb82a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct ibnl_client_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, @@ -1175,13 +1175,9 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - - ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), - iwcm_nl_cb_table); - if (ret) - pr_err("iw_cm: couldn't register netlink callbacks\n"); - - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); if (!iwcm_wq) return -ENOMEM; @@ -1200,9 +1196,11 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - ibnl_remove_client(RDMA_NL_IWCM); + rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + module_init(iw_cm_init); module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index a0e7c16d8bd8..30825bb9b8e9 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -42,7 +42,6 @@ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -EXPORT_SYMBOL(iwpm_valid_pid); /* * iwpm_register_pid - Send a netlink query to user space @@ -104,7 +103,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -122,7 +121,6 @@ pid_query_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_register_pid); /* * iwpm_add_mapping - Send a netlink add mapping message @@ -174,7 +172,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -191,7 +189,6 @@ add_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_mapping); /* * iwpm_add_and_query_mapping - Send a netlink add and query @@ -251,7 +248,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -267,7 +264,6 @@ query_mapping_error: iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping); /* * iwpm_remove_mapping - Send a netlink remove mapping message @@ -312,7 +308,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -328,7 +324,6 @@ remove_mapping_error: dev_kfree_skb_any(skb); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapping); /* netlink attribute policy for the received response to register pid request */ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { @@ -397,7 +392,6 @@ register_pid_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_register_pid_cb); /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { @@ -466,7 +460,6 @@ add_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_mapping_cb); /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ @@ -558,7 +551,6 @@ query_mapping_response_exit: up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); /* * iwpm_remote_info_cb - Process a port mapper message, containing @@ -627,7 +619,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) "remote_info: Mapped remote sockaddr:"); return ret; } -EXPORT_SYMBOL(iwpm_remote_info_cb); /* netlink attribute policy for the received request for mapping info */ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { @@ -677,7 +668,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } -EXPORT_SYMBOL(iwpm_mapping_info_cb); /* netlink attribute policy for the received mapping info ack */ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { @@ -707,7 +697,6 @@ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); return 0; } -EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); /* netlink attribute policy for the received port mapper error message */ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { @@ -751,4 +740,3 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } -EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index f13870e69ccd..c81c55942626 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -54,8 +54,6 @@ static struct iwpm_admin_data iwpm_admin; int iwpm_init(u8 nl_client) { int ret = 0; - if (iwpm_valid_client(nl_client)) - return -EINVAL; mutex_lock(&iwpm_admin_lock); if (atomic_read(&iwpm_admin.refcount) == 0) { iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE * @@ -83,7 +81,6 @@ init_exit: } return ret; } -EXPORT_SYMBOL(iwpm_init); static void free_hash_bucket(void); static void free_reminfo_bucket(void); @@ -109,7 +106,6 @@ int iwpm_exit(u8 nl_client) iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } -EXPORT_SYMBOL(iwpm_exit); static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); @@ -148,7 +144,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_create_mapinfo); int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) @@ -184,7 +179,6 @@ remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_remove_mapinfo); static void free_hash_bucket(void) { @@ -297,7 +291,6 @@ get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } -EXPORT_SYMBOL(iwpm_get_remote_info); struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) @@ -383,15 +376,11 @@ int iwpm_get_nlmsg_seq(void) int iwpm_valid_client(u8 nl_client) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return 0; return iwpm_admin.client_list[nl_client]; } void iwpm_set_valid(u8 nl_client, int valid) { - if (nl_client >= RDMA_NL_NUM_CLIENTS) - return; iwpm_admin.client_list[nl_client] = valid; } @@ -608,7 +597,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; - ret = ibnl_unicast(skb, nlh, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -637,7 +626,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 0d3cca0a8890..e5cf09c66fe6 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -64,7 +64,7 @@ struct mad_rmpp_recv { __be64 tid; u32 src_qp; - u16 slid; + u32 slid; u8 mgmt_class; u8 class_version; u8 method; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 94931c474d41..e685148dd3e6 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. * Copyright (c) 2010 Voltaire Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -37,239 +38,267 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> +#include <linux/module.h> #include "core_priv.h" -struct ibnl_client { - struct list_head list; - int index; - int nops; - const struct ibnl_client_cbs *cb_table; -}; +#include "core_priv.h" -static DEFINE_MUTEX(ibnl_mutex); +static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; -static LIST_HEAD(client_list); +static struct { + const struct rdma_nl_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int ibnl_chk_listeners(unsigned int group) +int rdma_nl_chk_listeners(unsigned int group) { - if (netlink_has_listeners(nls, group) == 0) - return -1; - return 0; + return (netlink_has_listeners(nls, group)) ? 0 : -1; } +EXPORT_SYMBOL(rdma_nl_chk_listeners); -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]) +static bool is_nl_msg_valid(unsigned int type, unsigned int op) { - struct ibnl_client *cur; - struct ibnl_client *nl_client; + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS - 1] = { + RDMA_NL_RDMA_CM_NUM_OPS, + RDMA_NL_IWPM_NUM_OPS, + 0, + RDMA_NL_LS_NUM_OPS, + RDMA_NLDEV_NUM_OPS }; - nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); - if (!nl_client) - return -ENOMEM; + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); - nl_client->index = index; - nl_client->nops = nops; - nl_client->cb_table = cb_table; + if (type > RDMA_NL_NUM_CLIENTS - 1) + return false; - mutex_lock(&ibnl_mutex); + return (op < max_num_ops[type - 1]) ? true : false; +} - list_for_each_entry(cur, &client_list, list) { - if (cur->index == index) { - pr_warn("Client for %d already exists\n", index); - mutex_unlock(&ibnl_mutex); - kfree(nl_client); - return -EINVAL; - } +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + cb_table = rdma_nl_types[type].cb_table; +#ifdef CONFIG_MODULES + if (!cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", type); + mutex_lock(&rdma_nl_mutex); + cb_table = rdma_nl_types[type].cb_table; } +#endif - list_add_tail(&nl_client->list, &client_list); - - mutex_unlock(&ibnl_mutex); - - return 0; + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + return false; + return true; } -EXPORT_SYMBOL(ibnl_add_client); -int ibnl_remove_client(int index) +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]) { - struct ibnl_client *cur, *next; - - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - if (cur->index == index) { - list_del(&(cur->list)); - mutex_unlock(&ibnl_mutex); - kfree(cur); - return 0; - } + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; } - pr_warn("Can't remove callback for client idx %d. Not found\n", index); - mutex_unlock(&ibnl_mutex); - return -EINVAL; + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } + + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); +} +EXPORT_SYMBOL(rdma_nl_register); + +void rdma_nl_unregister(unsigned int index) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_remove_client); +EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), - len, flags); + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); if (!*nlh) - goto out_nlmsg_trim; - (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; + return NULL; return nlmsg_data(*nlh); - -out_nlmsg_trim: - nlmsg_trim(skb, prev_tail); - return NULL; } EXPORT_SYMBOL(ibnl_put_msg); int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type) { - unsigned char *prev_tail; - - prev_tail = skb_tail_pointer(skb); - if (nla_put(skb, type, len, data)) - goto nla_put_failure; - nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail; + if (nla_put(skb, type, len, data)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } return 0; - -nla_put_failure: - nlmsg_trim(skb, prev_tail - nlh->nlmsg_len); - return -EMSGSIZE; } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct ibnl_client *client; int type = nlh->nlmsg_type; - int index = RDMA_NL_GET_CLIENT(type); + unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_valid(index, op)) + return -EINVAL; + + cb_table = rdma_nl_types[index].cb_table; - list_for_each_entry(client, &client_list, list) { - if (client->index == index) { - if (op >= client->nops || !client->cb_table[op].dump) - return -EINVAL; - - /* - * For response or local service set_timeout request, - * there is no need to use netlink_dump_start. - */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - (index == RDMA_NL_LS && - op == RDMA_NL_LS_OP_SET_TIMEOUT)) { - struct netlink_callback cb = { - .skb = skb, - .nlh = nlh, - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - - return cb.dump(skb, &cb); - } - - { - struct netlink_dump_control c = { - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - return netlink_dump_start(nls, skb, nlh, &c); - } - } + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + /* FIXME: Convert IWCM to properly handle doit callbacks */ + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || + index == RDMA_NL_IWCM) { + struct netlink_dump_control c = { + .dump = cb_table[op].dump, + }; + return netlink_dump_start(nls, skb, nlh, &c); } - pr_info("Index %d wasn't found in client list\n", index); - return -EINVAL; + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); + + return 0; } -static void ibnl_rcv_reply_skb(struct sk_buff *skb) +/* + * This function is similar to netlink_rcv_skb with one exception: + * It calls to the callback for the netlink messages without NLM_F_REQUEST + * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed + * for that consumer only. + */ +static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; - int msglen; + int err; - /* - * Process responses until there is no more message or the first - * request. Generally speaking, it is not recommended to mix responses - * with requests. - */ while (skb->len >= nlmsg_total_size(0)) { + int msglen; + nlh = nlmsg_hdr(skb); + err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) - return; - - /* Handle response only */ - if (nlh->nlmsg_flags & NLM_F_REQUEST) - return; - - ibnl_rcv_msg(skb, nlh, NULL); + return 0; + /* + * Generally speaking, the only requests are handled + * by the kernel, but RDMA_NL_LS is different, because it + * runs backward netlink scheme. Kernel initiates messages + * and waits for reply with data to keep pathrecord cache + * in sync. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && + (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh, &extack); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err, &extack); + +skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } + + return 0; } -static void ibnl_rcv(struct sk_buff *skb) +static void rdma_nl_rcv(struct sk_buff *skb) { - mutex_lock(&ibnl_mutex); - ibnl_rcv_reply_skb(skb); - netlink_rcv_skb(skb, &ibnl_rcv_msg); - mutex_unlock(&ibnl_mutex); + mutex_lock(&rdma_nl_mutex); + rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); + mutex_unlock(&rdma_nl_mutex); } -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast); + +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) { int err; err = netlink_unicast(nls, skb, pid, 0); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast); +EXPORT_SYMBOL(rdma_nl_unicast_wait); -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) { return nlmsg_multicast(nls, skb, 0, group, flags); } -EXPORT_SYMBOL(ibnl_multicast); +EXPORT_SYMBOL(rdma_nl_multicast); -int __init ibnl_init(void) +int __init rdma_nl_init(void) { struct netlink_kernel_cfg cfg = { - .input = ibnl_rcv, + .input = rdma_nl_rcv, }; nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); - if (!nls) { - pr_warn("Failed to create netlink socket\n"); + if (!nls) return -ENOMEM; - } nls->sk_sndtimeo = 10 * HZ; return 0; } -void ibnl_cleanup(void) +void rdma_nl_exit(void) { - struct ibnl_client *cur, *next; + int idx; - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - list_del(&(cur->list)); - kfree(cur); - } - mutex_unlock(&ibnl_mutex); + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); netlink_kernel_release(nls); } + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c new file mode 100644 index 000000000000..3ba24c428c3b --- /dev/null +++ b/drivers/infiniband/core/nldev.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <net/netlink.h> +#include <rdma/rdma_netlink.h> + +#include "core_priv.h" + +static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, +}; + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) + return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; + return 0; +} + +static int fill_port_info(struct sk_buff *msg, + struct ib_device *device, u32 port) +{ + struct ib_port_attr attr; + int ret; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; + return 0; +} + +static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_dev_info(msg, device); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int _nldev_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, NLM_F_MULTI); + + if (fill_dev_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: cb->args[0] = idx; + return skb->len; +} + +static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* + * There is no need to take lock, because + * we are relying on ib_core's lists_rwsem + */ + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); +} + +static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + u32 port; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_port_info(msg, device, port); + if (err) { + nlmsg_free(msg); + return err; + } + + nlmsg_end(msg, nlh); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int nldev_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 idx = 0; + u32 ifindex; + int err; + u32 p; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = __ib_device_get_by_index(ifindex); + if (!device) + return -EINVAL; + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + /* + * The dumpit function returns all information from specific + * index. This specific index is taken from the netlink + * messages request sent by user and it is available + * in cb->args[0]. + * + * Usually, the user doesn't fill this field and it causes + * to return everything. + * + */ + if (idx < start) { + idx++; + continue; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET), + 0, NLM_F_MULTI); + + if (fill_port_info(skb, device, p)) { + nlmsg_cancel(skb, nlh); + goto out; + } + idx++; + nlmsg_end(skb, nlh); + } + +out: cb->args[0] = idx; + return skb->len; +} + +static const struct rdma_nl_cbs nldev_cb_table[] = { + [RDMA_NLDEV_CMD_GET] = { + .doit = nldev_get_doit, + .dump = nldev_get_dumpit, + }, + [RDMA_NLDEV_CMD_PORT_GET] = { + .doit = nldev_port_get_doit, + .dump = nldev_port_get_dumpit, + }, +}; + +void __init nldev_init(void) +{ + rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); +} + +void __exit nldev_exit(void) +{ + rdma_nl_unregister(RDMA_NL_NLDEV); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 41c31a2bf093..85b5ee4defa4 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -35,10 +35,57 @@ #include <rdma/ib_verbs.h> #include <rdma/uverbs_types.h> #include <linux/rcupdate.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); @@ -404,6 +451,41 @@ int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) return ret; } +static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +} + +static const struct uverbs_obj_type null_obj_type = { + .type_class = &((const struct uverbs_obj_type_class){ + .remove_commit = null_obj_type_class_remove_commit, + /* be cautious */ + .needs_kfree_rcu = true}), +}; + +int rdma_explicit_destroy(struct ib_uobject *uobject) +{ + int ret; + struct ib_ucontext *ucontext = uobject->context; + + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + return ret; + + uobject->type = &null_obj_type; + + up_read(&ucontext->cleanup_rwsem); + return 0; +} + static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { uverbs_uobject_add(uobj); @@ -625,3 +707,100 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .needs_kfree_rcu = false, }; +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. + */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1b82e7ff7fe8..1efcf93238dd 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -39,9 +39,15 @@ #include <linux/idr.h> #include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); /* * These functions initialize the context and cleanups its uobjects. * The context has a list of objects which is protected by a mutex @@ -75,4 +81,40 @@ void uverbs_uobject_put(struct ib_uobject *uobject); */ void uverbs_close_fd(struct file *f); +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. + */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is very NOT + * recommended to have several finalize actions which have side effects. + * For example, it's NOT recommended to have a certain action which has both + * a commit action and a destroy action or two destroy objects in the same + * action. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object. + */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); + #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 94a9eefb3cfc..90e3889b7fbe 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -44,6 +44,8 @@ static struct workqueue_struct *gid_cache_wq; +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 70fa4cabe48e..ab5e1024fea9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> +#include <rdma/opa_addr.h> #include "sa.h" #include "core_priv.h" @@ -861,7 +862,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else @@ -1021,9 +1022,9 @@ static void ib_nl_request_timeout(struct work_struct *work) } int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; const struct nlattr *attr; unsigned long flags; @@ -1033,8 +1034,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -1098,9 +1098,9 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) } int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; struct ib_sa_query *query; struct ib_mad_send_buf *send_buf; @@ -1109,8 +1109,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -1241,6 +1240,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(ah_attr, true); + rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); @@ -1420,7 +1424,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } @@ -2290,12 +2294,15 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); if (port_attr.grh_required) { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); @@ -2410,8 +2417,7 @@ static void ib_sa_add_one(struct ib_device *device) */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); - if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7ebe1ef23652..abc5ab581f82 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_get_device_fw_str(dev, buf, PAGE_SIZE); - strlcat(buf, "\n", PAGE_SIZE); + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c86a19..f2a7f62c2834 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef835bd..eb85b546e223 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c8914c..c1696e6084b2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 64d494a64daf..37c8903e7fd0 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,6 +100,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; }; struct ib_uverbs_event_queue { @@ -218,6 +219,8 @@ int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj); +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 739bd69ef1d4..e0cb99860934 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -91,9 +91,10 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err; } - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); if (ret) @@ -275,8 +276,14 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; - resp.sm_lid = attr.sm_lid; + + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; @@ -313,9 +320,10 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(pd), file->ucontext); if (IS_ERR(uobj)) @@ -482,9 +490,10 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); mutex_lock(&file->device->xrcd_tree_mutex); @@ -646,9 +655,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; @@ -740,7 +750,8 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long) cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) return -EINVAL; @@ -1080,7 +1091,8 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -1161,9 +1173,10 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) @@ -1185,7 +1198,8 @@ out: return ret ? ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1199,7 +1213,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_lid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1243,7 +1260,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; @@ -1383,8 +1400,9 @@ static int create_qp(struct ib_uverbs_file *file, attr.rwq_ind_tbl = ind_tbl; } - if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + - sizeof(cmd->reserved1)) && cmd->reserved1) { + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { ret = -EOPNOTSUPP; goto err_put; } @@ -1420,7 +1438,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, cmd->srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { + if (!srq || srq->srq_type == IB_SRQT_XRC) { ret = -EINVAL; goto err_put; } @@ -1482,11 +1500,21 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS | - IB_QP_CREATE_CVLAN_STRIPPING)) { + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN)) { ret = -EINVAL; goto err_put; } + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + buf = (void *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, @@ -1722,9 +1750,10 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), file->ucontext); @@ -1791,6 +1820,28 @@ err_put: return ret; } +static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, + struct rdma_ah_attr *rdma_attr) +{ + const struct ib_global_route *grh; + + uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr); + uverb_attr->sl = rdma_ah_get_sl(rdma_attr); + uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr); + uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr); + uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) & + IB_AH_GRH); + if (uverb_attr->is_global) { + grh = rdma_ah_read_grh(rdma_attr); + memcpy(uverb_attr->dgid, grh->dgid.raw, 16); + uverb_attr->flow_label = grh->flow_label; + uverb_attr->sgid_index = grh->sgid_index; + uverb_attr->hop_limit = grh->hop_limit; + uverb_attr->traffic_class = grh->traffic_class; + } + uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr); +} + ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_device *ib_dev, const char __user *buf, int in_len, @@ -1801,7 +1852,6 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; - const struct ib_global_route *grh; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1851,39 +1901,8 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; - resp.dest.dlid = rdma_ah_get_dlid(&attr->ah_attr); - resp.dest.sl = rdma_ah_get_sl(&attr->ah_attr); - resp.dest.src_path_bits = rdma_ah_get_path_bits(&attr->ah_attr); - resp.dest.static_rate = rdma_ah_get_static_rate(&attr->ah_attr); - resp.dest.is_global = !!(rdma_ah_get_ah_flags(&attr->ah_attr) & - IB_AH_GRH); - if (resp.dest.is_global) { - grh = rdma_ah_read_grh(&attr->ah_attr); - memcpy(resp.dest.dgid, grh->dgid.raw, 16); - resp.dest.flow_label = grh->flow_label; - resp.dest.sgid_index = grh->sgid_index; - resp.dest.hop_limit = grh->hop_limit; - resp.dest.traffic_class = grh->traffic_class; - } - resp.dest.port_num = rdma_ah_get_port_num(&attr->ah_attr); - - resp.alt_dest.dlid = rdma_ah_get_dlid(&attr->alt_ah_attr); - resp.alt_dest.sl = rdma_ah_get_sl(&attr->alt_ah_attr); - resp.alt_dest.src_path_bits = rdma_ah_get_path_bits(&attr->alt_ah_attr); - resp.alt_dest.static_rate - = rdma_ah_get_static_rate(&attr->alt_ah_attr); - resp.alt_dest.is_global - = !!(rdma_ah_get_ah_flags(&attr->alt_ah_attr) & - IB_AH_GRH); - if (resp.alt_dest.is_global) { - grh = rdma_ah_read_grh(&attr->alt_ah_attr); - memcpy(resp.alt_dest.dgid, grh->dgid.raw, 16); - resp.alt_dest.flow_label = grh->flow_label; - resp.alt_dest.sgid_index = grh->sgid_index; - resp.alt_dest.hop_limit = grh->hop_limit; - resp.alt_dest.traffic_class = grh->traffic_class; - } - resp.alt_dest.port_num = rdma_ah_get_port_num(&attr->alt_ah_attr); + copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr); + copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr); resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; @@ -1917,6 +1936,29 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask) } } +static void copy_ah_attr_from_uverbs(struct ib_device *dev, + struct rdma_ah_attr *rdma_attr, + struct ib_uverbs_qp_dest *uverb_attr) +{ + rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num); + if (uverb_attr->is_global) { + rdma_ah_set_grh(rdma_attr, NULL, + uverb_attr->flow_label, + uverb_attr->sgid_index, + uverb_attr->hop_limit, + uverb_attr->traffic_class); + rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid); + } else { + rdma_ah_set_ah_flags(rdma_attr, 0); + } + rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid); + rdma_ah_set_sl(rdma_attr, uverb_attr->sl); + rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits); + rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate); + rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num); + rdma_ah_set_make_grd(rdma_attr, false); +} + static int modify_qp(struct ib_uverbs_file *file, struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata) { @@ -1964,48 +2006,12 @@ static int modify_qp(struct ib_uverbs_file *file, attr->rate_limit = cmd->rate_limit; if (cmd->base.attr_mask & IB_QP_AV) - attr->ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); - if (cmd->base.dest.is_global) { - rdma_ah_set_grh(&attr->ah_attr, NULL, - cmd->base.dest.flow_label, - cmd->base.dest.sgid_index, - cmd->base.dest.hop_limit, - cmd->base.dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->ah_attr, cmd->base.dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->ah_attr, 0); - } - rdma_ah_set_dlid(&attr->ah_attr, cmd->base.dest.dlid); - rdma_ah_set_sl(&attr->ah_attr, cmd->base.dest.sl); - rdma_ah_set_path_bits(&attr->ah_attr, cmd->base.dest.src_path_bits); - rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); - rdma_ah_set_port_num(&attr->ah_attr, - cmd->base.dest.port_num); + copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr, + &cmd->base.dest); if (cmd->base.attr_mask & IB_QP_ALT_PATH) - attr->alt_ah_attr.type = - rdma_ah_find_type(qp->device, cmd->base.dest.port_num); - if (cmd->base.alt_dest.is_global) { - rdma_ah_set_grh(&attr->alt_ah_attr, NULL, - cmd->base.alt_dest.flow_label, - cmd->base.alt_dest.sgid_index, - cmd->base.alt_dest.hop_limit, - cmd->base.alt_dest.traffic_class); - rdma_ah_set_dgid_raw(&attr->alt_ah_attr, - cmd->base.alt_dest.dgid); - } else { - rdma_ah_set_ah_flags(&attr->alt_ah_attr, 0); - } - - rdma_ah_set_dlid(&attr->alt_ah_attr, cmd->base.alt_dest.dlid); - rdma_ah_set_sl(&attr->alt_ah_attr, cmd->base.alt_dest.sl); - rdma_ah_set_path_bits(&attr->alt_ah_attr, - cmd->base.alt_dest.src_path_bits); - rdma_ah_set_static_rate(&attr->alt_ah_attr, - cmd->base.alt_dest.static_rate); - rdma_ah_set_port_num(&attr->alt_ah_attr, - cmd->base.alt_dest.port_num); + copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr, + &cmd->base.alt_dest); ret = ib_modify_qp_with_udata(qp, attr, modify_qp_mask(qp->qp_type, @@ -2037,7 +2043,8 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, return -EOPNOTSUPP; INIT_UDATA(&udata, buf + sizeof(cmd.base), NULL, - in_len - sizeof(cmd.base), out_len); + in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr), + out_len); ret = modify_qp(file, &cmd, &udata); if (ret) @@ -2543,7 +2550,8 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); uobj = uobj_alloc(uobj_get_type(ah), file->ucontext); if (IS_ERR(uobj)) @@ -2556,6 +2564,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); rdma_ah_set_dlid(&attr, cmd.attr.dlid); rdma_ah_set_sl(&attr, cmd.attr.sl); rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); @@ -3472,6 +3481,9 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, if (IS_ERR(obj)) return PTR_ERR(obj); + if (cmd->srq_type == IB_SRQT_TM) + attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; + if (cmd->srq_type == IB_SRQT_XRC) { xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->xrcd_handle, file->ucontext); @@ -3488,10 +3500,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); + } - attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, - file->ucontext); - if (!attr.ext.xrc.cq) { + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } @@ -3526,10 +3540,13 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } @@ -3552,10 +3569,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, goto err_copy; } - if (cmd->srq_type == IB_SRQT_XRC) { + if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); - uobj_put_obj_read(attr.ext.xrc.cq); - } + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + uobj_put_obj_read(pd); uobj_alloc_commit(&obj->uevent.uobject); @@ -3568,8 +3587,8 @@ err_put: uobj_put_obj_read(pd); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - uobj_put_obj_read(attr.ext.xrc.cq); + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { @@ -3599,6 +3618,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; + memset(&xcmd, 0, sizeof(xcmd)); xcmd.response = cmd.response; xcmd.user_handle = cmd.user_handle; xcmd.srq_type = IB_SRQT_BASIC; @@ -3607,10 +3627,10 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, xcmd.max_sge = cmd.max_sge; xcmd.srq_limit = cmd.srq_limit; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3634,10 +3654,10 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), - out_len - sizeof resp); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) @@ -3848,6 +3868,16 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + goto end; + + resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; + resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; + resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; + resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; + resp.xrq_caps.flags = attr.xrq_caps.flags; + resp.response_length += sizeof(resp.xrq_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 000000000000..5286ad57d903 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/rdma_user_ioctl.h> +#include <rdma/uverbs_ioctl.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (uattr->reserved) + return -EINVAL; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + spec = &attr_spec_bucket->attrs[attr_id]; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_PTR_IN: + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < spec->len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + uattr->len > spec->len)) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return -EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. + */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device *ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; +#endif + + if (hdr->reserved) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EOPNOTSUPP; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EOPNOTSUPP; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + + if (!ctx) +#endif + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); +out: +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size > UVERBS_OPTIMIZE_USING_STACK_SZ) +#endif + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved) { + err = -EOPNOTSUPP; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 000000000000..76ddb6564578 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> +#include <linux/bitops.h> +#include "uverbs.h" + +#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) + +#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) + +/* + * Iterate all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. @elem points the current element in the hashes[tmpi] + * bucket we are looping on. To be honest, @hashes representation isn't exactly + * a hash, but more a collection of elements. These elements' ids are treated + * in a hash like manner, where the first upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) + +#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) + +#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) + +#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) + +#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) + +/* + * get_elements_above_id get a few hashes represented by @elements and + * @num_elements. The hashes fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * returns an array of @iters, represents an array of elements in the hashes + * buckets, which their ids are the smallest ids in all hashes but are all + * larger than the id given by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function. @min_id is also updated to + * reflect the new min_id of all elements in iters. + */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. + * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. + */ + for (i = num_iters - 1; i >= 0 && + GET_ID(*(u16 *)(iters[i] + id_offset)) == min; i--) + ; + + num_iters -= i + 1; + memmove(iters, iters + i + 1, sizeof(*iters) * num_iters); + + *min_id = min; + return num_iters; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a few elements. Each element is described by an id + * which its upper bits represents a namespace. It finds the max namespace. This + * could be used in order to know how many buckets do we need to allocate. If no + * elements exist, SHRT_MIN is returned. Namespace represents here different + * buckets. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id which its upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements do we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. + */ + +#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) + +#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) + +#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) + +static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} + +#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) + +/* + * This function gets array of size @num_method_defs which contains pointers to + * method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs. + */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { + size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. If required, it's better to + * just add a new attribute to capture the new + * semantics. + */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); + + attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy aceess but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } + + if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; + +free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} + +static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} + +/* + * This function gets array of size @num_object_defs which contains pointers to + * object definitions @object_defs. The function allocated an + * uverbs_object_spec structure and initialize its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * sorts out the correct relationship between conflicts in the same method. + */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, + object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_object_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree. Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5e530d2bee44..dc2aed6fb21b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -49,6 +49,7 @@ #include <linux/uaccess.h> #include <rdma/ib.h> +#include <rdma/uverbs_std_types.h> #include "uverbs.h" #include "core_priv.h" @@ -595,7 +596,6 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file { struct ib_uverbs_async_event_file *ev_file; struct file *filp; - int ret; ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) @@ -621,21 +621,11 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, ib_dev, ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - + ib_register_event_handler(&uverbs_file->event_handler); /* At that point async file stuff was fully set */ return filp; -err_put_file: - fput(filp); - kref_put(&uverbs_file->async_file->ref, - ib_uverbs_release_async_event_file); - uverbs_file->async_file = NULL; - return ERR_PTR(ret); - err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); @@ -949,6 +939,9 @@ static const struct file_operations uverbs_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static const struct file_operations uverbs_mmap_fops = { @@ -958,6 +951,9 @@ static const struct file_operations uverbs_mmap_fops = { .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_USER_ACCESS) + .unlocked_ioctl = ib_uverbs_ioctl, +#endif }; static struct ib_client uverbs_client = { @@ -1108,6 +1104,18 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1239,6 +1247,11 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + kobject_put(&uverbs_dev->kobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989c9060..bd0acf376af0 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include <linux/export.h> #include <rdma/ib_marshall.h> -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index ef293379f37a..0a98579700ec 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,67 +209,244 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { - .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), - .context_closed = uverbs_hot_unplug_completion_event_file, - .fops = &uverbs_event_fops, - .name = "[infinibandevent]", - .flags = O_RDONLY, -}; +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. Every verb that could get driver specific data should get this + * spec. + */ +static const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); +static const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); -const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), - .destroy_object = uverbs_free_cq, -}; +static void create_udata(struct uverbs_attr_bundle *ctx, + struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. + */ + void __user *inbuf; + size_t inbuf_len = 0; + void __user *outbuf; + size_t outbuf_len = 0; + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + inbuf = uhw_in->ptr_attr.ptr; + inbuf_len = uhw_in->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), - .destroy_object = uverbs_free_qp, -}; + if (!IS_ERR(uhw_out)) { + outbuf = uhw_out->ptr_attr.ptr; + outbuf_len = uhw_out->ptr_attr.len; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_mw, -}; + INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); +} -const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { - /* 1 is used in order to free the MR after all the MWs */ - .type = UVERBS_TYPE_ALLOC_IDR(1), - .destroy_object = uverbs_free_mr, -}; +static int uverbs_create_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + if (ret) + return ret; -const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), - .destroy_object = uverbs_free_srq, -}; + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + return -EFAULT; -const struct uverbs_obj_idr_type uverbs_type_attrs_ah = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_ah, -}; + ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; -const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_flow, -}; + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } -const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), - .destroy_object = uverbs_free_wq, -}; + if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_rwq_ind_tbl, -}; + obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } -const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), - .destroy_object = uverbs_free_xrcd, -}; + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = &ev_file->ev_queue; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + + ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); + if (ret) + goto err_cq; -const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { - /* 2 is used in order to free the PD after MRs */ - .type = UVERBS_TYPE_ALLOC_IDR(2), - .destroy_object = uverbs_free_pd, + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; }; + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, + &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp); +} + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, + &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, + UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), + &uverbs_method_cq_create, + &uverbs_method_cq_destroy); + +DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); + +DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + +DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &uverbs_object_device, + &uverbs_object_pd, + &uverbs_object_mr, + &uverbs_object_comp_channel, + &uverbs_object_cq, + &uverbs_object_qp, + &uverbs_object_ah, + &uverbs_object_mw, + &uverbs_object_srq, + &uverbs_object_flow, + &uverbs_object_wq, + &uverbs_object_rwq_ind_table, + &uverbs_object_xrcd); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b456e3ca1876..ee9e27dc799b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -180,39 +180,29 @@ EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { - switch (node_type) { - case RDMA_NODE_IB_CA: - case RDMA_NODE_IB_SWITCH: - case RDMA_NODE_IB_ROUTER: - return RDMA_TRANSPORT_IB; - case RDMA_NODE_RNIC: - return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_USNIC: + + if (node_type == RDMA_NODE_USNIC) return RDMA_TRANSPORT_USNIC; - case RDMA_NODE_USNIC_UDP: + if (node_type == RDMA_NODE_USNIC_UDP) return RDMA_TRANSPORT_USNIC_UDP; - default: - BUG(); - return 0; - } + if (node_type == RDMA_NODE_RNIC) + return RDMA_TRANSPORT_IWARP; + + return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) { + enum rdma_transport_type lt; if (device->get_link_layer) return device->get_link_layer(device, port_num); - switch (rdma_node_get_transport(device->node_type)) { - case RDMA_TRANSPORT_IB: + lt = rdma_node_get_transport(device->node_type); + if (lt == RDMA_TRANSPORT_IB) return IB_LINK_LAYER_INFINIBAND; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_USNIC: - case RDMA_TRANSPORT_USNIC_UDP: - return IB_LINK_LAYER_ETHERNET; - default: - return IB_LINK_LAYER_UNSPECIFIED; - } + + return IB_LINK_LAYER_ETHERNET; } EXPORT_SYMBOL(rdma_port_get_link_layer); @@ -478,6 +468,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, union ib_gid dgid; union ib_gid sgid; + might_sleep(); + memset(ah_attr, 0, sizeof *ah_attr); ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { @@ -632,11 +624,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -677,18 +671,18 @@ int ib_destroy_srq(struct ib_srq *srq) pd = srq->pd; srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { + if (srq_type == IB_SRQT_XRC) atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) atomic_dec(&cq->usecnt); - } } return ret; @@ -1244,6 +1238,18 @@ int ib_resolve_eth_dmac(struct ib_device *device, if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); + return 0; + } + if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { + if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { + __be32 addr = 0; + + memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); + ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); + } else { + ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, + (char *)ah_attr->roce.dmac); + } } else { union ib_gid sgid; struct ib_gid_attr sgid_attr; @@ -1306,6 +1312,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) @@ -1573,15 +1634,53 @@ EXPORT_SYMBOL(ib_dealloc_fmr); /* Multicast groups */ +static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_qp_attr attr = {}; + int num_eth_ports = 0; + int port; + + /* If QP state >= init, it is assigned to a port and we can check this + * port only. + */ + if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { + if (attr.qp_state >= IB_QPS_INIT) { + if (qp->device->get_link_layer(qp->device, attr.port_num) != + IB_LINK_LAYER_INFINIBAND) + return true; + goto lid_check; + } + } + + /* Can't get a quick answer, iterate over all ports */ + for (port = 0; port < qp->device->phys_port_cnt; port++) + if (qp->device->get_link_layer(qp->device, port) != + IB_LINK_LAYER_INFINIBAND) + num_eth_ports++; + + /* If we have at lease one Ethernet port, RoCE annex declares that + * multicast LID should be ignored. We can't tell at this step if the + * QP belongs to an IB or Ethernet port. + */ + if (num_eth_ports) + return true; + + /* If all the ports are IB, we can check according to IB spec. */ +lid_check: + return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)); +} + int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1597,9 +1696,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || - lid == be16_to_cpu(IB_LID_PERMISSIVE)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); |