diff options
Diffstat (limited to 'drivers/infiniband')
53 files changed, 574 insertions, 535 deletions
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b136d3acc5bd..0f58f46dbad7 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -699,13 +699,16 @@ EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; + int status; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { - memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct - rdma_dev_addr)); + if (!status) + memcpy(((struct resolve_cb_context *)context)->addr, + addr, sizeof(struct rdma_dev_addr)); + ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } @@ -743,6 +746,10 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, wait_for_completion(&ctx.comp); + ret = ctx.status; + if (ret) + return ret; + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c99525512b34..71c7c4c328ef 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -80,6 +80,8 @@ static struct ib_cm { __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; + /* Sync on cm change port state */ + spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -161,6 +163,8 @@ struct cm_port { struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; + struct list_head cm_priv_prim_list; + struct list_head cm_priv_altr_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; @@ -241,6 +245,12 @@ struct cm_id_private { u8 service_timeout; u8 target_ack_delay; + struct list_head prim_list; + struct list_head altr_list; + /* Indicates that the send port mad is registered and av is set */ + int prim_send_port_not_ready; + int altr_send_port_not_ready; + struct list_head work_list; atomic_t work_count; }; @@ -259,20 +269,47 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; + struct cm_av *av; + unsigned long flags, flags2; + int ret = 0; + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (!cm_id_priv->prim_send_port_not_ready) + av = &cm_id_priv->av; + else if (!cm_id_priv->altr_send_port_not_ready && + (cm_id_priv->alt_av.port)) + av = &cm_id_priv->alt_av; + else { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* Make sure the port haven't released the mad yet */ mad_agent = cm_id_priv->av.port->mad_agent; - ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); - if (IS_ERR(ah)) - return PTR_ERR(ah); + if (!mad_agent) { + pr_info("%s: not a valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } + ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto out; + } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - cm_id_priv->av.pkey_index, + av->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); - return PTR_ERR(m); + ret = PTR_ERR(m); + goto out; } /* Timeout set by caller if response is expected. */ @@ -282,7 +319,10 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; - return 0; + +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; } static int cm_alloc_response_msg(struct cm_port *port, @@ -352,7 +392,8 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; struct cm_port *port = NULL; @@ -387,7 +428,17 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) &av->ah_attr); av->timeout = path->packet_life_time + 1; - return 0; + spin_lock_irqsave(&cm.lock, flags); + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) @@ -677,6 +728,8 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device, spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->prim_list); + INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; @@ -892,6 +945,15 @@ retest: break; } + spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->altr_list) && + (!cm_id_priv->altr_send_port_not_ready)) + list_del(&cm_id_priv->altr_list); + if (!list_empty(&cm_id_priv->prim_list) && + (!cm_id_priv->prim_send_port_not_ready)) + list_del(&cm_id_priv->prim_list); + spin_unlock_irq(&cm.lock); + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); @@ -1192,12 +1254,13 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + cm_id_priv); if (ret) goto error1; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, - &cm_id_priv->alt_av); + &cm_id_priv->alt_av, cm_id_priv); if (ret) goto error1; } @@ -1653,7 +1716,8 @@ static int cm_req_handler(struct cm_work *work) dev_put(gid_attr.ndev); } work->path[0].gid_type = gid_attr.gid_type; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); } if (ret) { int err = ib_get_cached_gid(work->port->cm_dev->ib_device, @@ -1672,7 +1736,8 @@ static int cm_req_handler(struct cm_work *work) goto rejected; } if (req_msg->alt_local_lid) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, + cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, @@ -2727,7 +2792,8 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + cm_id_priv); if (ret) goto out; cm_id_priv->alt_av.timeout = @@ -2839,7 +2905,8 @@ static int cm_lap_handler(struct cm_work *work) cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); @@ -3031,7 +3098,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; @@ -3468,7 +3535,9 @@ out: static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; + struct cm_av tmp_av; unsigned long flags; + int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3477,7 +3546,14 @@ static int cm_migrate(struct ib_cm_id *cm_id) (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; + /* Swap address vector */ + tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; + cm_id_priv->alt_av = tmp_av; + /* Swap port send ready state */ + tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; + cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; + cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -3888,6 +3964,9 @@ static void cm_add_one(struct ib_device *ib_device) port->cm_dev = cm_dev; port->port_num = i; + INIT_LIST_HEAD(&port->cm_priv_prim_list); + INIT_LIST_HEAD(&port->cm_priv_altr_list); + ret = cm_create_port_fs(port); if (ret) goto error1; @@ -3945,6 +4024,8 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; @@ -3968,15 +4049,27 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); + /* Mark all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) + cm_id_priv->altr_send_port_not_ready = 1; + list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) + cm_id_priv->prim_send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); - ib_unregister_mad_agent(port->mad_agent); + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } @@ -3989,6 +4082,7 @@ static int __init ib_cm_init(void) INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 36bf50ebb187..22fcf284dd8b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -116,7 +116,7 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static int cma_pernet_id; +static unsigned int cma_pernet_id; struct cma_pernet { struct idr tcp_ps; @@ -1094,47 +1094,47 @@ static void cma_save_ib_info(struct sockaddr *src_addr, } } -static void cma_save_ip4_info(struct sockaddr *src_addr, - struct sockaddr *dst_addr, +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { - struct sockaddr_in *ip4; - if (src_addr) { - ip4 = (struct sockaddr_in *)src_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; - ip4->sin_port = local_port; + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; } if (dst_addr) { - ip4 = (struct sockaddr_in *)dst_addr; - ip4->sin_family = AF_INET; - ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; - ip4->sin_port = hdr->port; + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; } } -static void cma_save_ip6_info(struct sockaddr *src_addr, - struct sockaddr *dst_addr, +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { - struct sockaddr_in6 *ip6; - if (src_addr) { - ip6 = (struct sockaddr_in6 *)src_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->dst_addr.ip6; - ip6->sin6_port = local_port; + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; } if (dst_addr) { - ip6 = (struct sockaddr_in6 *)dst_addr; - ip6->sin6_family = AF_INET6; - ip6->sin6_addr = hdr->src_addr.ip6; - ip6->sin6_port = hdr->port; + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; } } @@ -1159,10 +1159,12 @@ static int cma_save_ip_info(struct sockaddr *src_addr, switch (cma_get_ip_ver(hdr)) { case 4: - cma_save_ip4_info(src_addr, dst_addr, hdr, port); + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: - cma_save_ip6_info(src_addr, dst_addr, hdr, port); + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; @@ -2436,6 +2438,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -2461,6 +2475,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; if (addr->dev_addr.bound_dev_if) { + unsigned long supported_gids; + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; @@ -2484,7 +2500,12 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->net = &init_net; route->path_rec->ifindex = ndev->ifindex; - route->path_rec->gid_type = id_priv->gid_type; + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + route->path_rec->gid_type = + cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); } if (!ndev) { ret = -ENODEV; diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 19d499dcab76..0c0bea091de8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -127,14 +127,7 @@ void ib_cache_release_one(struct ib_device *device); static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) { - struct net_device *_upper = NULL; - struct list_head *iter; - - netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) - if (_upper == upper) - break; - - return _upper == upper; + return netdev_has_upper_dev_all_rcu(dev, upper); } int addr_init(void); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 06556c34606d..3a64a0881882 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -437,6 +437,28 @@ static void callback_for_addr_gid_device_scan(struct ib_device *device, &parsed->gid_attr); } +struct upper_list { + struct list_head list; + struct net_device *upper; +}; + +static int netdev_upper_walk(struct net_device *upper, void *data) +{ + struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct list_head *upper_list = data; + + if (!entry) { + pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n"); + return 0; + } + + list_add_tail(&entry->list, upper_list); + dev_hold(upper); + entry->upper = upper; + + return 0; +} + static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, @@ -444,30 +466,12 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, struct net_device *ndev)) { struct net_device *ndev = (struct net_device *)cookie; - struct upper_list { - struct list_head list; - struct net_device *upper; - }; - struct net_device *upper; - struct list_head *iter; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); rcu_read_lock(); - netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) { - struct upper_list *entry = kmalloc(sizeof(*entry), - GFP_ATOMIC); - - if (!entry) { - pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n"); - continue; - } - - list_add_tail(&entry->list, &upper_list); - dev_hold(upper); - entry->upper = upper; - } + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c68746ce6624..84b4eff90395 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, unsigned long dma_attrs = 0; struct scatterlist *sg, *sg_list_start; int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -174,7 +175,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - if (npages == 0) { + if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; goto out; } @@ -183,6 +184,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret) goto out; + if (!umem->writable) + gup_flags |= FOLL_FORCE; + need_release = 1; sg_list_start = umem->sg_head.sgl; @@ -190,7 +194,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - 1, !umem->writable, page_list, vma_list); + gup_flags, page_list, vma_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 0012fa58c105..44b1104eb168 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -262,12 +262,9 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - if (qp != qp->real_qp) { - ib_close_qp(qp); - } else { + if (qp == qp->real_qp) ib_uverbs_detach_umcast(qp, uqp); - ib_destroy_qp(qp); - } + ib_destroy_qp(qp); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 867b8cf82be8..19c6477af19f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -666,18 +666,6 @@ skip_cqe: return ret; } -static void invalidate_mr(struct c4iw_dev *rhp, u32 rkey) -{ - struct c4iw_mr *mhp; - unsigned long flags; - - spin_lock_irqsave(&rhp->lock, flags); - mhp = get_mhp(rhp, rkey >> 8); - if (mhp) - mhp->attr.state = 0; - spin_unlock_irqrestore(&rhp->lock, flags); -} - /* * Get one cq entry from c4iw and map it to openib. * @@ -733,7 +721,7 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); wc->wc_flags |= IB_WC_WITH_INVALIDATE; - invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); + c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); } } else { switch (CQE_OPCODE(&cqe)) { @@ -762,7 +750,8 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) /* Invalidate the MR if the fastreg failed */ if (CQE_STATUS(&cqe) != T4_ERR_SUCCESS) - invalidate_mr(qhp->rhp, CQE_WRID_FR_STAG(&cqe)); + c4iw_invalidate_mr(qhp->rhp, + CQE_WRID_FR_STAG(&cqe)); break; default: printk(KERN_ERR MOD "Unexpected opcode %d " diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 93e3d270a98a..4e5baf4fe15e 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1481,6 +1481,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) static struct cxgb4_uld_info c4iw_uld_info = { .name = DRV_NAME, .nrxq = MAX_ULD_QSETS, + .ntxq = MAX_ULD_QSETS, .rxq_size = 511, .ciq = true, .lro = false, diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 7e7f79e55006..4788e1a46fde 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -999,6 +999,6 @@ extern int db_coalescing_threshold; extern int use_dsgl; void c4iw_drain_rq(struct ib_qp *qp); void c4iw_drain_sq(struct ib_qp *qp); - +void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey); #endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 80e27749420a..410408f886c1 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -770,3 +770,15 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr) kfree(mhp); return 0; } + +void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey) +{ + struct c4iw_mr *mhp; + unsigned long flags; + + spin_lock_irqsave(&rhp->lock, flags); + mhp = get_mhp(rhp, rkey >> 8); + if (mhp) + mhp->attr.state = 0; + spin_unlock_irqrestore(&rhp->lock, flags); +} diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index f57deba6717c..b7ac97b27c88 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -706,12 +706,8 @@ static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, return 0; } -static int build_inv_stag(struct c4iw_dev *dev, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) +static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { - struct c4iw_mr *mhp = get_mhp(dev, wr->ex.invalidate_rkey >> 8); - - mhp->attr.state = 0; wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); @@ -797,11 +793,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; return -EINVAL; } num_wrs = t4_sq_avail(&qhp->wq); if (num_wrs == 0) { spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; return -ENOMEM; } while (wr) { @@ -840,10 +838,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_RDMA_READ_WITH_INV: fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; - if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { + c4iw_invalidate_mr(qhp->rhp, + wr->sg_list[0].lkey); fw_flags = FW_RI_RDMA_READ_INVALIDATE; - else + } else { fw_flags = 0; + } err = build_rdma_read(wqe, wr, &len16); if (err) break; @@ -876,7 +877,8 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_flags |= FW_RI_LOCAL_FENCE_FLAG; fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; - err = build_inv_stag(qhp->rhp, wqe, wr, &len16); + err = build_inv_stag(wqe, wr, &len16); + c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); break; default: PDBG("%s post of type=%d TBD!\n", __func__, @@ -934,11 +936,13 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; return -EINVAL; } num_wrs = t4_rq_avail(&qhp->wq); if (num_wrs == 0) { spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; return -ENOMEM; } while (wr) { diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index a26a9a0bfc41..67ea85a56945 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -775,75 +775,3 @@ void hfi1_put_proc_affinity(int cpu) } mutex_unlock(&affinity->lock); } - -int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, - size_t count) -{ - struct hfi1_affinity_node *entry; - cpumask_var_t mask; - int ret, i; - - mutex_lock(&node_affinity.lock); - entry = node_affinity_lookup(dd->node); - - if (!entry) { - ret = -EINVAL; - goto unlock; - } - - ret = zalloc_cpumask_var(&mask, GFP_KERNEL); - if (!ret) { - ret = -ENOMEM; - goto unlock; - } - - ret = cpulist_parse(buf, mask); - if (ret) - goto out; - - if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) { - dd_dev_warn(dd, "Invalid CPU mask\n"); - ret = -EINVAL; - goto out; - } - - /* reset the SDMA interrupt affinity details */ - init_cpu_mask_set(&entry->def_intr); - cpumask_copy(&entry->def_intr.mask, mask); - - /* Reassign the affinity for each SDMA interrupt. */ - for (i = 0; i < dd->num_msix_entries; i++) { - struct hfi1_msix_entry *msix; - - msix = &dd->msix_entries[i]; - if (msix->type != IRQ_SDMA) - continue; - - ret = get_irq_affinity(dd, msix); - - if (ret) - break; - } -out: - free_cpumask_var(mask); -unlock: - mutex_unlock(&node_affinity.lock); - return ret ? ret : strnlen(buf, PAGE_SIZE); -} - -int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf) -{ - struct hfi1_affinity_node *entry; - - mutex_lock(&node_affinity.lock); - entry = node_affinity_lookup(dd->node); - - if (!entry) { - mutex_unlock(&node_affinity.lock); - return -EINVAL; - } - - cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask); - mutex_unlock(&node_affinity.lock); - return strnlen(buf, PAGE_SIZE); -} diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index b89ea3c0ee1a..42e63316afd1 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -102,10 +102,6 @@ int hfi1_get_proc_affinity(int); /* Release a CPU used by a user process. */ void hfi1_put_proc_affinity(int); -int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf); -int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, - size_t count); - struct hfi1_affinity_node { int node; struct cpu_mask_set def_intr; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 9bf5f23544d4..24d0820873cf 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6301,19 +6301,8 @@ void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf) /* leave shared count at zero for both global and VL15 */ write_global_credit(dd, vau, vl15buf, 0); - /* We may need some credits for another VL when sending packets - * with the snoop interface. Dividing it down the middle for VL15 - * and VL0 should suffice. - */ - if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) { - write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1) - << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); - write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1) - << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT); - } else { - write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf - << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); - } + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); } /* @@ -9915,9 +9904,6 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) u32 mask = ~((1U << ppd->lmc) - 1); u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); - if (dd->hfi1_snoop.mode_flag) - dd_dev_info(dd, "Set lid/lmc while snooping"); - c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) @@ -12112,7 +12098,7 @@ static void update_synth_timer(unsigned long opaque) mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); } -#define C_MAX_NAME 13 /* 12 chars + one for /0 */ +#define C_MAX_NAME 16 /* 15 chars + one for /0 */ static int init_cntrs(struct hfi1_devdata *dd) { int i, rcv_ctxts, j; @@ -14463,7 +14449,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, * Any error printing is already done by the init code. * On return, we have the chip mapped. */ - ret = hfi1_pcie_ddinit(dd, pdev, ent); + ret = hfi1_pcie_ddinit(dd, pdev); if (ret < 0) goto bail_free; @@ -14691,6 +14677,11 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_free_cntrs; + init_completion(&dd->user_comp); + + /* The user refcount starts with one to inidicate an active device */ + atomic_set(&dd->user_refcount, 1); + goto bail; bail_free_rcverr: diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 92345259a8f4..043fd21dc5f3 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -320,6 +320,9 @@ /* DC_DC8051_CFG_MODE.GENERAL bits */ #define DISABLE_SELF_GUID_CHECK 0x2 +/* Bad L2 frame error code */ +#define BAD_L2_ERR 0x6 + /* * Eager buffer minimum and maximum sizes supported by the hardware. * All power-of-two sizes in between are supported as well. diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 6563e4d38b80..c5efff29c147 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -599,7 +599,6 @@ static void __prescan_rxq(struct hfi1_packet *packet) dd->rhf_offset; struct rvt_qp *qp; struct ib_header *hdr; - struct ib_other_headers *ohdr; struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn, bth1; @@ -615,18 +614,21 @@ static void __prescan_rxq(struct hfi1_packet *packet) if (etype != RHF_RCV_TYPE_IB) goto next; - hdr = hfi1_get_msgheader(dd, rhf_addr); + packet->hdr = hfi1_get_msgheader(dd, rhf_addr); + hdr = packet->hdr; lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &hdr->u.l.oth; + packet->rcv_flags |= HFI1_HAS_GRH; + } else { goto next; /* just in case */ + } - bth1 = be32_to_cpu(ohdr->bth[1]); + bth1 = be32_to_cpu(packet->ohdr->bth[1]); is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK)); if (!is_ecn) @@ -646,7 +648,7 @@ static void __prescan_rxq(struct hfi1_packet *packet) /* turn off BECN, FECN */ bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK); - ohdr->bth[1] = cpu_to_be32(bth1); + packet->ohdr->bth[1] = cpu_to_be32(bth1); next: update_ps_mdata(&mdata, rcd); } @@ -1360,12 +1362,25 @@ int process_receive_ib(struct hfi1_packet *packet) int process_receive_bypass(struct hfi1_packet *packet) { + struct hfi1_devdata *dd = packet->rcd->dd; + if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); - dd_dev_err(packet->rcd->dd, + dd_dev_err(dd, "Bypass packets are not supported in normal operation. Dropping\n"); - incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); + } return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 677efa0e8cd6..bd786b7bd30b 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -172,6 +172,9 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) struct hfi1_devdata, user_cdev); + if (!atomic_inc_not_zero(&dd->user_refcount)) + return -ENXIO; + /* Just take a ref now. Not all opens result in a context assign */ kobject_get(&dd->kobj); @@ -183,11 +186,17 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) fd->rec_cpu_num = -1; /* no cpu affinity by default */ fd->mm = current->mm; atomic_inc(&fd->mm->mm_count); - } + fp->private_data = fd; + } else { + fp->private_data = NULL; + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); - fp->private_data = fd; + return -ENOMEM; + } - return fd ? 0 : -ENOMEM; + return 0; } static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, @@ -798,6 +807,10 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) done: mmdrop(fdata->mm); kobject_put(&dd->kobj); + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + kfree(fdata); return 0; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 7eef11b316ff..cc87fd4e534b 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -367,26 +367,6 @@ struct hfi1_packet { u8 etype; }; -/* - * Private data for snoop/capture support. - */ -struct hfi1_snoop_data { - int mode_flag; - struct cdev cdev; - struct device *class_dev; - /* protect snoop data */ - spinlock_t snoop_lock; - struct list_head queue; - wait_queue_head_t waitq; - void *filter_value; - int (*filter_callback)(void *hdr, void *data, void *value); - u64 dcc_cfg; /* saved value of DCC Cfg register */ -}; - -/* snoop mode_flag values */ -#define HFI1_PORT_SNOOP_MODE 1U -#define HFI1_PORT_CAPTURE_MODE 2U - struct rvt_sge_state; /* @@ -613,8 +593,6 @@ struct hfi1_pportdata { struct mutex hls_lock; u32 host_link_state; - spinlock_t sdma_alllock ____cacheline_aligned_in_smp; - u32 lstate; /* logical link state */ /* these are the "32 bit" regs */ @@ -1104,8 +1082,6 @@ struct hfi1_devdata { char *portcntrnames; size_t portcntrnameslen; - struct hfi1_snoop_data hfi1_snoop; - struct err_info_rcvport err_info_rcvport; struct err_info_constraint err_info_rcv_constraint; struct err_info_constraint err_info_xmit_constraint; @@ -1141,8 +1117,8 @@ struct hfi1_devdata { rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; /* - * Handlers for outgoing data so that snoop/capture does not - * have to have its hooks in the send path + * Capability to have different send engines simply by changing a + * pointer value. */ send_routine process_pio_send; send_routine process_dma_send; @@ -1174,6 +1150,10 @@ struct hfi1_devdata { spinlock_t aspm_lock; /* Number of verbs contexts which have disabled ASPM */ atomic_t aspm_disabled_cnt; + /* Keeps track of user space clients */ + atomic_t user_refcount; + /* Used to wait for outstanding user space clients before dev removal */ + struct completion user_comp; struct hfi1_affinity *affinity; struct rhashtable sdma_rht; @@ -1221,8 +1201,6 @@ struct hfi1_devdata *hfi1_lookup(int unit); extern u32 hfi1_cpulist_count; extern unsigned long *hfi1_cpulist; -extern unsigned int snoop_drop_send; -extern unsigned int snoop_force_capture; int hfi1_init(struct hfi1_devdata *, int); int hfi1_count_units(int *npresentp, int *nupp); int hfi1_count_active_units(void); @@ -1557,13 +1535,6 @@ void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf); void reset_link_credits(struct hfi1_devdata *dd); void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); -int snoop_recv_handler(struct hfi1_packet *packet); -int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc); -int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc); -void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, - u64 pbc, const void *from, size_t count); int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) @@ -1763,8 +1734,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *); void hfi1_pcie_cleanup(struct pci_dev *); -int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *, - const struct pci_device_id *); +int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *); void hfi1_pcie_ddcleanup(struct hfi1_devdata *); void hfi1_pcie_flr(struct hfi1_devdata *); int pcie_speeds(struct hfi1_devdata *); @@ -1799,8 +1769,6 @@ int kdeth_process_expected(struct hfi1_packet *packet); int kdeth_process_eager(struct hfi1_packet *packet); int process_receive_invalid(struct hfi1_packet *packet); -extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8]; - void update_sge(struct rvt_sge_state *ss, u32 length); /* global module parameter variables */ @@ -1827,9 +1795,6 @@ extern struct mutex hfi1_mutex; #define DRIVER_NAME "hfi1" #define HFI1_USER_MINOR_BASE 0 #define HFI1_TRACE_MINOR 127 -#define HFI1_DIAGPKT_MINOR 128 -#define HFI1_DIAG_MINOR_BASE 129 -#define HFI1_SNOOP_CAPTURE_BASE 200 #define HFI1_NMINORS 255 #define PCI_VENDOR_ID_INTEL 0x8086 @@ -1848,7 +1813,13 @@ extern struct mutex hfi1_mutex; static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, u16 ctxt_type) { - u64 base_sc_integrity = + u64 base_sc_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sc_integrity = SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK @@ -1863,7 +1834,6 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK - | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; @@ -1872,18 +1842,23 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, else base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; - if (is_ax(dd)) - /* turn off send-side job key checks - A0 */ - return base_sc_integrity & - ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sc_integrity |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sc_integrity; } static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) { - u64 base_sdma_integrity = + u64 base_sdma_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sdma_integrity = SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK - | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK @@ -1895,14 +1870,18 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK - | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; - if (is_ax(dd)) - /* turn off send-side job key checks - A0 */ - return base_sdma_integrity & - ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + if (!HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sdma_integrity; } diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 60db61536fed..e3b5bc93bc70 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -144,6 +144,8 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd; ppd = dd->pport + (i % dd->num_pports); + + /* dd->rcd[i] gets assigned inside the callee */ rcd = hfi1_create_ctxtdata(ppd, i, dd->node); if (!rcd) { dd_dev_err(dd, @@ -169,8 +171,6 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) if (!rcd->sc) { dd_dev_err(dd, "Unable to allocate kernel send context, failing\n"); - dd->rcd[rcd->ctxt] = NULL; - hfi1_free_ctxtdata(dd, rcd); goto nomem; } @@ -178,9 +178,6 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) if (ret < 0) { dd_dev_err(dd, "Failed to setup kernel receive context, failing\n"); - sc_free(rcd->sc); - dd->rcd[rcd->ctxt] = NULL; - hfi1_free_ctxtdata(dd, rcd); ret = -EFAULT; goto bail; } @@ -196,6 +193,10 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) nomem: ret = -ENOMEM; bail: + if (dd->rcd) { + for (i = 0; i < dd->num_rcv_contexts; ++i) + hfi1_free_ctxtdata(dd, dd->rcd[i]); + } kfree(dd->rcd); dd->rcd = NULL; return ret; @@ -216,7 +217,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, dd->num_rcv_contexts - dd->first_user_ctxt) kctxt_ngroups = (dd->rcv_entries.nctxt_extra - (dd->num_rcv_contexts - dd->first_user_ctxt)); - rcd = kzalloc(sizeof(*rcd), GFP_KERNEL); + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); if (rcd) { u32 rcvtids, max_entries; @@ -261,13 +262,6 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, } rcd->eager_base = base * dd->rcv_entries.group_size; - /* Validate and initialize Rcv Hdr Q variables */ - if (rcvhdrcnt % HDRQ_INCREMENT) { - dd_dev_err(dd, - "ctxt%u: header queue count %d must be divisible by %lu\n", - rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT); - goto bail; - } rcd->rcvhdrq_cnt = rcvhdrcnt; rcd->rcvhdrqentsize = hfi1_hdrq_entsize; /* @@ -506,7 +500,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); mutex_init(&ppd->hls_lock); - spin_lock_init(&ppd->sdma_alllock); spin_lock_init(&ppd->qsfp_info.qsfp_lock); ppd->qsfp_info.ppd = ppd; @@ -1399,28 +1392,43 @@ static void postinit_cleanup(struct hfi1_devdata *dd) hfi1_free_devdata(dd); } +static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) +{ + if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { + hfi1_early_err(dev, "Receive header queue count too small\n"); + return -EINVAL; + } + + if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { + hfi1_early_err(dev, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); + return -EINVAL; + } + + if (thecnt % HDRQ_INCREMENT) { + hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); + return -EINVAL; + } + + return 0; +} + static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret = 0, j, pidx, initfail; - struct hfi1_devdata *dd = ERR_PTR(-EINVAL); + struct hfi1_devdata *dd; struct hfi1_pportdata *ppd; /* First, lock the non-writable module parameters */ HFI1_CAP_LOCK(); /* Validate some global module parameters */ - if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { - hfi1_early_err(&pdev->dev, "Header queue count too small\n"); - ret = -EINVAL; - goto bail; - } - if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { - hfi1_early_err(&pdev->dev, - "Receive header queue count cannot be greater than %u\n", - HFI1_MAX_HDRQ_EGRBUF_CNT); - ret = -EINVAL; + ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + if (ret) goto bail; - } + /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", @@ -1461,26 +1469,25 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) goto bail; - /* - * Do device-specific initialization, function table setup, dd - * allocation, etc. - */ - switch (ent->device) { - case PCI_DEVICE_ID_INTEL0: - case PCI_DEVICE_ID_INTEL1: - dd = hfi1_init_dd(pdev, ent); - break; - default: + if (!(ent->device == PCI_DEVICE_ID_INTEL0 || + ent->device == PCI_DEVICE_ID_INTEL1)) { hfi1_early_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", ent->device); ret = -ENODEV; + goto clean_bail; } - if (IS_ERR(dd)) + /* + * Do device-specific initialization, function table setup, dd + * allocation, etc. + */ + dd = hfi1_init_dd(pdev, ent); + + if (IS_ERR(dd)) { ret = PTR_ERR(dd); - if (ret) goto clean_bail; /* error already printed */ + } ret = create_workqueues(dd); if (ret) @@ -1538,12 +1545,31 @@ bail: return ret; } +static void wait_for_clients(struct hfi1_devdata *dd) +{ + /* + * Remove the device init value and complete the device if there is + * no clients or wait for active clients to finish. + */ + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + wait_for_completion(&dd->user_comp); +} + static void remove_one(struct pci_dev *pdev) { struct hfi1_devdata *dd = pci_get_drvdata(pdev); /* close debugfs files before ib unregister */ hfi1_dbg_ibdev_exit(&dd->verbs_dev); + + /* remove the /dev hfi1 interface */ + hfi1_device_remove(dd); + + /* wait for existing user space clients to finish */ + wait_for_clients(dd); + /* unregister from IB core */ hfi1_unregister_ib_device(dd); @@ -1558,8 +1584,6 @@ static void remove_one(struct pci_dev *pdev) /* wait until all of our (qsfp) queue_work() calls complete */ flush_workqueue(ib_wq); - hfi1_device_remove(dd); - postinit_cleanup(dd); } diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 89c68da1c273..4ac8f330c5cb 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -157,8 +157,7 @@ void hfi1_pcie_cleanup(struct pci_dev *pdev) * fields required to re-initialize after a chip reset, or for * various other purposes */ -int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev, - const struct pci_device_id *ent) +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) { unsigned long len; resource_size_t addr; diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 50a3a36d9363..d89b8745d4c1 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -668,19 +668,12 @@ void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) void set_pio_integrity(struct send_context *sc) { struct hfi1_devdata *dd = sc->dd; - u64 reg = 0; u32 hw_context = sc->hw_context; int type = sc->type; - /* - * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if - * we're snooping. - */ - if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) && - dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE) - reg = hfi1_pkt_default_send_ctxt_mask(dd, type); - - write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg); + write_kctxt_csr(dd, hw_context, + SC(CHECK_ENABLE), + hfi1_pkt_default_send_ctxt_mask(dd, type)); } static u32 get_buffers_allocated(struct send_context *sc) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 8bc5013f39a1..83198a8a8797 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -89,7 +89,7 @@ void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_WAIT_RNR; - qp->s_timer.expires = jiffies + usecs_to_jiffies(to); + priv->s_rnr_timer.expires = jiffies + usecs_to_jiffies(to); add_timer(&priv->s_rnr_timer); } diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index fd39bcaa062d..9cbe52d21077 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -2009,11 +2009,6 @@ static void sdma_hw_start_up(struct sdma_engine *sde) write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg); } -#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ -(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) - -#define SET_STATIC_RATE_CONTROL_SMASK(r) \ -(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) /* * set_sdma_integrity * @@ -2022,19 +2017,9 @@ static void sdma_hw_start_up(struct sdma_engine *sde) static void set_sdma_integrity(struct sdma_engine *sde) { struct hfi1_devdata *dd = sde->dd; - u64 reg; - - if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY))) - return; - - reg = hfi1_pkt_base_sdma_integrity(dd); - - if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) - CLEAR_STATIC_RATE_CONTROL_SMASK(reg); - else - SET_STATIC_RATE_CONTROL_SMASK(reg); - write_sde_csr(sde, SD(CHECK_ENABLE), reg); + write_sde_csr(sde, SD(CHECK_ENABLE), + hfi1_pkt_base_sdma_integrity(dd)); } static void init_sdma_regs( diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index edba22461a9c..919a5474e651 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -49,7 +49,6 @@ #include "hfi.h" #include "mad.h" #include "trace.h" -#include "affinity.h" /* * Start of per-port congestion control structures and support code @@ -623,27 +622,6 @@ static ssize_t show_tempsense(struct device *device, return ret; } -static ssize_t show_sdma_affinity(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); - struct hfi1_devdata *dd = dd_from_dev(dev); - - return hfi1_get_sdma_affinity(dd, buf); -} - -static ssize_t store_sdma_affinity(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); - struct hfi1_devdata *dd = dd_from_dev(dev); - - return hfi1_set_sdma_affinity(dd, buf, count); -} - /* * end of per-unit (or driver, in some cases, but replicated * per unit) functions @@ -658,8 +636,6 @@ static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); -static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity, - store_sdma_affinity); static struct device_attribute *hfi1_attributes[] = { &dev_attr_hw_rev, @@ -670,7 +646,6 @@ static struct device_attribute *hfi1_attributes[] = { &dev_attr_boardversion, &dev_attr_tempsense, &dev_attr_chip_reset, - &dev_attr_sdma_affinity, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index 11e02b228922..f77e59fb43fe 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -253,66 +253,6 @@ TRACE_EVENT(hfi1_mmu_invalidate, ) ); -#define SNOOP_PRN \ - "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ - "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" - -TRACE_EVENT(snoop_capture, - TP_PROTO(struct hfi1_devdata *dd, - int hdr_len, - struct ib_header *hdr, - int data_len, - void *data), - TP_ARGS(dd, hdr_len, hdr, data_len, data), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, slid) - __field(u16, dlid) - __field(u32, qpn) - __field(u8, opcode) - __field(u8, sl) - __field(u16, pkey) - __field(u32, hdr_len) - __field(u32, data_len) - __field(u8, lnh) - __dynamic_array(u8, raw_hdr, hdr_len) - __dynamic_array(u8, raw_pkt, data_len) - ), - TP_fast_assign( - struct ib_other_headers *ohdr; - - __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - DD_DEV_ASSIGN(dd); - __entry->slid = be16_to_cpu(hdr->lrh[3]); - __entry->dlid = be16_to_cpu(hdr->lrh[1]); - __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->hdr_len = hdr_len; - __entry->data_len = data_len; - memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); - memcpy(__get_dynamic_array(raw_pkt), data, data_len); - ), - TP_printk( - "[%s] " SNOOP_PRN, - __get_str(dev), - __entry->slid, - __entry->dlid, - __entry->qpn, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->sl, - __entry->pkey, - __entry->hdr_len, - __entry->data_len - ) -); - #endif /* __HFI1_TRACE_RX_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index a761f804111e..77697d690f3e 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1144,7 +1144,7 @@ static int pin_vector_pages(struct user_sdma_request *req, rb_node = hfi1_mmu_rb_extract(pq->handler, (unsigned long)iovec->iov.iov_base, iovec->iov.iov_len); - if (rb_node && !IS_ERR(rb_node)) + if (rb_node) node = container_of(rb_node, struct sdma_mmu_node, rb); else rb_node = NULL; diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 5fc623362731..b9bf0759f10a 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -102,7 +102,10 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr if (vlan_tag < 0x1000) vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); - ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index); + ret = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index); + if (ret < 0) + return ERR_PTR(ret); + ah->av.eth.gid_index = ret; ah->av.eth.vlan = cpu_to_be16(vlan_tag); ah->av.eth.hop_limit = ah_attr->grh.hop_limit; if (ah_attr->static_rate) { diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 1ea686b9e0f9..6a0fec357dae 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -253,11 +253,14 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { err = -EFAULT; - goto err_dbmap; + goto err_cq_free; } return &cq->ibcq; +err_cq_free: + mlx4_cq_free(dev->dev, &cq->mcq); + err_dbmap: if (context) mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 79d017baf6f4..fcd04b881ec1 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -932,8 +932,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (err) goto err_create; } else { - /* for now choose 64 bytes till we have a proper interface */ - cqe_size = 64; + cqe_size = cache_line_size() == 128 ? 128 : 64; err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index, &inlen); if (err) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 22174774dbb8..2be65ddf56ba 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1019,7 +1019,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); - resp.cache_line_size = L1_CACHE_BYTES; + resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); @@ -1771,13 +1771,13 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) mutex_lock(&dev->flow_db.lock); list_for_each_entry_safe(iter, tmp, &handler->list, list) { - mlx5_del_flow_rule(iter->rule); + mlx5_del_flow_rules(iter->rule); put_flow_table(dev, iter->prio, true); list_del(&iter->list); kfree(iter); } - mlx5_del_flow_rule(handler->rule); + mlx5_del_flow_rules(handler->rule); put_flow_table(dev, handler->prio, true); mutex_unlock(&dev->flow_db.lock); @@ -1857,7 +1857,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = mlx5_create_auto_grouped_flow_table(ns, priority, num_entries, num_groups, - 0); + 0, 0); if (!IS_ERR(ft)) { prio->refcount = 0; @@ -1877,10 +1877,10 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_spec *spec; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; - u32 action; int err = 0; if (!is_valid_attr(flow_attr)) @@ -1905,12 +1905,12 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); - action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; - handler->rule = mlx5_add_flow_rule(ft, spec, - action, - MLX5_FS_DEFAULT_FLOW_TAG, - dst); + flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, + dst, 1); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); @@ -1941,7 +1941,7 @@ static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *de handler_dst = create_flow_rule(dev, ft_prio, flow_attr, dst); if (IS_ERR(handler_dst)) { - mlx5_del_flow_rule(handler->rule); + mlx5_del_flow_rules(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_dst; @@ -2004,7 +2004,7 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de &leftovers_specs[LEFTOVERS_UC].flow_attr, dst); if (IS_ERR(handler_ucast)) { - mlx5_del_flow_rule(handler->rule); + mlx5_del_flow_rules(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_ucast; @@ -2046,7 +2046,7 @@ static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, return handler_rx; err_tx: - mlx5_del_flow_rule(handler_rx->rule); + mlx5_del_flow_rules(handler_rx->rule); ft_rx->refcount--; kfree(handler_rx); err: @@ -2311,14 +2311,14 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; - + bool fatal = false; u8 port = 0; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: - ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); + fatal = true; break; case MLX5_DEV_EVENT_PORT_UP: @@ -2358,6 +2358,8 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; + default: + return; } ibev.device = &ibdev->ib_dev; @@ -2370,6 +2372,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (ibdev->ib_active) ib_dispatch_event(&ibev); + + if (fatal) + ibdev->ib_active = false; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) @@ -3115,7 +3120,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) } err = init_node_data(dev); if (err) - goto err_dealloc; + goto err_free_port; mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); @@ -3125,7 +3130,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_roce(dev); if (err) - goto err_dealloc; + goto err_free_port; } err = create_dev_resources(&dev->devr); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index dcdcd195fe53..854748b61212 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -153,7 +153,7 @@ struct mlx5_ib_flow_handler { struct list_head list; struct ib_flow ibflow; struct mlx5_ib_flow_prio *prio; - struct mlx5_flow_rule *rule; + struct mlx5_flow_handle *rule; }; struct mlx5_ib_flow_db { @@ -626,6 +626,8 @@ struct mlx5_ib_dev { struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; struct timer_list delay_timer; + /* Prevents soft lock on massive reg MRs */ + struct mutex slow_path_mutex; int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index d4ad672b905b..4e9012463c37 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -610,6 +610,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) int err; int i; + mutex_init(&dev->slow_path_mutex); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -1182,9 +1183,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto error; } - if (!mr) + if (!mr) { + mutex_lock(&dev->slow_path_mutex); mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, page_shift, access_flags); + mutex_unlock(&dev->slow_path_mutex); + } if (IS_ERR(mr)) { err = PTR_ERR(mr); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 41f4c2afbcdd..d1e921816bfe 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -52,7 +52,6 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, - MLX5_IB_CACHE_LINE_SIZE = 64, }; static const u32 mlx5_ib_opcode[] = { @@ -2052,8 +2051,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, - to_mcq(init_attr->recv_cq)->mcq.cqn, - to_mcq(init_attr->send_cq)->mcq.cqn); + init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1, + init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1); qp->trans_qp.xrcdn = xrcdn; @@ -4815,6 +4814,14 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, udata->inlen)) return ERR_PTR(-EOPNOTSUPP); + if (init_attr->log_ind_tbl_size > + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { + mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", + init_attr->log_ind_tbl_size, + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); + return ERR_PTR(-EINVAL); + } + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); if (udata->outlen && udata->outlen < min_resp_len) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 6c00d04b8b28..c6fe89d79248 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL); + ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 35cbb17bec12..2baa45a8e401 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -65,7 +65,6 @@ MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -int max_mtu = 9000; int interrupt_mod_interval = 0; /* Interoperability */ diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index e7430c9254d3..85acd0843b50 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -83,6 +83,8 @@ #define NES_FIRST_QPN 64 #define NES_SW_CONTEXT_ALIGN 1024 +#define NES_MAX_MTU 9000 + #define NES_NIC_MAX_NICS 16 #define NES_MAX_ARP_TABLE_SIZE 4096 @@ -169,8 +171,6 @@ do { \ #include "nes_cm.h" #include "nes_mgt.h" -extern int max_mtu; -#define max_frame_len (max_mtu+ETH_HLEN) extern int interrupt_mod_interval; extern int nes_if_count; extern int mpa_version; diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 2b27d1351cf7..7f8597d6738b 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -981,20 +981,16 @@ static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) { struct nes_vnic *nesvnic = netdev_priv(netdev); struct nes_device *nesdev = nesvnic->nesdev; - int ret = 0; u8 jumbomode = 0; u32 nic_active; u32 nic_active_bit; u32 uc_all_active; u32 mc_all_active; - if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) - return -EINVAL; - netdev->mtu = new_mtu; nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; - if (netdev->mtu > 1500) { + if (netdev->mtu > ETH_DATA_LEN) { jumbomode=1; } nes_nic_init_timer_defaults(nesdev, jumbomode); @@ -1020,7 +1016,7 @@ static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); } - return ret; + return 0; } @@ -1658,7 +1654,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev, netdev->watchdog_timeo = NES_TX_TIMEOUT; netdev->irq = nesdev->pcidev->irq; - netdev->mtu = ETH_DATA_LEN; + netdev->max_mtu = NES_MAX_MTU; netdev->hard_header_len = ETH_HLEN; netdev->addr_len = ETH_ALEN; netdev->type = ARPHRD_ETHER; diff --git a/drivers/infiniband/hw/qedr/Kconfig b/drivers/infiniband/hw/qedr/Kconfig index 7c06d85568d4..6c9f3923e838 100644 --- a/drivers/infiniband/hw/qedr/Kconfig +++ b/drivers/infiniband/hw/qedr/Kconfig @@ -2,6 +2,7 @@ config INFINIBAND_QEDR tristate "QLogic RoCE driver" depends on 64BIT && QEDE select QED_LL2 + select QED_RDMA ---help--- This driver provides low-level InfiniBand over Ethernet support for QLogic QED host channel adapters (HCAs). diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 2d2b94fd3633..75f08624ac05 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -67,7 +67,8 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, for (got = 0; got < num_pages; got += ret) { ret = get_user_pages(start_page + got * PAGE_SIZE, - num_pages - got, 1, 1, + num_pages - got, + FOLL_WRITE | FOLL_FORCE, p + got, NULL); if (ret < 0) goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index a0b6ebee4d8a..1ccee6ea5bc3 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -111,6 +111,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int i; int flags; dma_addr_t pa; + unsigned int gup_flags; if (!can_do_mlock()) return -EPERM; @@ -135,6 +136,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, flags = IOMMU_READ | IOMMU_CACHE; flags |= (writable) ? IOMMU_WRITE : 0; + gup_flags = FOLL_WRITE; + gup_flags |= (writable) ? 0 : FOLL_FORCE; cur_base = addr & PAGE_MASK; ret = 0; @@ -142,7 +145,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - 1, !writable, page_list, NULL); + gup_flags, page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/infiniband/sw/rdmavt/dma.c b/drivers/infiniband/sw/rdmavt/dma.c index 01f71caa3ac4..f2cefb0d9180 100644 --- a/drivers/infiniband/sw/rdmavt/dma.c +++ b/drivers/infiniband/sw/rdmavt/dma.c @@ -90,9 +90,6 @@ static u64 rvt_dma_map_page(struct ib_device *dev, struct page *page, if (WARN_ON(!valid_dma_direction(direction))) return BAD_DMA_ADDRESS; - if (offset + size > PAGE_SIZE) - return BAD_DMA_ADDRESS; - addr = (u64)page_address(page); if (addr) addr += offset; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index b8258e4f0aea..ffff5a54cb34 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -243,10 +243,8 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, { int err; struct socket *sock; - struct udp_port_cfg udp_cfg; - struct udp_tunnel_sock_cfg tnl_cfg; - - memset(&udp_cfg, 0, sizeof(udp_cfg)); + struct udp_port_cfg udp_cfg = {0}; + struct udp_tunnel_sock_cfg tnl_cfg = {0}; if (ipv6) { udp_cfg.family = AF_INET6; @@ -264,10 +262,8 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, return ERR_PTR(err); } - tnl_cfg.sk_user_data = NULL; tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; - tnl_cfg.encap_destroy = NULL; /* Setup UDP tunnel */ setup_udp_tunnel_sock(net, sock, &tnl_cfg); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index b8036cfbce04..c3e60e4bde6e 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -522,6 +522,7 @@ static void rxe_qp_reset(struct rxe_qp *qp) if (qp->sq.queue) { __rxe_do_task(&qp->comp.task); __rxe_do_task(&qp->req.task); + rxe_queue_reset(qp->sq.queue); } /* cleanup attributes */ @@ -573,6 +574,7 @@ void rxe_qp_error(struct rxe_qp *qp) { qp->req.state = QP_STATE_ERROR; qp->resp.state = QP_STATE_ERROR; + qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_run_task(&qp->resp.task, 1); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c index 08274254eb88..d14bf496d62d 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.c +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -84,6 +84,15 @@ err1: return -EINVAL; } +inline void rxe_queue_reset(struct rxe_queue *q) +{ + /* queue is comprised from header and the memory + * of the actual queue. See "struct rxe_queue_buf" in rxe_queue.h + * reset only the queue itself and not the management header + */ + memset(q->buf->data, 0, q->buf_size - sizeof(struct rxe_queue_buf)); +} + struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, unsigned int elem_size) diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index 239fd609c31e..8c8641c87817 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -84,6 +84,8 @@ int do_mmap_info(struct rxe_dev *rxe, size_t buf_size, struct rxe_mmap_info **ip_p); +void rxe_queue_reset(struct rxe_queue *q); + struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, unsigned int elem_size); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 832846b73ea0..22bd9630dcd9 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -696,7 +696,8 @@ next_wqe: qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - goto complete; + __rxe_do_task(&qp->comp.task); + return 0; } payload = mtu; } @@ -745,13 +746,17 @@ err: wqe->status = IB_WC_LOC_PROT_ERR; wqe->state = wqe_state_error; -complete: - if (qp_type(qp) != IB_QPT_RC) { - while (rxe_completer(qp) == 0) - ; - } - - return 0; + /* + * IBA Spec. Section 10.7.3.1 SIGNALED COMPLETIONS + * ---------8<---------8<------------- + * ...Note that if a completion error occurs, a Work Completion + * will always be generated, even if the signaling + * indicator requests an Unsignaled Completion. + * ---------8<---------8<------------- + */ + wqe->wr.send_flags |= IB_SEND_SIGNALED; + __rxe_do_task(&qp->comp.task); + return -EAGAIN; exit: return -EAGAIN; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7b8d2d9e2263..da12717a3eb7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -63,6 +63,8 @@ enum ipoib_flush_level { enum { IPOIB_ENCAP_LEN = 4, + IPOIB_PSEUDO_LEN = 20, + IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ @@ -134,15 +136,21 @@ struct ipoib_header { u16 reserved; }; -struct ipoib_cb { - struct qdisc_skb_cb qdisc_cb; - u8 hwaddr[INFINIBAND_ALEN]; +struct ipoib_pseudo_header { + u8 hwaddr[INFINIBAND_ALEN]; }; -static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) +static inline void skb_add_pseudo_hdr(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb)); - return (struct ipoib_cb *)skb->cb; + char *data = skb_push(skb, IPOIB_PSEUDO_LEN); + + /* + * only the ipoib header is present now, make room for a dummy + * pseudo header and set skb field accordingly + */ + memset(data, 0, IPOIB_PSEUDO_LEN); + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_HARD_LEN); } /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 4ad297d3de89..339a1eecdfe3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level, #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) #define IPOIB_CM_RX_UPDATE_MASK (0x3) +#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN) + static struct ib_qp_attr ipoib_cm_err_attr = { .qp_state = IB_QPS_ERR }; @@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct sk_buff *skb; int i; - skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16)); if (unlikely(!skb)) return NULL; /* - * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the * IP header to a multiple of 16. */ - skb_reserve(skb, 12); + skb_reserve(skb, IPOIB_CM_RX_RESERVE); mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); @@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (wc->byte_len < IPOIB_CM_COPYBREAK) { int dlen = wc->byte_len; - small_skb = dev_alloc_skb(dlen + 12); + small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE); if (small_skb) { - skb_reserve(small_skb, 12); + skb_reserve(small_skb, IPOIB_CM_RX_RESERVE); ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], dlen, DMA_FROM_DEVICE); skb_copy_from_linear_data(skb, small_skb->data, dlen); @@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) copied: skb->protocol = ((struct ipoib_header *) skb->data)->proto; - skb_reset_mac_header(skb); - skb_pull(skb, IPOIB_ENCAP_LEN); + skb_add_pseudo_hdr(skb); ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index be11d5d5b8c1..830fecb6934c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); + skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN); if (unlikely(!skb)) return NULL; /* - * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte - * header. So we need 4 more bytes to get to 48 and align the - * IP header to a multiple of 16. + * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is + * 64 bytes aligned */ - skb_reserve(skb, 4); + skb_reserve(skb, sizeof(struct ipoib_pseudo_header)); mapping = priv->rx_ring[id].mapping; mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, @@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_pull(skb, IB_GRH_BYTES); skb->protocol = ((struct ipoib_header *) skb->data)->proto; - skb_reset_mac_header(skb); - skb_pull(skb, IPOIB_ENCAP_LEN); + skb_add_pseudo_hdr(skb); ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 5636fc3da6b8..c50794fb92db 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -292,6 +292,25 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) return dev; } +struct ipoib_walk_data { + const struct sockaddr *addr; + struct net_device *result; +}; + +static int ipoib_upper_walk(struct net_device *upper, void *_data) +{ + struct ipoib_walk_data *data = _data; + int ret = 0; + + if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { + dev_hold(upper); + data->result = upper; + ret = 1; + } + + return ret; +} + /** * Find a net_device matching the given address, which is an upper device of * the given net_device. @@ -304,27 +323,21 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { - struct net_device *upper, - *result = NULL; - struct list_head *iter; + struct ipoib_walk_data data = { + .addr = addr, + }; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); - result = dev; + data.result = dev; goto out; } - netdev_for_each_all_upper_dev_rcu(dev, upper, iter) { - if (ipoib_is_dev_match_addr_rcu(addr, upper)) { - dev_hold(upper); - result = upper; - break; - } - } + netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data); out: rcu_read_unlock(); - return result; + return data.result; } /* returns the number of IPoIB netdevs on top a given ipoib device matching a @@ -925,9 +938,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, ipoib_neigh_free(neigh); goto err_drop; } - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + if (skb_queue_len(&neigh->queue) < + IPOIB_MAX_PATH_REC_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, IPOIB_PSEUDO_LEN); __skb_queue_tail(&neigh->queue, skb); - else { + } else { ipoib_warn(priv, "queue length limit %d. Packet drop.\n", skb_queue_len(&neigh->queue)); goto err_drop; @@ -964,7 +980,7 @@ err_drop: } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, - struct ipoib_cb *cb) + struct ipoib_pseudo_header *phdr) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; @@ -972,16 +988,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, cb->hwaddr + 4); + path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { - path = path_rec_create(dev, cb->hwaddr + 4); + path = path_rec_create(dev, phdr->hwaddr + 4); new_path = 1; } if (path) { if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, IPOIB_PSEUDO_LEN); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -1009,10 +1027,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, be16_to_cpu(path->pathrec.dlid)); spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, IPOIB_PSEUDO_LEN); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -1026,13 +1046,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_neigh *neigh; - struct ipoib_cb *cb = ipoib_skb_cb(skb); + struct ipoib_pseudo_header *phdr; struct ipoib_header *header; unsigned long flags; + phdr = (struct ipoib_pseudo_header *) skb->data; + skb_pull(skb, sizeof(*phdr)); header = (struct ipoib_header *) skb->data; - if (unlikely(cb->hwaddr[4] == 0xff)) { + if (unlikely(phdr->hwaddr[4] == 0xff)) { /* multicast, arrange "if" according to probability */ if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && @@ -1045,13 +1067,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } /* Add in the P_Key for multicast*/ - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; - cb->hwaddr[9] = priv->pkey & 0xff; + phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; + phdr->hwaddr[9] = priv->pkey & 0xff; - neigh = ipoib_neigh_get(dev, cb->hwaddr); + neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (likely(neigh)) goto send_using_neigh; - ipoib_mcast_send(dev, cb->hwaddr, skb); + ipoib_mcast_send(dev, phdr->hwaddr, skb); return NETDEV_TX_OK; } @@ -1060,16 +1082,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_TIPC): - neigh = ipoib_neigh_get(dev, cb->hwaddr); + neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) { - neigh_add_path(skb, cb->hwaddr, dev); + neigh_add_path(skb, phdr->hwaddr, dev); return NETDEV_TX_OK; } break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): /* for unicast ARP and RARP should always perform path find */ - unicast_arp_send(skb, dev, cb); + unicast_arp_send(skb, dev, phdr); return NETDEV_TX_OK; default: /* ethertype not supported by IPoIB */ @@ -1086,11 +1108,13 @@ send_using_neigh: goto unref; } } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr)); goto unref; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, sizeof(*phdr)); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); @@ -1122,8 +1146,8 @@ static int ipoib_hard_header(struct sk_buff *skb, unsigned short type, const void *daddr, const void *saddr, unsigned len) { + struct ipoib_pseudo_header *phdr; struct ipoib_header *header; - struct ipoib_cb *cb = ipoib_skb_cb(skb); header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -1132,12 +1156,13 @@ static int ipoib_hard_header(struct sk_buff *skb, /* * we don't rely on dst_entry structure, always stuff the - * destination address into skb->cb so we can figure out where + * destination address into skb hard header so we can figure out where * to send the packet later. */ - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); + phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr)); + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); - return sizeof *header; + return IPOIB_HARD_LEN; } static void ipoib_set_mcast_list(struct net_device *dev) @@ -1759,7 +1784,7 @@ void ipoib_setup(struct net_device *dev) dev->flags |= IFF_BROADCAST | IFF_MULTICAST; - dev->hard_header_len = IPOIB_ENCAP_LEN; + dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = ipoib_sendq_size * 2; @@ -2004,6 +2029,7 @@ static struct net_device *ipoib_add_port(const char *format, /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->dev->max_mtu = IPOIB_CM_MTU; priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index d3394b6add24..1909dd252c94 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) __ipoib_mcast_add(dev, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } - if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, sizeof(struct ipoib_pseudo_header)); skb_queue_tail(&mcast->pkt_queue, skb); - else { + } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } |