diff options
Diffstat (limited to 'net')
318 files changed, 28939 insertions, 6878 deletions
diff --git a/net/802/psnap.c b/net/802/psnap.c index 70980baeb682..6fea0750662b 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -29,7 +29,7 @@ static struct llc_sap *snap_sap; /* * Find a snap client by matching the 5 bytes. */ -static struct datalink_proto *find_snap_client(unsigned char *desc) +static struct datalink_proto *find_snap_client(const unsigned char *desc) { struct datalink_proto *proto = NULL, *p; @@ -51,7 +51,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, int rc = 1; struct datalink_proto *proto; static struct packet_type snap_packet_type = { - .type = __constant_htons(ETH_P_SNAP), + .type = cpu_to_be16(ETH_P_SNAP), }; if (unlikely(!pskb_may_pull(skb, 5))) @@ -95,15 +95,16 @@ static int snap_request(struct datalink_proto *dl, EXPORT_SYMBOL(register_snap_client); EXPORT_SYMBOL(unregister_snap_client); -static char snap_err_msg[] __initdata = +static const char snap_err_msg[] __initconst = KERN_CRIT "SNAP - unable to register with 802.2\n"; static int __init snap_init(void) { snap_sap = llc_sap_open(0xAA, snap_rcv); - - if (!snap_sap) + if (!snap_sap) { printk(snap_err_msg); + return -EBUSY; + } return 0; } @@ -121,7 +122,7 @@ module_exit(snap_exit); /* * Register SNAP clients. We don't yet use this for IP. */ -struct datalink_proto *register_snap_client(unsigned char *desc, +struct datalink_proto *register_snap_client(const unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, @@ -136,7 +137,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc, proto = kmalloc(sizeof(*proto), GFP_ATOMIC); if (proto) { - memcpy(proto->type, desc,5); + memcpy(proto->type, desc, 5); proto->rcvfunc = rcvfunc; proto->header_length = 5 + 3; /* snap + 802.2 */ proto->request = snap_request; diff --git a/net/802/tr.c b/net/802/tr.c index f47ae289d83b..e7eb13084d71 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -486,6 +486,7 @@ static struct rif_cache *rif_get_idx(loff_t pos) } static void *rif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&rif_lock) { spin_lock_irq(&rif_lock); @@ -517,6 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void rif_seq_stop(struct seq_file *seq, void *v) + __releases(&rif_lock) { spin_unlock_irq(&rif_lock); } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 41e8f65bd3f0..2b7390e377b3 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -51,8 +51,8 @@ const char vlan_version[] = DRV_VERSION; static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>"; static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>"; -static struct packet_type vlan_packet_type = { - .type = __constant_htons(ETH_P_8021Q), +static struct packet_type vlan_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_8021Q), .func = vlan_skb_recv, /* VLAN receive method */ }; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2886d2fb9ab5..654e45f5719d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -89,38 +89,27 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, goto drop; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = p->dev == skb->dev; + NAPI_GRO_CB(p)->same_flow = + p->dev == skb->dev && !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } return dev_gro_receive(napi, skb); drop: - return 2; + return GRO_DROP; } int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; + if (netpoll_rx_on(skb)) + return vlan_hwaccel_receive_skb(skb, grp, vlan_tci); - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); + skb_gro_reset_offset(skb); - case 2: - err = NET_RX_DROP; - /* fall through */ - - case 1: - kfree_skb(skb); - break; - } - - return err; + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); @@ -128,30 +117,14 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct napi_gro_fraginfo *info) { struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; if (!skb) - goto out; - - if (netpoll_receive_skb(skb)) - goto out; - - err = NET_RX_SUCCESS; - - switch (vlan_gro_common(napi, grp, vlan_tci, skb)) { - case -1: - return netif_receive_skb(skb); - - case 2: - err = NET_RX_DROP; - /* fall through */ + return NET_RX_DROP; - case 1: - napi_reuse_skb(napi, skb); - break; - } + if (netpoll_rx_on(skb)) + return vlan_hwaccel_receive_skb(skb, grp, vlan_tci); -out: - return err; + return napi_frags_finish(napi, skb, + vlan_gro_common(napi, grp, vlan_tci, skb)); } EXPORT_SYMBOL(vlan_gro_frags); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1df0356f242b..c613ed08a5ee 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -417,7 +417,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len) oldfs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos); + ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos); set_fs(oldfs); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) diff --git a/net/Kconfig b/net/Kconfig index cdb8fdef6c4a..93998a9c39c2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -24,9 +24,6 @@ if NET menu "Networking options" -config COMPAT_NET_DEV_OPS - def_bool y - source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" @@ -171,6 +168,7 @@ endif source "net/dccp/Kconfig" source "net/sctp/Kconfig" +source "net/rds/Kconfig" source "net/tipc/Kconfig" source "net/atm/Kconfig" source "net/802/Kconfig" @@ -185,6 +183,7 @@ source "net/x25/Kconfig" source "net/lapb/Kconfig" source "net/econet/Kconfig" source "net/wanrouter/Kconfig" +source "net/phonet/Kconfig" source "net/sched/Kconfig" source "net/dcb/Kconfig" @@ -220,6 +219,17 @@ config NET_TCPPROBE To compile this code as a module, choose M here: the module will be called tcp_probe. +config NET_DROP_MONITOR + boolean "Network packet drop alerting service" + depends on INET && EXPERIMENTAL && TRACEPOINTS + ---help--- + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. + endmenu endmenu @@ -229,7 +239,6 @@ source "net/can/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" -source "net/phonet/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 0fcce89d7169..9e00a55a901b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -49,6 +49,7 @@ obj-y += 8021q/ endif obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ +obj-$(CONFIG_RDS) += rds/ obj-y += wireless/ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_TIPC) += tipc/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 5abce07fb50a..3e0671df3a3f 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1860,13 +1860,13 @@ static struct notifier_block ddp_notifier = { .notifier_call = ddp_device_event, }; -static struct packet_type ltalk_packet_type = { - .type = __constant_htons(ETH_P_LOCALTALK), +static struct packet_type ltalk_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_LOCALTALK), .func = ltalk_rcv, }; -static struct packet_type ppptalk_packet_type = { - .type = __constant_htons(ETH_P_PPPTALK), +static struct packet_type ppptalk_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_PPPTALK), .func = atalk_rcv, }; @@ -1877,7 +1877,7 @@ EXPORT_SYMBOL(aarp_send_ddp); EXPORT_SYMBOL(atrtr_get_dev); EXPORT_SYMBOL(atalk_find_dev_addr); -static char atalk_err_snap[] __initdata = +static const char atalk_err_snap[] __initconst = KERN_CRIT "Unable to register DDP with SNAP.\n"; /* Called by proto.c on kernel start up */ diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c index d856a62ab50f..72277d70c980 100644 --- a/net/appletalk/dev.c +++ b/net/appletalk/dev.c @@ -9,22 +9,20 @@ #include <linux/if_arp.h> #include <linux/if_ltalk.h> +#ifdef CONFIG_COMPAT_NET_DEV_OPS static int ltalk_change_mtu(struct net_device *dev, int mtu) { return -EINVAL; } - -static int ltalk_mac_addr(struct net_device *dev, void *addr) -{ - return -EINVAL; -} +#endif static void ltalk_setup(struct net_device *dev) { /* Fill in the fields of the device structure with localtalk-generic values. */ +#ifdef CONFIG_COMPAT_NET_DEV_OPS dev->change_mtu = ltalk_change_mtu; - dev->set_mac_address = ltalk_mac_addr; +#endif dev->type = ARPHRD_LOCALTLK; dev->hard_header_len = LTALK_HLEN; diff --git a/net/atm/br2684.c b/net/atm/br2684.c index ea9438fc6855..334fcd4a4ea4 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -83,7 +83,6 @@ struct br2684_dev { struct list_head br2684_devs; int number; struct list_head brvccs; /* one device <=> one vcc (before xmas) */ - struct net_device_stats stats; int mac_was_set; enum br2684_payload payload; }; @@ -148,9 +147,10 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s) * the way for multiple vcc's per itf. Returns true if we can send, * otherwise false */ -static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, +static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev, struct br2684_vcc *brvcc) { + struct br2684_dev *brdev = BRPRIV(dev); struct atm_vcc *atmvcc; int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2; @@ -211,8 +211,8 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, } atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc); ATM_SKB(skb)->atm_options = atmvcc->atm_options; - brdev->stats.tx_packets++; - brdev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; atmvcc->send(atmvcc, skb); return 1; } @@ -233,14 +233,14 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) brvcc = pick_outgoing_vcc(skb, brdev); if (brvcc == NULL) { pr_debug("no vcc attached to dev %s\n", dev->name); - brdev->stats.tx_errors++; - brdev->stats.tx_carrier_errors++; + dev->stats.tx_errors++; + dev->stats.tx_carrier_errors++; /* netif_stop_queue(dev); */ dev_kfree_skb(skb); read_unlock(&devs_lock); return 0; } - if (!br2684_xmit_vcc(skb, brdev, brvcc)) { + if (!br2684_xmit_vcc(skb, dev, brvcc)) { /* * We should probably use netif_*_queue() here, but that * involves added complication. We need to walk before @@ -248,27 +248,20 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) * * Don't free here! this pointer might be no longer valid! */ - brdev->stats.tx_errors++; - brdev->stats.tx_fifo_errors++; + dev->stats.tx_errors++; + dev->stats.tx_fifo_errors++; } read_unlock(&devs_lock); return 0; } -static struct net_device_stats *br2684_get_stats(struct net_device *dev) -{ - pr_debug("br2684_get_stats\n"); - return &BRPRIV(dev)->stats; -} - /* * We remember when the MAC gets set, so we don't override it later with * the ESI of the ATM card of the first VC */ -static int (*my_eth_mac_addr) (struct net_device *, void *); static int br2684_mac_addr(struct net_device *dev, void *p) { - int err = my_eth_mac_addr(dev, p); + int err = eth_mac_addr(dev, p); if (!err) BRPRIV(dev)->mac_was_set = 1; return err; @@ -430,17 +423,17 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) /* sigh, interface is down? */ if (unlikely(!(net_dev->flags & IFF_UP))) goto dropped; - brdev->stats.rx_packets++; - brdev->stats.rx_bytes += skb->len; + net_dev->stats.rx_packets++; + net_dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); return; dropped: - brdev->stats.rx_dropped++; + net_dev->stats.rx_dropped++; goto free_skb; error: - brdev->stats.rx_errors++; + net_dev->stats.rx_errors++; free_skb: dev_kfree_skb(skb); return; @@ -531,8 +524,8 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) skb->next = skb->prev = NULL; br2684_push(atmvcc, skb); - BRPRIV(skb->dev)->stats.rx_bytes -= skb->len; - BRPRIV(skb->dev)->stats.rx_packets--; + skb->dev->stats.rx_bytes -= skb->len; + skb->dev->stats.rx_packets--; skb = next; } @@ -544,17 +537,20 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) return err; } +static const struct net_device_ops br2684_netdev_ops = { + .ndo_start_xmit = br2684_start_xmit, + .ndo_set_mac_address = br2684_mac_addr, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + static void br2684_setup(struct net_device *netdev) { struct br2684_dev *brdev = BRPRIV(netdev); ether_setup(netdev); - brdev->net_dev = netdev; - my_eth_mac_addr = netdev->set_mac_address; - netdev->set_mac_address = br2684_mac_addr; - netdev->hard_start_xmit = br2684_start_xmit; - netdev->get_stats = br2684_get_stats; + netdev->netdev_ops = &br2684_netdev_ops; INIT_LIST_HEAD(&brdev->brvccs); } @@ -565,10 +561,8 @@ static void br2684_setup_routed(struct net_device *netdev) brdev->net_dev = netdev; netdev->hard_header_len = 0; - my_eth_mac_addr = netdev->set_mac_address; - netdev->set_mac_address = br2684_mac_addr; - netdev->hard_start_xmit = br2684_start_xmit; - netdev->get_stats = br2684_get_stats; + + netdev->netdev_ops = &br2684_netdev_ops; netdev->addr_len = 0; netdev->mtu = 1500; netdev->type = ARPHRD_PPP; diff --git a/net/atm/clip.c b/net/atm/clip.c index 2d33a83be799..3dc0a3a42a57 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -214,15 +214,15 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) skb->protocol = ((__be16 *) skb->data)[3]; skb_pull(skb, RFC1483LLC_LEN); if (skb->protocol == htons(ETH_P_ARP)) { - PRIV(skb->dev)->stats.rx_packets++; - PRIV(skb->dev)->stats.rx_bytes += skb->len; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; clip_arp_rcv(skb); return; } } clip_vcc->last_use = jiffies; - PRIV(skb->dev)->stats.rx_packets++; - PRIV(skb->dev)->stats.rx_bytes += skb->len; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } @@ -372,7 +372,7 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) if (!skb->dst) { printk(KERN_ERR "clip_start_xmit: skb->dst == NULL\n"); dev_kfree_skb(skb); - clip_priv->stats.tx_dropped++; + dev->stats.tx_dropped++; return 0; } if (!skb->dst->neighbour) { @@ -380,13 +380,13 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->dst->neighbour = clip_find_neighbour(skb->dst, 1); if (!skb->dst->neighbour) { dev_kfree_skb(skb); /* lost that one */ - clip_priv->stats.tx_dropped++; + dev->stats.tx_dropped++; return 0; } #endif printk(KERN_ERR "clip_start_xmit: NO NEIGHBOUR !\n"); dev_kfree_skb(skb); - clip_priv->stats.tx_dropped++; + dev->stats.tx_dropped++; return 0; } entry = NEIGH2ENTRY(skb->dst->neighbour); @@ -400,7 +400,7 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_queue_tail(&entry->neigh->arp_queue, skb); else { dev_kfree_skb(skb); - clip_priv->stats.tx_dropped++; + dev->stats.tx_dropped++; } return 0; } @@ -423,8 +423,8 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) printk(KERN_WARNING "clip_start_xmit: XOFF->XOFF transition\n"); return 0; } - clip_priv->stats.tx_packets++; - clip_priv->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; vcc->send(vcc, skb); if (atm_may_send(vcc, 0)) { entry->vccs->xoff = 0; @@ -443,11 +443,6 @@ static int clip_start_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } -static struct net_device_stats *clip_get_stats(struct net_device *dev) -{ - return &PRIV(dev)->stats; -} - static int clip_mkip(struct atm_vcc *vcc, int timeout) { struct clip_vcc *clip_vcc; @@ -501,8 +496,8 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout) skb_get(skb); clip_push(vcc, skb); - PRIV(skb->dev)->stats.rx_packets--; - PRIV(skb->dev)->stats.rx_bytes -= len; + skb->dev->stats.rx_packets--; + skb->dev->stats.rx_bytes -= len; kfree_skb(skb); } @@ -557,11 +552,13 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) return error; } +static const struct net_device_ops clip_netdev_ops = { + .ndo_start_xmit = clip_start_xmit, +}; + static void clip_setup(struct net_device *dev) { - dev->hard_start_xmit = clip_start_xmit; - /* sg_xmit ... */ - dev->get_stats = clip_get_stats; + dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; @@ -621,7 +618,7 @@ static int clip_device_event(struct notifier_block *this, unsigned long event, } /* ignore non-CLIP devices */ - if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit) + if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { diff --git a/net/atm/lec.c b/net/atm/lec.c index e5e301550e8a..199b6bb79f42 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -62,7 +62,6 @@ static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 }; static int lec_open(struct net_device *dev); static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev); static int lec_close(struct net_device *dev); -static struct net_device_stats *lec_get_stats(struct net_device *dev); static void lec_init(struct net_device *dev); static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr); @@ -218,28 +217,28 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) static int lec_open(struct net_device *dev) { - struct lec_priv *priv = netdev_priv(dev); - netif_start_queue(dev); - memset(&priv->stats, 0, sizeof(struct net_device_stats)); + memset(&dev->stats, 0, sizeof(struct net_device_stats)); return 0; } -static __inline__ void -lec_send(struct atm_vcc *vcc, struct sk_buff *skb, struct lec_priv *priv) +static void +lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { + struct net_device *dev = skb->dev; + ATM_SKB(skb)->vcc = vcc; ATM_SKB(skb)->atm_options = vcc->atm_options; atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); if (vcc->send(vcc, skb) < 0) { - priv->stats.tx_dropped++; + dev->stats.tx_dropped++; return; } - priv->stats.tx_packets++; - priv->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; } static void lec_tx_timeout(struct net_device *dev) @@ -270,7 +269,7 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev) pr_debug("lec_start_xmit called\n"); if (!priv->lecd) { printk("%s:No lecd attached\n", dev->name); - priv->stats.tx_errors++; + dev->stats.tx_errors++; netif_stop_queue(dev); return -EUNATCH; } @@ -345,7 +344,7 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev) GFP_ATOMIC); dev_kfree_skb(skb); if (skb2 == NULL) { - priv->stats.tx_dropped++; + dev->stats.tx_dropped++; return 0; } skb = skb2; @@ -380,7 +379,7 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev) ("%s:lec_start_xmit: tx queue full or no arp entry, dropping, ", dev->name); pr_debug("MAC address %pM\n", lec_h->h_dest); - priv->stats.tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); } goto out; @@ -392,10 +391,10 @@ static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev) while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) { pr_debug("lec.c: emptying tx queue, "); pr_debug("MAC address %pM\n", lec_h->h_dest); - lec_send(vcc, skb2, priv); + lec_send(vcc, skb2); } - lec_send(vcc, skb, priv); + lec_send(vcc, skb); if (!atm_may_send(vcc, 0)) { struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); @@ -427,15 +426,6 @@ static int lec_close(struct net_device *dev) return 0; } -/* - * Get the current statistics. - * This may be called with the card open or closed. - */ -static struct net_device_stats *lec_get_stats(struct net_device *dev) -{ - return &((struct lec_priv *)netdev_priv(dev))->stats; -} - static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; @@ -512,7 +502,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; - if (dev->change_mtu(dev, mesg->content.config.mtu)) + if (dev_set_mtu(dev, mesg->content.config.mtu)) printk("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); priv->is_proxy = mesg->content.config.is_proxy; @@ -677,17 +667,19 @@ static void lec_set_multicast_list(struct net_device *dev) return; } +static const struct net_device_ops lec_netdev_ops = { + .ndo_open = lec_open, + .ndo_stop = lec_close, + .ndo_start_xmit = lec_start_xmit, + .ndo_change_mtu = lec_change_mtu, + .ndo_tx_timeout = lec_tx_timeout, + .ndo_set_multicast_list = lec_set_multicast_list, +}; + + static void lec_init(struct net_device *dev) { - dev->change_mtu = lec_change_mtu; - dev->open = lec_open; - dev->stop = lec_close; - dev->hard_start_xmit = lec_start_xmit; - dev->tx_timeout = lec_tx_timeout; - - dev->get_stats = lec_get_stats; - dev->set_multicast_list = lec_set_multicast_list; - dev->do_ioctl = NULL; + dev->netdev_ops = &lec_netdev_ops; printk("%s: Initialized!\n", dev->name); } @@ -810,8 +802,8 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) else #endif skb->protocol = eth_type_trans(skb, dev); - priv->stats.rx_packets++; - priv->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } @@ -1887,7 +1879,7 @@ restart: lec_arp_hold(entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait)) != NULL) - lec_send(vcc, skb, entry->priv); + lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); @@ -2305,7 +2297,7 @@ restart: lec_arp_hold(entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait)) != NULL) - lec_send(vcc, skb, entry->priv); + lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); diff --git a/net/atm/lec.h b/net/atm/lec.h index 0d376682c1a3..9d14d196cc1d 100644 --- a/net/atm/lec.h +++ b/net/atm/lec.h @@ -69,7 +69,6 @@ struct lane2_ops { #define LEC_ARP_TABLE_SIZE 16 struct lec_priv { - struct net_device_stats stats; unsigned short lecid; /* Lecid of this client */ struct hlist_head lec_arp_empty_ones; /* Used for storing VCC's that don't have a MAC address attached yet */ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 039d5cc72c3d..e5bf11453a18 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -286,33 +286,32 @@ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev) { dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name); - if (dev->hard_start_xmit == NULL) { - printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n", - dev->name); - return; + if (!dev->netdev_ops) + printk("mpoa: (%s) start_mpc not starting\n", dev->name); + else { + mpc->old_ops = dev->netdev_ops; + mpc->new_ops = *mpc->old_ops; + mpc->new_ops.ndo_start_xmit = mpc_send_packet; + dev->netdev_ops = &mpc->new_ops; } - mpc->old_hard_start_xmit = dev->hard_start_xmit; - dev->hard_start_xmit = mpc_send_packet; - - return; } static void stop_mpc(struct mpoa_client *mpc) { - + struct net_device *dev = mpc->dev; dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name); /* Lets not nullify lec device's dev->hard_start_xmit */ - if (mpc->dev->hard_start_xmit != mpc_send_packet) { + if (dev->netdev_ops != &mpc->new_ops) { dprintk(" mpc already stopped, not fatal\n"); return; } dprintk("\n"); - mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit; - mpc->old_hard_start_xmit = NULL; - /* close_shortcuts(mpc); ??? FIXME */ - return; + dev->netdev_ops = mpc->old_ops; + mpc->old_ops = NULL; + + /* close_shortcuts(mpc); ??? FIXME */ } static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); @@ -531,7 +530,6 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) */ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) { - int retval; struct mpoa_client *mpc; struct ethhdr *eth; int i = 0; @@ -561,9 +559,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) } non_ip: - retval = mpc->old_hard_start_xmit(skb,dev); - - return retval; + return mpc->old_ops->ndo_start_xmit(skb,dev); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/atm/mpc.h b/net/atm/mpc.h index 24c386c35f57..0919a88bbc70 100644 --- a/net/atm/mpc.h +++ b/net/atm/mpc.h @@ -15,7 +15,7 @@ struct mpoa_client { struct mpoa_client *next; struct net_device *dev; /* lec in question */ int dev_num; /* e.g. 2 for lec2 */ - int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + struct atm_vcc *mpoad_vcc; /* control channel to mpoad */ uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */ uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ @@ -31,6 +31,9 @@ struct mpoa_client { uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ int number_of_mps_macs; /* number of the above MAC addresses */ struct mpc_parameters parameters; /* parameters for this client */ + + const struct net_device_ops *old_ops; + struct net_device_ops new_ops; }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 00d9e5e13158..7da5ebb84e97 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1435,6 +1435,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, size_t size; int lv, err, addr_len = msg->msg_namelen; + /* AX.25 empty data frame has no meaning : don't send */ + if (len == 0) { + return (0); + } + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; @@ -1529,10 +1534,8 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, dp = ax25->digipeat; } - SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n"); - /* Build a packet */ - SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n"); + SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. Building packet.\n"); /* Assume the worst case */ size = len + ax25->ax25_dev->dev->hard_header_len; @@ -1636,6 +1639,13 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); copied = skb->len; + /* AX.25 empty data frame has no meaning : ignore it */ + if (copied == 0) { + err = copied; + skb_free_datagram(sk, skb); + goto out; + } + if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; @@ -1985,9 +1995,8 @@ static const struct proto_ops ax25_proto_ops = { /* * Called by socket.c on kernel start up */ -static struct packet_type ax25_packet_type = { - .type = __constant_htons(ETH_P_AX25), - .dev = NULL, /* All devices */ +static struct packet_type ax25_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_AX25), .func = ax25_kiss_rcv, }; diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index 8443af57a374..71338f112108 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -61,27 +61,24 @@ void ax25_protocol_release(unsigned int pid) write_lock_bh(&protocol_list_lock); protocol = protocol_list; - if (protocol == NULL) { - write_unlock_bh(&protocol_list_lock); - return; - } + if (protocol == NULL) + goto out; if (protocol->pid == pid) { protocol_list = protocol->next; - write_unlock_bh(&protocol_list_lock); - return; + goto out; } while (protocol != NULL && protocol->next != NULL) { if (protocol->next->pid == pid) { s = protocol->next; protocol->next = protocol->next->next; - write_unlock_bh(&protocol_list_lock); - return; + goto out; } protocol = protocol->next; } +out: write_unlock_bh(&protocol_list_lock); } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 744ed3f07ef3..02b9baa1930b 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -41,14 +41,13 @@ #include <net/bluetooth/bluetooth.h> -#define VERSION "2.14" +#define VERSION "2.15" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 static struct net_proto_family *bt_proto[BT_MAX_PROTO]; static DEFINE_RWLOCK(bt_proto_lock); -#ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; static const char *bt_key_strings[BT_MAX_PROTO] = { "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", @@ -86,11 +85,6 @@ static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } -#else -static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) -{ -} -#endif int bt_sock_register(int proto, struct net_proto_family *ops) { @@ -217,7 +211,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) continue; } - if (sk->sk_state == BT_CONNECTED || !newsock) { + if (sk->sk_state == BT_CONNECTED || !newsock || + bt_sk(parent)->defer_setup) { bt_accept_unlink(sk); if (newsock) sock_graft(sk, newsock); @@ -232,7 +227,7 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) EXPORT_SYMBOL(bt_accept_dequeue); int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) + struct msghdr *msg, size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -277,7 +272,9 @@ static inline unsigned int bt_accept_poll(struct sock *parent) list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); - if (sk->sk_state == BT_CONNECTED) + if (sk->sk_state == BT_CONNECTED || + (bt_sk(parent)->defer_setup && + sk->sk_state == BT_CONNECT2)) return POLLIN | POLLRDNORM; } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index c9cac7719efe..0073ec8495da 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -126,8 +126,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const session->reassembly[id] = nskb; - if (skb) - kfree_skb(skb); + kfree_skb(skb); } static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a4a789f24c8d..1181db08d9de 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -123,6 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle) conn->state = BT_CONNECT; conn->out = 1; + conn->attempt++; + cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); @@ -139,6 +141,8 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle) conn->state = BT_CONNECT; conn->out = 1; + conn->attempt++; + cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); @@ -155,6 +159,7 @@ static void hci_conn_timeout(unsigned long arg) { struct hci_conn *conn = (void *) arg; struct hci_dev *hdev = conn->hdev; + __u8 reason; BT_DBG("conn %p state %d", conn, conn->state); @@ -173,7 +178,8 @@ static void hci_conn_timeout(unsigned long arg) break; case BT_CONFIG: case BT_CONNECTED: - hci_acl_disconn(conn, 0x13); + reason = hci_proto_disconn_ind(conn); + hci_acl_disconn(conn, reason); break; default: conn->state = BT_CLOSED; @@ -216,12 +222,13 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) break; case SCO_LINK: if (lmp_esco_capable(hdev)) - conn->pkt_type = hdev->esco_type & SCO_ESCO_MASK; + conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | + (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; break; case ESCO_LINK: - conn->pkt_type = hdev->esco_type; + conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; break; } @@ -280,6 +287,8 @@ int hci_conn_del(struct hci_conn *conn) skb_queue_purge(&conn->data_q); + hci_conn_del_sysfs(conn); + return 0; } @@ -325,7 +334,7 @@ EXPORT_SYMBOL(hci_get_route); /* Create SCO or ACL connection. * Device _must_ be locked */ -struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type) +struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 sec_level, __u8 auth_type) { struct hci_conn *acl; struct hci_conn *sco; @@ -340,6 +349,7 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 hci_conn_hold(acl); if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + acl->sec_level = sec_level; acl->auth_type = auth_type; hci_acl_connect(acl); } @@ -385,51 +395,59 @@ int hci_conn_check_link_mode(struct hci_conn *conn) EXPORT_SYMBOL(hci_conn_check_link_mode); /* Authenticate remote device */ -int hci_conn_auth(struct hci_conn *conn) +static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("conn %p", conn); - if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) { - if (!(conn->auth_type & 0x01)) { - conn->auth_type |= 0x01; - conn->link_mode &= ~HCI_LM_AUTH; - } - } - - if (conn->link_mode & HCI_LM_AUTH) + if (sec_level > conn->sec_level) + conn->sec_level = sec_level; + else if (conn->link_mode & HCI_LM_AUTH) return 1; + conn->auth_type = auth_type; + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); } + return 0; } -EXPORT_SYMBOL(hci_conn_auth); -/* Enable encryption */ -int hci_conn_encrypt(struct hci_conn *conn) +/* Enable security */ +int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("conn %p", conn); + if (sec_level == BT_SECURITY_SDP) + return 1; + + if (sec_level == BT_SECURITY_LOW) { + if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) + return hci_conn_auth(conn, sec_level, auth_type); + else + return 1; + } + if (conn->link_mode & HCI_LM_ENCRYPT) - return hci_conn_auth(conn); + return hci_conn_auth(conn, sec_level, auth_type); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) return 0; - if (hci_conn_auth(conn)) { + if (hci_conn_auth(conn, sec_level, auth_type)) { struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 1; hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); } + return 0; } -EXPORT_SYMBOL(hci_conn_encrypt); +EXPORT_SYMBOL(hci_conn_security); /* Change link key */ int hci_conn_change_link_key(struct hci_conn *conn) @@ -442,12 +460,13 @@ int hci_conn_change_link_key(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp); } + return 0; } EXPORT_SYMBOL(hci_conn_change_link_key); /* Switch role */ -int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) +int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("conn %p", conn); @@ -460,6 +479,7 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) cp.role = role; hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); } + return 0; } EXPORT_SYMBOL(hci_conn_switch_role); @@ -542,9 +562,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev) c->state = BT_CLOSED; - hci_conn_del_sysfs(c); - - hci_proto_disconn_ind(c, 0x16); + hci_proto_disconn_cfm(c, 0x16); hci_conn_del(c); } } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ba78cc1eb8d9..cd061510b6bd 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1565,8 +1565,7 @@ static void hci_cmd_task(unsigned long arg) /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) { - if (hdev->sent_cmd) - kfree_skb(hdev->sent_cmd); + kfree_skb(hdev->sent_cmd); if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) { atomic_dec(&hdev->cmd_cnt); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f91ba690f5d2..55534244c3a0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -484,6 +484,15 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb if (hdev->features[4] & LMP_EV5) hdev->esco_type |= (ESCO_EV5); + if (hdev->features[5] & LMP_EDR_ESCO_2M) + hdev->esco_type |= (ESCO_2EV3); + + if (hdev->features[5] & LMP_EDR_ESCO_3M) + hdev->esco_type |= (ESCO_3EV3); + + if (hdev->features[5] & LMP_EDR_3S_ESCO) + hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); + BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, hdev->features[0], hdev->features[1], hdev->features[2], hdev->features[3], @@ -914,7 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (ev->status) { hci_proto_connect_cfm(conn, ev->status); hci_conn_del(conn); - } + } else if (ev->link_type != ACL_LINK) + hci_proto_connect_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); @@ -1009,9 +1019,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff if (conn) { conn->state = BT_CLOSED; - hci_conn_del_sysfs(conn); - - hci_proto_disconn_ind(conn, ev->reason); + hci_proto_disconn_cfm(conn, ev->reason); hci_conn_del(conn); } @@ -1600,7 +1608,8 @@ static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b if (conn->state == BT_CONFIG) { if (!ev->status && hdev->ssp_mode > 0 && - conn->ssp_mode > 0 && conn->out) { + conn->ssp_mode > 0 && conn->out && + conn->sec_level != BT_SECURITY_SDP) { struct hci_cp_auth_requested cp; cp.handle = ev->handle; hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, @@ -1637,6 +1646,13 @@ static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_bu conn->type = SCO_LINK; } + if (conn->out && ev->status == 0x1c && conn->attempt < 2) { + conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | + (hdev->esco_type & EDR_ESCO_MASK); + hci_setup_sync(conn, conn->link->handle); + goto unlock; + } + if (!ev->status) { conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index b93748e224ff..ca4d3b40d5ce 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -50,9 +50,10 @@ #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> -#define VERSION "2.11" +#define VERSION "2.13" -static u32 l2cap_feat_mask = 0x0000; +static u32 l2cap_feat_mask = 0x0080; +static u8 l2cap_fixed_chan[8] = { 0x02, }; static const struct proto_ops l2cap_sock_ops; @@ -77,9 +78,10 @@ static void l2cap_sock_timeout(unsigned long arg) bh_lock_sock(sk); - if (sk->sk_state == BT_CONNECT && - (l2cap_pi(sk)->link_mode & (L2CAP_LM_AUTH | - L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE))) + if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONFIG) + reason = ECONNREFUSED; + else if (sk->sk_state == BT_CONNECT && + l2cap_pi(sk)->sec_level != BT_SECURITY_SDP) reason = ECONNREFUSED; else reason = ETIMEDOUT; @@ -204,6 +206,8 @@ static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct so BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid); + conn->disc_reason = 0x13; + l2cap_pi(sk)->conn = conn; if (sk->sk_type == SOCK_SEQPACKET) { @@ -259,18 +263,35 @@ static void l2cap_chan_del(struct sock *sk, int err) } /* Service level security */ -static inline int l2cap_check_link_mode(struct sock *sk) +static inline int l2cap_check_security(struct sock *sk) { struct l2cap_conn *conn = l2cap_pi(sk)->conn; + __u8 auth_type; - if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || - (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) - return hci_conn_encrypt(conn->hcon); + if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) + auth_type = HCI_AT_NO_BONDING_MITM; + else + auth_type = HCI_AT_NO_BONDING; - if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH) - return hci_conn_auth(conn->hcon); + if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; + } else { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } + } - return 1; + return hci_conn_security(conn->hcon, l2cap_pi(sk)->sec_level, + auth_type); } static inline u8 l2cap_get_ident(struct l2cap_conn *conn) @@ -312,7 +333,10 @@ static void l2cap_do_start(struct sock *sk) struct l2cap_conn *conn = l2cap_pi(sk)->conn; if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (l2cap_check_link_mode(sk)) { + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) + return; + + if (l2cap_check_security(sk)) { struct l2cap_conn_req req; req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; @@ -356,7 +380,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) } if (sk->sk_state == BT_CONNECT) { - if (l2cap_check_link_mode(sk)) { + if (l2cap_check_security(sk)) { struct l2cap_conn_req req; req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; @@ -371,10 +395,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn) rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); - if (l2cap_check_link_mode(sk)) { - sk->sk_state = BT_CONFIG; - rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + if (l2cap_check_security(sk)) { + if (bt_sk(sk)->defer_setup) { + struct sock *parent = bt_sk(sk)->parent; + rsp.result = cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND); + parent->sk_data_ready(parent, 0); + + } else { + sk->sk_state = BT_CONFIG; + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + } } else { rsp.result = cpu_to_le16(L2CAP_CR_PEND); rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND); @@ -426,7 +458,7 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) read_lock(&l->lock); for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE) + if (l2cap_pi(sk)->force_reliable) sk->sk_err = err; } @@ -437,6 +469,7 @@ static void l2cap_info_timeout(unsigned long arg) { struct l2cap_conn *conn = (void *) arg; + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; l2cap_conn_start(conn); @@ -470,6 +503,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) spin_lock_init(&conn->lock); rwlock_init(&conn->chan_list.lock); + conn->disc_reason = 0x13; + return conn; } @@ -483,8 +518,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); - if (conn->rx_skb) - kfree_skb(conn->rx_skb); + kfree_skb(conn->rx_skb); /* Kill channels */ while ((sk = conn->chan_list.head)) { @@ -608,7 +642,6 @@ static void __l2cap_sock_close(struct sock *sk, int reason) case BT_CONNECTED: case BT_CONFIG: - case BT_CONNECT2: if (sk->sk_type == SOCK_SEQPACKET) { struct l2cap_conn *conn = l2cap_pi(sk)->conn; struct l2cap_disconn_req req; @@ -624,6 +657,27 @@ static void __l2cap_sock_close(struct sock *sk, int reason) l2cap_chan_del(sk, reason); break; + case BT_CONNECT2: + if (sk->sk_type == SOCK_SEQPACKET) { + struct l2cap_conn *conn = l2cap_pi(sk)->conn; + struct l2cap_conn_rsp rsp; + __u16 result; + + if (bt_sk(sk)->defer_setup) + result = L2CAP_CR_SEC_BLOCK; + else + result = L2CAP_CR_BAD_PSM; + + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + } else + l2cap_chan_del(sk, reason); + break; + case BT_CONNECT: case BT_DISCONN: l2cap_chan_del(sk, reason); @@ -653,13 +707,19 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) if (parent) { sk->sk_type = parent->sk_type; + bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup; + pi->imtu = l2cap_pi(parent)->imtu; pi->omtu = l2cap_pi(parent)->omtu; - pi->link_mode = l2cap_pi(parent)->link_mode; + pi->sec_level = l2cap_pi(parent)->sec_level; + pi->role_switch = l2cap_pi(parent)->role_switch; + pi->force_reliable = l2cap_pi(parent)->force_reliable; } else { pi->imtu = L2CAP_DEFAULT_MTU; pi->omtu = 0; - pi->link_mode = 0; + pi->sec_level = BT_SECURITY_LOW; + pi->role_switch = 0; + pi->force_reliable = 0; } /* Default config options */ @@ -723,17 +783,24 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol) return 0; } -static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { - struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; - int err = 0; + struct sockaddr_l2 la; + int len, err = 0; - BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm); + BT_DBG("sk %p", sk); if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -741,7 +808,7 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_ goto done; } - if (la->l2_psm && btohs(la->l2_psm) < 0x1001 && + if (la.l2_psm && btohs(la.l2_psm) < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) { err = -EACCES; goto done; @@ -749,14 +816,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_ write_lock_bh(&l2cap_sk_list.lock); - if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) { + if (la.l2_psm && __l2cap_get_sock_by_addr(la.l2_psm, &la.l2_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&bt_sk(sk)->src, &la->l2_bdaddr); - l2cap_pi(sk)->psm = la->l2_psm; - l2cap_pi(sk)->sport = la->l2_psm; + bacpy(&bt_sk(sk)->src, &la.l2_bdaddr); + l2cap_pi(sk)->psm = la.l2_psm; + l2cap_pi(sk)->sport = la.l2_psm; sk->sk_state = BT_BOUND; + + if (btohs(la.l2_psm) == 0x0001 || btohs(la.l2_psm) == 0x0003) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; } write_unlock_bh(&l2cap_sk_list.lock); @@ -776,7 +846,8 @@ static int l2cap_do_connect(struct sock *sk) __u8 auth_type; int err = 0; - BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm); + BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), + l2cap_pi(sk)->psm); if (!(hdev = hci_get_route(dst, src))) return -EHOSTUNREACH; @@ -785,21 +856,42 @@ static int l2cap_do_connect(struct sock *sk) err = -ENOMEM; - if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH || - l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT || - l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) { - if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) + if (sk->sk_type == SOCK_RAW) { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_DEDICATED_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_DEDICATED_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } + } else if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) auth_type = HCI_AT_NO_BONDING_MITM; else - auth_type = HCI_AT_GENERAL_BONDING_MITM; - } else { - if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) auth_type = HCI_AT_NO_BONDING; - else + + if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW) + l2cap_pi(sk)->sec_level = BT_SECURITY_SDP; + } else { + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } } - hcon = hci_connect(hdev, ACL_LINK, dst, auth_type); + hcon = hci_connect(hdev, ACL_LINK, dst, + l2cap_pi(sk)->sec_level, auth_type); if (!hcon) goto done; @@ -835,20 +927,25 @@ done: static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; - int err = 0; - - lock_sock(sk); + struct sockaddr_l2 la; + int len, err = 0; BT_DBG("sk %p", sk); - if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) { - err = -EINVAL; - goto done; - } + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid) + return -EINVAL; + + lock_sock(sk); - if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) { + if (sk->sk_type == SOCK_SEQPACKET && !la.l2_psm) { err = -EINVAL; goto done; } @@ -875,8 +972,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al } /* Set destination address and psm */ - bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr); - l2cap_pi(sk)->psm = la->l2_psm; + bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr); + l2cap_pi(sk)->psm = la.l2_psm; if ((err = l2cap_do_connect(sk))) goto done; @@ -1000,12 +1097,16 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); - if (peer) + if (peer) { + la->l2_psm = l2cap_pi(sk)->psm; bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst); - else + la->l2_cid = htobs(l2cap_pi(sk)->dcid); + } else { + la->l2_psm = l2cap_pi(sk)->sport; bacpy(&la->l2_bdaddr, &bt_sk(sk)->src); + la->l2_cid = htobs(l2cap_pi(sk)->scid); + } - la->l2_psm = l2cap_pi(sk)->psm; return 0; } @@ -1106,11 +1207,38 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms return err; } -static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) { + struct l2cap_conn_rsp rsp; + + sk->sk_state = BT_CONFIG; + + rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + release_sock(sk); + return 0; + } + + release_sock(sk); + + return bt_sock_recvmsg(iocb, sock, msg, len, flags); +} + +static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen) { struct sock *sk = sock->sk; struct l2cap_options opts; - int err = 0, len; + int len, err = 0; u32 opt; BT_DBG("sk %p", sk); @@ -1140,7 +1268,15 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch break; } - l2cap_pi(sk)->link_mode = opt; + if (opt & L2CAP_LM_AUTH) + l2cap_pi(sk)->sec_level = BT_SECURITY_LOW; + if (opt & L2CAP_LM_ENCRYPT) + l2cap_pi(sk)->sec_level = BT_SECURITY_MEDIUM; + if (opt & L2CAP_LM_SECURE) + l2cap_pi(sk)->sec_level = BT_SECURITY_HIGH; + + l2cap_pi(sk)->role_switch = (opt & L2CAP_LM_MASTER); + l2cap_pi(sk)->force_reliable = (opt & L2CAP_LM_RELIABLE); break; default: @@ -1152,12 +1288,77 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch return err; } -static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level < BT_SECURITY_LOW || + sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + l2cap_pi(sk)->sec_level = sec.level; + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2cap_options opts; struct l2cap_conninfo cinfo; int len, err = 0; + u32 opt; BT_DBG("sk %p", sk); @@ -1180,12 +1381,36 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch break; case L2CAP_LM: - if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval)) + switch (l2cap_pi(sk)->sec_level) { + case BT_SECURITY_LOW: + opt = L2CAP_LM_AUTH; + break; + case BT_SECURITY_MEDIUM: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | + L2CAP_LM_SECURE; + break; + default: + opt = 0; + break; + } + + if (l2cap_pi(sk)->role_switch) + opt |= L2CAP_LM_MASTER; + + if (l2cap_pi(sk)->force_reliable) + opt |= L2CAP_LM_RELIABLE; + + if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case L2CAP_CONNINFO: - if (sk->sk_state != BT_CONNECTED) { + if (sk->sk_state != BT_CONNECTED && + !(sk->sk_state == BT_CONNECT2 && + bt_sk(sk)->defer_setup)) { err = -ENOTCONN; break; } @@ -1208,6 +1433,60 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch return err; } +static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) { + err = -EINVAL; + break; + } + + sec.level = l2cap_pi(sk)->sec_level; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int l2cap_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; @@ -1270,11 +1549,6 @@ static void l2cap_chan_ready(struct sock *sk) */ parent->sk_data_ready(parent, 0); } - - if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) { - struct l2cap_conn *conn = l2cap_pi(sk)->conn; - hci_conn_change_link_key(conn->hcon); - } } /* Copy frame to all raw sockets on that connection */ @@ -1549,8 +1823,11 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && cmd->ident == conn->info_ident) { - conn->info_ident = 0; del_timer(&conn->info_timer); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + l2cap_conn_start(conn); } @@ -1580,6 +1857,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd /* Check if the ACL is secure enough (if not SDP) */ if (psm != cpu_to_le16(0x0001) && !hci_conn_check_link_mode(conn->hcon)) { + conn->disc_reason = 0x05; result = L2CAP_CR_SEC_BLOCK; goto response; } @@ -1621,11 +1899,18 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd l2cap_pi(sk)->ident = cmd->ident; - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (l2cap_check_link_mode(sk)) { - sk->sk_state = BT_CONFIG; - result = L2CAP_CR_SUCCESS; - status = L2CAP_CS_NO_INFO; + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) { + if (l2cap_check_security(sk)) { + if (bt_sk(sk)->defer_setup) { + sk->sk_state = BT_CONNECT2; + result = L2CAP_CR_PEND; + status = L2CAP_CS_AUTHOR_PEND; + parent->sk_data_ready(parent, 0); + } else { + sk->sk_state = BT_CONFIG; + result = L2CAP_CR_SUCCESS; + status = L2CAP_CS_NO_INFO; + } } else { sk->sk_state = BT_CONNECT2; result = L2CAP_CR_PEND; @@ -1695,11 +1980,14 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd l2cap_pi(sk)->dcid = dcid; l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; + l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND; + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, l2cap_build_conf_req(sk, req), req); break; case L2CAP_CR_PEND: + l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND; break; default: @@ -1908,6 +2196,14 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data); l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf), buf); + } else if (type == L2CAP_IT_FIXED_CHAN) { + u8 buf[12]; + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf; + rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); + memcpy(buf + 4, l2cap_fixed_chan, 8); + l2cap_send_cmd(conn, cmd->ident, + L2CAP_INFO_RSP, sizeof(buf), buf); } else { struct l2cap_info_rsp rsp; rsp.type = cpu_to_le16(type); @@ -1929,14 +2225,31 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm BT_DBG("type 0x%4.4x result 0x%2.2x", type, result); - conn->info_ident = 0; - del_timer(&conn->info_timer); - if (type == L2CAP_IT_FEAT_MASK) + if (type == L2CAP_IT_FEAT_MASK) { conn->feat_mask = get_unaligned_le32(rsp->data); - l2cap_conn_start(conn); + if (conn->feat_mask & 0x0080) { + struct l2cap_info_req req; + req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + + conn->info_ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, conn->info_ident, + L2CAP_INFO_REQ, sizeof(req), &req); + } else { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } + } else if (type == L2CAP_IT_FIXED_CHAN) { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } return 0; } @@ -2143,10 +2456,15 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) continue; if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) { - lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); + lm1 |= HCI_LM_ACCEPT; + if (l2cap_pi(sk)->role_switch) + lm1 |= HCI_LM_MASTER; exact++; - } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) - lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); + } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm2 |= HCI_LM_ACCEPT; + if (l2cap_pi(sk)->role_switch) + lm2 |= HCI_LM_MASTER; + } } read_unlock(&l2cap_sk_list.lock); @@ -2172,89 +2490,48 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status) return 0; } -static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason) +static int l2cap_disconn_ind(struct hci_conn *hcon) { - BT_DBG("hcon %p reason %d", hcon, reason); + struct l2cap_conn *conn = hcon->l2cap_data; - if (hcon->type != ACL_LINK) - return 0; + BT_DBG("hcon %p", hcon); - l2cap_conn_del(hcon, bt_err(reason)); + if (hcon->type != ACL_LINK || !conn) + return 0x13; - return 0; + return conn->disc_reason; } -static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) +static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { - struct l2cap_chan_list *l; - struct l2cap_conn *conn = hcon->l2cap_data; - struct sock *sk; + BT_DBG("hcon %p reason %d", hcon, reason); - if (!conn) + if (hcon->type != ACL_LINK) return 0; - l = &conn->chan_list; - - BT_DBG("conn %p", conn); - - read_lock(&l->lock); - - for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - struct l2cap_pinfo *pi = l2cap_pi(sk); - - bh_lock_sock(sk); - - if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && - !(hcon->link_mode & HCI_LM_ENCRYPT) && - !status) { - bh_unlock_sock(sk); - continue; - } - - if (sk->sk_state == BT_CONNECT) { - if (!status) { - struct l2cap_conn_req req; - req.scid = cpu_to_le16(l2cap_pi(sk)->scid); - req.psm = l2cap_pi(sk)->psm; - - l2cap_pi(sk)->ident = l2cap_get_ident(conn); - - l2cap_send_cmd(conn, l2cap_pi(sk)->ident, - L2CAP_CONN_REQ, sizeof(req), &req); - } else { - l2cap_sock_clear_timer(sk); - l2cap_sock_set_timer(sk, HZ / 10); - } - } else if (sk->sk_state == BT_CONNECT2) { - struct l2cap_conn_rsp rsp; - __u16 result; + l2cap_conn_del(hcon, bt_err(reason)); - if (!status) { - sk->sk_state = BT_CONFIG; - result = L2CAP_CR_SUCCESS; - } else { - sk->sk_state = BT_DISCONN; - l2cap_sock_set_timer(sk, HZ / 10); - result = L2CAP_CR_SEC_BLOCK; - } + return 0; +} - rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); - rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); - rsp.result = cpu_to_le16(result); - rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); - l2cap_send_cmd(conn, l2cap_pi(sk)->ident, - L2CAP_CONN_RSP, sizeof(rsp), &rsp); - } +static inline void l2cap_check_encryption(struct sock *sk, u8 encrypt) +{ + if (sk->sk_type != SOCK_SEQPACKET) + return; - bh_unlock_sock(sk); + if (encrypt == 0x00) { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) { + l2cap_sock_clear_timer(sk); + l2cap_sock_set_timer(sk, HZ * 5); + } else if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH) + __l2cap_sock_close(sk, ECONNREFUSED); + } else { + if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) + l2cap_sock_clear_timer(sk); } - - read_unlock(&l->lock); - - return 0; } -static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) +static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) { struct l2cap_chan_list *l; struct l2cap_conn *conn = hcon->l2cap_data; @@ -2270,15 +2547,16 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) read_lock(&l->lock); for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { - struct l2cap_pinfo *pi = l2cap_pi(sk); - bh_lock_sock(sk); - if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && - (sk->sk_state == BT_CONNECTED || - sk->sk_state == BT_CONFIG) && - !status && encrypt == 0x00) { - __l2cap_sock_close(sk, ECONNREFUSED); + if (l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND) { + bh_unlock_sock(sk); + continue; + } + + if (!status && (sk->sk_state == BT_CONNECTED || + sk->sk_state == BT_CONFIG)) { + l2cap_check_encryption(sk, encrypt); bh_unlock_sock(sk); continue; } @@ -2376,7 +2654,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl goto drop; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); + skb->len); conn->rx_len = len - skb->len; } else { BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); @@ -2398,7 +2676,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), - skb->len); + skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { @@ -2424,10 +2702,10 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) sk_for_each(sk, node, &l2cap_sk_list.head) { struct l2cap_pinfo *pi = l2cap_pi(sk); - str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n", batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid, - pi->imtu, pi->omtu, pi->link_mode); + pi->imtu, pi->omtu, pi->sec_level); } read_unlock_bh(&l2cap_sk_list.lock); @@ -2447,7 +2725,7 @@ static const struct proto_ops l2cap_sock_ops = { .accept = l2cap_sock_accept, .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, - .recvmsg = bt_sock_recvmsg, + .recvmsg = l2cap_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, @@ -2469,8 +2747,8 @@ static struct hci_proto l2cap_hci_proto = { .connect_ind = l2cap_connect_ind, .connect_cfm = l2cap_connect_cfm, .disconn_ind = l2cap_disconn_ind, - .auth_cfm = l2cap_auth_cfm, - .encrypt_cfm = l2cap_encrypt_cfm, + .disconn_cfm = l2cap_disconn_cfm, + .security_cfm = l2cap_security_cfm, .recv_acldata = l2cap_recv_acldata }; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index acd84fd524b8..1d0fb0f23c63 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -46,7 +46,7 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> -#define VERSION "1.10" +#define VERSION "1.11" static int disable_cfc = 0; static int channel_mtu = -1; @@ -223,19 +223,25 @@ static int rfcomm_l2sock_create(struct socket **sock) return err; } -static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d) +static inline int rfcomm_check_security(struct rfcomm_dlc *d) { struct sock *sk = d->session->sock->sk; + __u8 auth_type; - if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) { - if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon)) - return 1; - } else if (d->link_mode & RFCOMM_LM_AUTH) { - if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon)) - return 1; + switch (d->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; } - return 0; + return hci_conn_security(l2cap_pi(sk)->conn->hcon, d->sec_level, + auth_type); } /* ---- RFCOMM DLCs ---- */ @@ -388,10 +394,10 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; if (s->state == BT_CONNECTED) { - if (rfcomm_check_link_mode(d)) - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - else + if (rfcomm_check_security(d)) rfcomm_send_pn(s, 1, d); + else + set_bit(RFCOMM_AUTH_PENDING, &d->flags); } rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); @@ -421,9 +427,16 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) d, d->state, d->dlci, err, s); switch (d->state) { - case BT_CONNECTED: - case BT_CONFIG: case BT_CONNECT: + case BT_CONFIG: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(RFCOMM_SCHED_AUTH); + break; + } + /* Fall through */ + + case BT_CONNECTED: d->state = BT_DISCONN; if (skb_queue_empty(&d->tx_queue)) { rfcomm_send_disc(s, d->dlci); @@ -434,6 +447,15 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) } break; + case BT_OPEN: + case BT_CONNECT2: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(RFCOMM_SCHED_AUTH); + break; + } + /* Fall through */ + default: rfcomm_dlc_clear_timer(d); @@ -636,6 +658,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst bacpy(&addr.l2_bdaddr, src); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = 0; + addr.l2_cid = 0; *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (*err < 0) goto failed; @@ -657,6 +680,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst bacpy(&addr.l2_bdaddr, dst); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = htobs(RFCOMM_PSM); + addr.l2_cid = 0; *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; @@ -1162,7 +1186,7 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) return 0; } -static void rfcomm_dlc_accept(struct rfcomm_dlc *d) +void rfcomm_dlc_accept(struct rfcomm_dlc *d) { struct sock *sk = d->session->sock->sk; @@ -1175,12 +1199,31 @@ static void rfcomm_dlc_accept(struct rfcomm_dlc *d) d->state_change(d, 0); rfcomm_dlc_unlock(d); - if (d->link_mode & RFCOMM_LM_MASTER) + if (d->role_switch) hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00); rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); } +static void rfcomm_check_accept(struct rfcomm_dlc *d) +{ + if (rfcomm_check_security(d)) { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); + } else { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + } +} + static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) { struct rfcomm_dlc *d; @@ -1203,11 +1246,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) if (d) { if (d->state == BT_OPEN) { /* DLC was previously opened by PN request */ - if (rfcomm_check_link_mode(d)) { - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_dlc_accept(d); + rfcomm_check_accept(d); } return 0; } @@ -1219,11 +1258,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) d->addr = __addr(s->initiator, dlci); rfcomm_dlc_link(s, d); - if (rfcomm_check_link_mode(d)) { - set_bit(RFCOMM_AUTH_PENDING, &d->flags); - rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_dlc_accept(d); + rfcomm_check_accept(d); } else { rfcomm_send_dm(s, dlci); } @@ -1637,11 +1672,12 @@ static void rfcomm_process_connect(struct rfcomm_session *s) d = list_entry(p, struct rfcomm_dlc, list); if (d->state == BT_CONFIG) { d->mtu = s->mtu; - if (rfcomm_check_link_mode(d)) { + if (rfcomm_check_security(d)) { + rfcomm_send_pn(s, 1, d); + } else { set_bit(RFCOMM_AUTH_PENDING, &d->flags); rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); - } else - rfcomm_send_pn(s, 1, d); + } } } } @@ -1717,11 +1753,17 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s) if (d->out) { rfcomm_send_pn(s, 1, d); rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); - } else - rfcomm_dlc_accept(d); - if (d->link_mode & RFCOMM_LM_SECURE) { - struct sock *sk = s->sock->sk; - hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon); + } else { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); } continue; } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { @@ -1734,6 +1776,9 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s) continue; } + if (test_bit(RFCOMM_SEC_PENDING, &d->flags)) + continue; + if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) continue; @@ -1876,6 +1921,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) bacpy(&addr.l2_bdaddr, ba); addr.l2_family = AF_BLUETOOTH; addr.l2_psm = htobs(RFCOMM_PSM); + addr.l2_cid = 0; err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); @@ -1947,42 +1993,7 @@ static int rfcomm_run(void *unused) return 0; } -static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status) -{ - struct rfcomm_session *s; - struct rfcomm_dlc *d; - struct list_head *p, *n; - - BT_DBG("conn %p status 0x%02x", conn, status); - - s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); - if (!s) - return; - - rfcomm_session_hold(s); - - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); - - if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && - !(conn->link_mode & HCI_LM_ENCRYPT) && !status) - continue; - - if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) - continue; - - if (!status) - set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); - else - set_bit(RFCOMM_AUTH_REJECT, &d->flags); - } - - rfcomm_session_put(s); - - rfcomm_schedule(RFCOMM_SCHED_AUTH); -} - -static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) +static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; struct rfcomm_dlc *d; @@ -1999,18 +2010,29 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) list_for_each_safe(p, n, &s->dlcs) { d = list_entry(p, struct rfcomm_dlc, list); - if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && - (d->state == BT_CONNECTED || - d->state == BT_CONFIG) && - !status && encrypt == 0x00) { - __rfcomm_dlc_close(d, ECONNREFUSED); - continue; + if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) { + rfcomm_dlc_clear_timer(d); + if (status || encrypt == 0x00) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + } + + if (d->state == BT_CONNECTED && !status && encrypt == 0x00) { + if (d->sec_level == BT_SECURITY_MEDIUM) { + set_bit(RFCOMM_SEC_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + continue; + } else if (d->sec_level == BT_SECURITY_HIGH) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } } if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) continue; - if (!status && encrypt) + if (!status) set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); else set_bit(RFCOMM_AUTH_REJECT, &d->flags); @@ -2023,8 +2045,7 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt) static struct hci_cb rfcomm_cb = { .name = "RFCOMM", - .auth_cfm = rfcomm_auth_cfm, - .encrypt_cfm = rfcomm_encrypt_cfm + .security_cfm = rfcomm_security_cfm }; static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index d3fc6fca38d0..7f482784e9f7 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -261,12 +261,19 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent) if (parent) { sk->sk_type = parent->sk_type; - pi->link_mode = rfcomm_pi(parent)->link_mode; + pi->dlc->defer_setup = bt_sk(parent)->defer_setup; + + pi->sec_level = rfcomm_pi(parent)->sec_level; + pi->role_switch = rfcomm_pi(parent)->role_switch; } else { - pi->link_mode = 0; + pi->dlc->defer_setup = 0; + + pi->sec_level = BT_SECURITY_LOW; + pi->role_switch = 0; } - pi->dlc->link_mode = pi->link_mode; + pi->dlc->sec_level = pi->sec_level; + pi->dlc->role_switch = pi->role_switch; } static struct proto rfcomm_proto = { @@ -406,7 +413,8 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr); rfcomm_pi(sk)->channel = sa->rc_channel; - d->link_mode = rfcomm_pi(sk)->link_mode; + d->sec_level = rfcomm_pi(sk)->sec_level; + d->role_switch = rfcomm_pi(sk)->role_switch; err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel); if (!err) @@ -554,6 +562,9 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; int sent = 0; + if (test_bit(RFCOMM_DEFER_SETUP, &d->flags)) + return -ENOTCONN; + if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -570,8 +581,11 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) + if (!skb) { + if (sent == 0) + sent = err; break; + } skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -630,10 +644,16 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; int err = 0; size_t target, copied = 0; long timeo; + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + rfcomm_dlc_accept(d); + return 0; + } + if (flags & MSG_OOB) return -EOPNOTSUPP; @@ -710,7 +730,7 @@ out: return copied ? : err; } -static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen) { struct sock *sk = sock->sk; int err = 0; @@ -727,7 +747,14 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c break; } - rfcomm_pi(sk)->link_mode = opt; + if (opt & RFCOMM_LM_AUTH) + rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW; + if (opt & RFCOMM_LM_ENCRYPT) + rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM; + if (opt & RFCOMM_LM_SECURE) + rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH; + + rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER); break; default: @@ -739,12 +766,76 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c return err; } -static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + rfcomm_pi(sk)->sec_level = sec.level; + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sock *l2cap_sk; struct rfcomm_conninfo cinfo; int len, err = 0; + u32 opt; BT_DBG("sk %p", sk); @@ -755,12 +846,32 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c switch (optname) { case RFCOMM_LM: - if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval)) + switch (rfcomm_pi(sk)->sec_level) { + case BT_SECURITY_LOW: + opt = RFCOMM_LM_AUTH; + break; + case BT_SECURITY_MEDIUM: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | + RFCOMM_LM_SECURE; + break; + default: + opt = 0; + break; + } + + if (rfcomm_pi(sk)->role_switch) + opt |= RFCOMM_LM_MASTER; + + if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case RFCOMM_CONNINFO: - if (sk->sk_state != BT_CONNECTED) { + if (sk->sk_state != BT_CONNECTED && + !rfcomm_pi(sk)->dlc->defer_setup) { err = -ENOTCONN; break; } @@ -785,6 +896,60 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c return err; } +static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = rfcomm_pi(sk)->sec_level; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk __maybe_unused = sock->sk; @@ -888,6 +1053,10 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * done: bh_unlock_sock(parent); + + if (bt_sk(parent)->defer_setup) + parent->sk_state_change(parent); + return result; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 46fd8bf9a690..51ae0c3e470a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -195,7 +195,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING); + hcon = hci_connect(hdev, type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); if (!hcon) goto done; @@ -668,7 +668,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char return err; } -static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; @@ -723,6 +723,31 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char return err; } +static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_SCO) + return sco_sock_getsockopt_old(sock, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static int sco_sock_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -832,10 +857,30 @@ done: /* ----- SCO interface with lower layer (HCI) ----- */ static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) { + register struct sock *sk; + struct hlist_node *node; + int lm = 0; + + if (type != SCO_LINK && type != ESCO_LINK) + return 0; + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); - /* Always accept connection */ - return HCI_LM_ACCEPT; + /* Find listening sockets */ + read_lock(&sco_sk_list.lock); + sk_for_each(sk, node, &sco_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) || + !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm |= HCI_LM_ACCEPT; + break; + } + } + read_unlock(&sco_sk_list.lock); + + return lm; } static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) @@ -857,7 +902,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) return 0; } -static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason) +static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { BT_DBG("hcon %p reason %d", hcon, reason); @@ -940,7 +985,7 @@ static struct hci_proto sco_hci_proto = { .id = HCI_PROTO_SCO, .connect_ind = sco_connect_ind, .connect_cfm = sco_connect_cfm, - .disconn_ind = sco_disconn_ind, + .disconn_cfm = sco_disconn_cfm, .recv_scodata = sco_recv_scodata }; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index cf754ace0b75..3953ac4214c8 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -107,7 +107,7 @@ static void fake_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_ops fake_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, .entries = ATOMIC_INIT(0), }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ba7be195803c..fcffb3fb1177 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index d44cbf8c374a..a94f3cc377c0 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -214,7 +214,7 @@ static struct xt_target ebt_log_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static const struct nf_logger ebt_log_logger = { +static struct nf_logger ebt_log_logger __read_mostly = { .name = "ebt_log", .logfn = &ebt_log_packet, .me = THIS_MODULE, diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 2c6d6823e703..133eeae45a4f 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -279,21 +279,21 @@ static struct xt_target ebt_ulog_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static const struct nf_logger ebt_ulog_logger = { - .name = "ulog", +static struct nf_logger ebt_ulog_logger __read_mostly = { + .name = "ebt_ulog", .logfn = &ebt_log_packet, .me = THIS_MODULE, }; static int __init ebt_ulog_init(void) { - bool ret = true; + int ret; int i; if (nlbufsiz >= 128*1024) { printk(KERN_NOTICE "ebt_ulog: Netlink buffer has to be <= 128kB," " please try a smaller nlbufsiz parameter.\n"); - return false; + return -EINVAL; } /* initialize ulog_buffers */ @@ -308,12 +308,12 @@ static int __init ebt_ulog_init(void) if (!ebtulognl) { printk(KERN_WARNING KBUILD_MODNAME ": out of memory trying to " "call netlink_kernel_create\n"); - ret = false; - } else if (xt_register_target(&ebt_ulog_tg_reg) != 0) { + ret = -ENOMEM; + } else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) { netlink_kernel_release(ebtulognl); } - if (ret) + if (ret == 0) nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); return ret; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 8604dfc1fc3b..c751111440f8 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -46,7 +46,6 @@ static struct ebt_table broute_table = .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, - .lock = __RW_LOCK_UNLOCKED(broute_table.lock), .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 2b2e8040a9c6..a5eea72938a6 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -55,7 +55,6 @@ static struct ebt_table frame_filter = .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(frame_filter.lock), .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 3fe1ae87e35f..6024c551f9a9 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -55,7 +55,6 @@ static struct ebt_table frame_nat = .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(frame_nat.lock), .check = check, .me = THIS_MODULE, }; diff --git a/net/can/af_can.c b/net/can/af_can.c index fa417ca6cbe6..547bafc79e28 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -273,8 +273,7 @@ int can_send(struct sk_buff *skb, int loop) err = net_xmit_errno(err); if (err) { - if (newskb) - kfree_skb(newskb); + kfree_skb(newskb); return err; } @@ -828,7 +827,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, */ static struct packet_type can_packet __read_mostly = { - .type = __constant_htons(ETH_P_CAN), + .type = cpu_to_be16(ETH_P_CAN), .dev = NULL, .func = can_rcv, }; diff --git a/net/can/raw.c b/net/can/raw.c index 0703cba4bf9f..6aa154e806ae 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -648,6 +648,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err < 0) goto free_skb; + err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + if (err < 0) + goto free_skb; skb->dev = dev; skb->sk = sk; diff --git a/net/compat.c b/net/compat.c index a3a2ba0fac08..8d739053afe4 100644 --- a/net/compat.c +++ b/net/compat.c @@ -216,7 +216,7 @@ Efault: int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data) { struct compat_timeval ctv; - struct compat_timespec cts; + struct compat_timespec cts[3]; struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; struct compat_cmsghdr cmhdr; int cmlen; @@ -233,12 +233,17 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat data = &ctv; len = sizeof(ctv); } - if (level == SOL_SOCKET && type == SCM_TIMESTAMPNS) { + if (level == SOL_SOCKET && + (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) { + int count = type == SCM_TIMESTAMPNS ? 1 : 3; + int i; struct timespec *ts = (struct timespec *)data; - cts.tv_sec = ts->tv_sec; - cts.tv_nsec = ts->tv_nsec; + for (i = 0; i < count; i++) { + cts[i].tv_sec = ts[i].tv_sec; + cts[i].tv_nsec = ts[i].tv_nsec; + } data = &cts; - len = sizeof(cts); + len = sizeof(cts[0]) * count; } cmlen = CMSG_COMPAT_LEN(len); @@ -455,7 +460,7 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); tv = ktime_to_timeval(sk->sk_stamp); if (tv.tv_sec == -1) return err; @@ -479,7 +484,7 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta struct timespec ts; if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec(sk->sk_stamp); if (ts.tv_sec == -1) return err; diff --git a/net/core/Makefile b/net/core/Makefile index 26a37cb31923..796f46eece5f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -17,3 +17,6 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o +obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o + diff --git a/net/core/datagram.c b/net/core/datagram.c index 5e2ac0c4b07c..d0de644b378d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { - kfree_skb(skb); + consume_skb(skb); sk_mem_reclaim_partial(sk); } diff --git a/net/core/dev.c b/net/core/dev.c index e3fe5c705606..052dd478d3e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1668,8 +1668,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; - prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); @@ -1681,13 +1681,27 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return ops->ndo_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); + /* + * TODO: if skb_orphan() was called by + * dev->hard_start_xmit() (for example, the unmodified + * igb driver does that; bnx2 doesn't), then + * skb_tx_software_timestamp() will be unable to send + * back the time stamp. + * + * How can this be prevented? Always create another + * reference to the socket before calling + * dev->hard_start_xmit()? Prevent that skb_orphan() + * does anything in dev->hard_start_xmit() by clearing + * the skb destructor before the call and restoring it + * afterwards, then doing the skb_orphan() ourselves? + */ + return rc; } gso: do { struct sk_buff *nskb = skb->next; - int rc; skb->next = nskb->next; nskb->next = NULL; @@ -1708,59 +1722,24 @@ out_kfree_skb: return 0; } -static u32 simple_tx_hashrnd; -static int simple_tx_hashrnd_initialized = 0; +static u32 skb_tx_hashrnd; -static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { - u32 addr1, addr2, ports; - u32 hash, ihl; - u8 ip_proto = 0; - - if (unlikely(!simple_tx_hashrnd_initialized)) { - get_random_bytes(&simple_tx_hashrnd, 4); - simple_tx_hashrnd_initialized = 1; - } - - switch (skb->protocol) { - case htons(ETH_P_IP): - if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) - ip_proto = ip_hdr(skb)->protocol; - addr1 = ip_hdr(skb)->saddr; - addr2 = ip_hdr(skb)->daddr; - ihl = ip_hdr(skb)->ihl; - break; - case htons(ETH_P_IPV6): - ip_proto = ipv6_hdr(skb)->nexthdr; - addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; - addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; - ihl = (40 >> 2); - break; - default: - return 0; - } - + u32 hash; - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); - break; - - default: - ports = 0; - break; - } + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + } else if (skb->sk && skb->sk->sk_hash) { + hash = skb->sk->sk_hash; + } else + hash = skb->protocol; - hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); + hash = jhash_1word(hash, skb_tx_hashrnd); return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } +EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) @@ -1771,7 +1750,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb); else if (dev->real_num_tx_queues > 1) - queue_index = simple_tx_hash(dev, skb); + queue_index = skb_tx_hash(dev, skb); skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); @@ -2297,6 +2276,8 @@ ncls: if (!skb) goto out; + skb_orphan(skb); + type = skb->protocol; list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { @@ -2366,7 +2347,6 @@ static int napi_gro_complete(struct sk_buff *skb) out: skb_shinfo(skb)->gso_size = 0; - __skb_push(skb, -skb_network_offset(skb)); return netif_receive_skb(skb); } @@ -2380,20 +2360,40 @@ void napi_gro_flush(struct napi_struct *napi) napi_gro_complete(skb); } + napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); +void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) +{ + unsigned int offset = skb_gro_offset(skb); + + hlen += offset; + if (hlen <= skb_headlen(skb)) + return skb->data + offset; + + if (unlikely(!skb_shinfo(skb)->nr_frags || + skb_shinfo(skb)->frags[0].size <= + hlen - skb_headlen(skb) || + PageHighMem(skb_shinfo(skb)->frags[0].page))) + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + + return page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset + + offset - skb_headlen(skb); +} +EXPORT_SYMBOL(skb_gro_header); + int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; __be16 type = skb->protocol; struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; - int count = 0; int same_flow; int mac_len; - int free; + int ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2403,30 +2403,16 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - struct sk_buff *p; - if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; - skb_reset_network_header(skb); + skb_set_network_header(skb, skb_gro_offset(skb)); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - for (p = napi->gro_list; p; p = p->next) { - count++; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len)) - NAPI_GRO_CB(p)->same_flow = 0; - } - pp = ptype->gro_receive(&napi->gro_list, skb); break; } @@ -2436,7 +2422,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) goto normal; same_flow = NAPI_GRO_CB(skb)->same_flow; - free = NAPI_GRO_CB(skb)->free; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { struct sk_buff *nskb = *pp; @@ -2444,27 +2430,35 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) *pp = nskb->next; nskb->next = NULL; napi_gro_complete(nskb); - count--; + napi->gro_count--; } if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { - __skb_push(skb, -skb_network_offset(skb)); + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) goto normal; - } + napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = skb->len; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; + ret = GRO_HELD; + +pull: + if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { + if (napi->gro_list == skb) + napi->gro_list = skb->next; + ret = GRO_DROP; + } ok: - return free; + return ret; normal: - return -1; + ret = GRO_NORMAL; + goto pull; } EXPORT_SYMBOL(dev_gro_receive); @@ -2472,29 +2466,44 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + if (netpoll_rx_on(skb)) + return GRO_NORMAL; + for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->same_flow = !compare_ether_header( + skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } return dev_gro_receive(napi, skb); } -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +int napi_skb_finish(int ret, struct sk_buff *skb) { - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; + int err = NET_RX_SUCCESS; - switch (__napi_gro_receive(napi, skb)) { - case -1: + switch (ret) { + case GRO_NORMAL: return netif_receive_skb(skb); - case 1: + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ + + case GRO_MERGED_FREE: kfree_skb(skb); break; } - return NET_RX_SUCCESS; + return err; +} +EXPORT_SYMBOL(napi_skb_finish); + +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + skb_gro_reset_offset(skb); + + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2512,6 +2521,9 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + skb_frag_t *frag; + int i; napi->skb = NULL; @@ -2524,20 +2536,36 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, } BUG_ON(info->nr_frags > MAX_SKB_FRAGS); + frag = &info->frags[info->nr_frags - 1]; + + for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { + skb_fill_page_desc(skb, i, frag->page, frag->page_offset, + frag->size); + frag++; + } skb_shinfo(skb)->nr_frags = info->nr_frags; - memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags)); skb->data_len = info->len; skb->len += info->len; skb->truesize += info->len; - if (!pskb_may_pull(skb, ETH_HLEN)) { + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { napi_reuse_skb(napi, skb); skb = NULL; goto out; } - skb->protocol = eth_type_trans(skb, dev); + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. + */ + skb->protocol = eth->h_proto; skb->ip_summed = info->ip_summed; skb->csum = info->csum; @@ -2547,32 +2575,43 @@ out: } EXPORT_SYMBOL(napi_fraginfo_skb); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; + int err = NET_RX_SUCCESS; - if (!skb) - goto out; + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + skb->protocol = eth_type_trans(skb, napi->dev); - if (netpoll_receive_skb(skb)) - goto out; + if (ret == GRO_NORMAL) + return netif_receive_skb(skb); - err = NET_RX_SUCCESS; + skb_gro_pull(skb, -ETH_HLEN); + break; - switch (__napi_gro_receive(napi, skb)) { - case -1: - return netif_receive_skb(skb); + case GRO_DROP: + err = NET_RX_DROP; + /* fall through */ - case 0: - goto out; + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; } - napi_reuse_skb(napi, skb); - -out: return err; } +EXPORT_SYMBOL(napi_frags_finish); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + + if (!skb) + return NET_RX_DROP; + + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +} EXPORT_SYMBOL(napi_gro_frags); static int process_backlog(struct napi_struct *napi, int quota) @@ -2653,6 +2692,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; @@ -2681,6 +2721,7 @@ void netif_napi_del(struct napi_struct *napi) } napi->gro_list = NULL; + napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -3949,6 +3990,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) cmd == SIOCSMIIREG || cmd == SIOCBRADDIF || cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || cmd == SIOCWANDEV) { err = -EOPNOTSUPP; if (ops->ndo_do_ioctl) { @@ -4103,6 +4145,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCBONDCHANGEACTIVE: case SIOCBRADDIF: case SIOCBRDELIF: + case SIOCSHWTSTAMP: if (!capable(CAP_NET_ADMIN)) return -EPERM; /* fall through */ @@ -5199,6 +5242,7 @@ static int __init net_dev_init(void) queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; queue->backlog.gro_list = NULL; + queue->backlog.gro_count = 0; } dev_boot_phase = 0; @@ -5231,6 +5275,14 @@ out: subsys_initcall(net_dev_init); +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); + EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 000000000000..9fd0dc3cca99 --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,263 @@ +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <linux/percpu.h> +#include <linux/timer.h> +#include <linux/bitops.h> +#include <net/genetlink.h> + +#include <trace/skb.h> + +#include <asm/unaligned.h> + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +static void send_dm_alert(struct work_struct *unused); + + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +struct sock *dm_sock; + +struct per_cpu_dm_data { + struct work_struct dm_alert_work; + struct sk_buff *skb; + atomic_t dm_hit_count; + struct timer_list send_timer; +}; + +static struct genl_family net_drop_monitor_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "NET_DM", + .version = 1, + .maxattr = NET_DM_CMD_MAX, +}; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; + + +static void reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + data->skb = genlmsg_new(al, GFP_KERNEL); + genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg)); + memset(msg, 0, al); + atomic_set(&data->dm_hit_count, dm_hit_limit); +} + +static void send_dm_alert(struct work_struct *unused) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + /* + * Grab the skb we're about to send + */ + skb = data->skb; + + /* + * Replace it with a new one + */ + reset_per_cpu_data(data); + + /* + * Ship it! + */ + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. Note that it operates under the timer interrupt + * so we don't need to disable preemption here + */ +static void sched_send_work(unsigned long unused) +{ + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + schedule_work(&data->dm_alert_work); +} + +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct nlmsghdr *nlh; + int i; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + + if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { + /* + * we're already at zero, discard this hit + */ + goto out; + } + + nlh = (struct nlmsghdr *)data->skb->data; + msg = genlmsg_data(nlmsg_data(nlh)); + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { + msg->points[i].count++; + goto out; + } + } + + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer_on(&data->send_timer, smp_processor_id()); + } + +out: + return; +} + +static int set_all_monitor_traces(int state) +{ + int rc = 0; + + switch (state) { + case TRACE_ON: + rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + break; + case TRACE_OFF: + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + + tracepoint_synchronize_unregister(); + break; + default: + rc = 1; + break; + } + + if (rc) + return -EINPROGRESS; + return rc; +} + + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return set_all_monitor_traces(TRACE_ON); + break; + case NET_DM_CMD_STOP: + return set_all_monitor_traces(TRACE_OFF); + break; + } + + return -ENOTSUPP; +} + + +static struct genl_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .doit = net_dm_cmd_config, + }, + { + .cmd = NET_DM_CMD_START, + .doit = net_dm_cmd_trace, + }, + { + .cmd = NET_DM_CMD_STOP, + .doit = net_dm_cmd_trace, + }, +}; + +static int __init init_net_drop_monitor(void) +{ + int cpu; + int rc, i, ret; + struct per_cpu_dm_data *data; + printk(KERN_INFO "Initalizing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + if (genl_register_family(&net_drop_monitor_family) < 0) { + printk(KERN_ERR "Could not create drop monitor netlink family\n"); + return -EFAULT; + } + + rc = -EFAULT; + + for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) { + ret = genl_register_ops(&net_drop_monitor_family, + &dropmon_ops[i]); + if (ret) { + printk(KERN_CRIT "failed to register operation %d\n", + dropmon_ops[i].cmd); + goto out_unreg; + } + } + + rc = 0; + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); + reset_per_cpu_data(data); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); + data->send_timer.data = cpu; + data->send_timer.function = sched_send_work; + } + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +late_initcall(init_net_drop_monitor); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 947710a36ced..244ca56dffac 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } -static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc cmd; - if (!dev->ethtool_ops->set_rxhash) + if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; - return dev->ethtool_ops->set_rxhash(dev, &cmd); + return dev->ethtool_ops->set_rxnfc(dev, &cmd); } -static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr) +static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc info; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; - if (!dev->ethtool_ops->get_rxhash) + if (!ops->get_rxnfc) return -EOPNOTSUPP; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; - dev->ethtool_ops->get_rxhash(dev, &info); + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + rule_buf = kmalloc(info.rule_cnt * sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; - return 0; + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + if (rule_buf) + kfree(rule_buf); + + return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) @@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: - rc = ethtool_get_rxhash(dev, useraddr); + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, useraddr); break; case ETHTOOL_SRXFH: - rc = ethtool_set_rxhash(dev, useraddr); + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, useraddr); break; case ETHTOOL_GGRO: rc = ethtool_get_gro(dev, useraddr); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 32b3a0152d7a..98691e1466b8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, goto errout; } - err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, ops->nlgroup, err); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 278a142d1047..a1cbce7fdae5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg) write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); - if (skb) - kfree_skb(skb); + kfree_skb(skb); } else { out: write_unlock(&neigh->lock); @@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - if (skb) - kfree_skb(skb); + kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1656,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); goto out_dev_put; } @@ -2534,7 +2536,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 484f58750eba..2da59a0ac4ac 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -498,7 +498,7 @@ int netdev_register_kobject(struct net_device *net) dev->groups = groups; BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); - dev_set_name(dev, net->name); + dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS *groups++ = &netstat_group; diff --git a/net/core/net-traces.c b/net/core/net-traces.c new file mode 100644 index 000000000000..c8fb45665e4f --- /dev/null +++ b/net/core/net-traces.c @@ -0,0 +1,29 @@ +/* + * consolidates trace point definitions + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <trace/skb.h> + +#include <asm/unaligned.h> +#include <asm/bitops.h> + + +DEFINE_TRACE(kfree_skb); +EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 65498483325a..32d419f5ac98 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3275,8 +3275,7 @@ static void pktgen_stop(struct pktgen_thread *t) list_for_each_entry(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; } @@ -3303,8 +3302,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) if (!cur->removal_mark) continue; - if (cur->skb) - kfree_skb(cur->skb); + kfree_skb(cur->skb); cur->skb = NULL; pktgen_remove_device(t, cur); @@ -3328,8 +3326,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); - if (cur->skb) - kfree_skb(cur->skb); + kfree_skb(cur->skb); cur->skb = NULL; pktgen_remove_device(t, cur); @@ -3393,8 +3390,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if (!netif_running(odev)) { pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; goto out; } @@ -3415,8 +3411,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) if ((++pkt_dev->clone_count >= pkt_dev->clone_skb) || (!pkt_dev->skb)) { /* build a new pkt */ - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = fill_packet(odev, pkt_dev); if (pkt_dev->skb == NULL) { @@ -3498,8 +3493,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) /* Done with this */ pktgen_stop_device(pkt_dev); - if (pkt_dev->skb) - kfree_skb(pkt_dev->skb); + kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; } out:; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 790dd205bb5d..d78030f88bd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } -int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; int report = 0; @@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, if (nlh) report = nlmsg_report(nlh); - return nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, report, flags); } void rtnl_set_sk_err(struct net *net, u32 group, int error) @@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c6a6b166f8d6..6acbf9e79eb1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/scatterlist.h> +#include <linux/errqueue.h> #include <net/protocol.h> #include <net/dst.h> @@ -64,6 +65,7 @@ #include <asm/uaccess.h> #include <asm/system.h> +#include <trace/skb.h> #include "kmap_skb.h" @@ -123,6 +125,7 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : "<NULL>"); BUG(); } +EXPORT_SYMBOL(skb_over_panic); /** * skb_under_panic - private function @@ -142,6 +145,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : "<NULL>"); BUG(); } +EXPORT_SYMBOL(skb_under_panic); /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the @@ -205,7 +209,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -223,6 +229,7 @@ nodata: skb = NULL; goto out; } +EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -250,6 +257,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, } return skb; } +EXPORT_SYMBOL(__netdev_alloc_skb); struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) { @@ -418,6 +426,7 @@ void __kfree_skb(struct sk_buff *skb) skb_release_all(skb); kfree_skbmem(skb); } +EXPORT_SYMBOL(__kfree_skb); /** * kfree_skb - free an sk_buff @@ -434,8 +443,30 @@ void kfree_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_kfree_skb(skb, __builtin_return_address(0)); + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb); + +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; __kfree_skb(skb); } +EXPORT_SYMBOL(consume_skb); /** * skb_recycle_check - check if skb can be reused for receive @@ -605,6 +636,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return __skb_clone(n, skb); } +EXPORT_SYMBOL(skb_clone); static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { @@ -671,7 +703,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) copy_skb_header(n, skb); return n; } - +EXPORT_SYMBOL(skb_copy); /** * pskb_copy - create copy of an sk_buff with private head. @@ -730,6 +762,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) out: return n; } +EXPORT_SYMBOL(pskb_copy); /** * pskb_expand_head - reallocate header of &sk_buff @@ -813,6 +846,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, nodata: return -ENOMEM; } +EXPORT_SYMBOL(pskb_expand_head); /* Make private copy of skb with writable head and some headroom */ @@ -833,7 +867,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } return skb2; } - +EXPORT_SYMBOL(skb_realloc_headroom); /** * skb_copy_expand - copy and expand sk_buff @@ -898,6 +932,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, return n; } +EXPORT_SYMBOL(skb_copy_expand); /** * skb_pad - zero pad the tail of an skb @@ -943,6 +978,7 @@ free_skb: kfree_skb(skb); return err; } +EXPORT_SYMBOL(skb_pad); /** * skb_put - add data to a buffer @@ -1100,6 +1136,7 @@ done: return 0; } +EXPORT_SYMBOL(___pskb_trim); /** * __pskb_pull_tail - advance tail of skb header @@ -1193,8 +1230,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } if (!pskb_pull(list, eat)) { - if (clone) - kfree_skb(clone); + kfree_skb(clone); return NULL; } break; @@ -1238,6 +1274,7 @@ pull_pages: return skb_tail_pointer(skb); } +EXPORT_SYMBOL(__pskb_pull_tail); /* Copy some data bits from skb to kernel buffer. */ @@ -1315,6 +1352,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) fault: return -EFAULT; } +EXPORT_SYMBOL(skb_copy_bits); /* * Callback from splice_to_pipe(), if we need to release some pages @@ -1325,14 +1363,39 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) put_page(spd->pages[i]); } -static inline struct page *linear_to_page(struct page *page, unsigned int len, - unsigned int offset) +static inline struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sk_buff *skb) { - struct page *p = alloc_pages(GFP_KERNEL, 0); + struct sock *sk = skb->sk; + struct page *p = sk->sk_sndmsg_page; + unsigned int off; - if (!p) - return NULL; - memcpy(page_address(p) + offset, page_address(page) + offset, len); + if (!p) { +new_page: + p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); + if (!p) + return NULL; + + off = sk->sk_sndmsg_off = 0; + /* hold one ref to this page until it's full */ + } else { + unsigned int mlen; + + off = sk->sk_sndmsg_off; + mlen = PAGE_SIZE - off; + if (mlen < 64 && mlen < *len) { + put_page(p); + goto new_page; + } + + *len = min_t(unsigned int, *len, mlen); + } + + memcpy(page_address(p) + off, page_address(page) + *offset, *len); + sk->sk_sndmsg_off += *len; + *offset = off; + get_page(p); return p; } @@ -1341,21 +1404,21 @@ static inline struct page *linear_to_page(struct page *page, unsigned int len, * Fill page/offset/length into spd, if it can hold more pages. */ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, - unsigned int len, unsigned int offset, + unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear) { if (unlikely(spd->nr_pages == PIPE_BUFFERS)) return 1; if (linear) { - page = linear_to_page(page, len, offset); + page = linear_to_page(page, len, &offset, skb); if (!page) return 1; } else get_page(page); spd->pages[spd->nr_pages] = page; - spd->partial[spd->nr_pages].len = len; + spd->partial[spd->nr_pages].len = *len; spd->partial[spd->nr_pages].offset = offset; spd->nr_pages++; @@ -1365,8 +1428,13 @@ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, static inline void __segment_seek(struct page **page, unsigned int *poff, unsigned int *plen, unsigned int off) { + unsigned long n; + *poff += off; - *page += *poff / PAGE_SIZE; + n = *poff / PAGE_SIZE; + if (n) + *page = nth_page(*page, n); + *poff = *poff % PAGE_SIZE; *plen -= off; } @@ -1397,7 +1465,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, flen, poff, skb, linear)) + if (spd_fill_page(spd, page, &flen, poff, skb, linear)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1590,7 +1658,6 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) fault: return -EFAULT; } - EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ @@ -1667,6 +1734,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, return csum; } +EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ @@ -1748,6 +1816,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, BUG_ON(len); return csum; } +EXPORT_SYMBOL(skb_copy_and_csum_bits); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { @@ -1774,6 +1843,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) *((__sum16 *)(to + csstuff)) = csum_fold(csum); } } +EXPORT_SYMBOL(skb_copy_and_csum_dev); /** * skb_dequeue - remove from the head of the queue @@ -1794,6 +1864,7 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list) spin_unlock_irqrestore(&list->lock, flags); return result; } +EXPORT_SYMBOL(skb_dequeue); /** * skb_dequeue_tail - remove from the tail of the queue @@ -1813,6 +1884,7 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) spin_unlock_irqrestore(&list->lock, flags); return result; } +EXPORT_SYMBOL(skb_dequeue_tail); /** * skb_queue_purge - empty a list @@ -1828,6 +1900,7 @@ void skb_queue_purge(struct sk_buff_head *list) while ((skb = skb_dequeue(list)) != NULL) kfree_skb(skb); } +EXPORT_SYMBOL(skb_queue_purge); /** * skb_queue_head - queue a buffer at the list head @@ -1848,6 +1921,7 @@ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_head(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_queue_head); /** * skb_queue_tail - queue a buffer at the list tail @@ -1868,6 +1942,7 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_queue_tail); /** * skb_unlink - remove a buffer from a list @@ -1887,6 +1962,7 @@ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) __skb_unlink(skb, list); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_unlink); /** * skb_append - append a buffer @@ -1906,7 +1982,7 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head __skb_queue_after(list, old, newsk); spin_unlock_irqrestore(&list->lock, flags); } - +EXPORT_SYMBOL(skb_append); /** * skb_insert - insert a buffer @@ -1928,6 +2004,7 @@ void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head __skb_insert(newsk, old->prev, old, list); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_insert); static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, @@ -2006,6 +2083,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); } +EXPORT_SYMBOL(skb_split); /* Shifting from/to a cloned skb is a no-go. * @@ -2168,6 +2246,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; } +EXPORT_SYMBOL(skb_prepare_seq_read); /** * skb_seq_read - Sequentially read skb data @@ -2255,6 +2334,7 @@ next_skb: return 0; } +EXPORT_SYMBOL(skb_seq_read); /** * skb_abort_seq_read - Abort a sequential read of skb data @@ -2268,6 +2348,7 @@ void skb_abort_seq_read(struct skb_seq_state *st) if (st->frag_data) kunmap_skb_frag(st->frag_data); } +EXPORT_SYMBOL(skb_abort_seq_read); #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) @@ -2310,6 +2391,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ret = textsearch_find(config, state); return (ret <= to - from ? ret : UINT_MAX); } +EXPORT_SYMBOL(skb_find_text); /** * skb_append_datato_frags: - append the user data to a skb @@ -2382,6 +2464,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, return 0; } +EXPORT_SYMBOL(skb_append_datato_frags); /** * skb_pull_rcsum - pull skb and update receive checksum @@ -2569,7 +2652,6 @@ err: } return ERR_PTR(err); } - EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) @@ -2577,17 +2659,23 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *p = *head; struct sk_buff *nskb; unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; + unsigned int len = skb_gro_len(skb); - if (hlen + p->len + len >= 65536) + if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < - MAX_SKB_FRAGS) { + else if (skb_headlen(skb) <= skb_gro_offset(skb)) { + if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > + MAX_SKB_FRAGS) + return -E2BIG; + + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -2604,7 +2692,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); + nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); if (unlikely(!nskb)) return -ENOMEM; @@ -2612,12 +2700,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) nskb->mac_len = p->mac_len; skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); - skb_set_mac_header(nskb, -hlen); + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); skb_set_network_header(nskb, skb_network_offset(p)); skb_set_transport_header(nskb, skb_transport_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; @@ -2636,6 +2727,17 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: + if (skb_gro_offset(skb) > skb_headlen(skb)) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + skb_gro_reset_offset(skb); + skb_gro_pull(skb, skb_headlen(skb)); + } + + __skb_pull(skb, skb_gro_offset(skb)); + p->prev->next = skb; p->prev = skb; skb_header_release(skb); @@ -2747,6 +2849,7 @@ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int le return nsg; } +EXPORT_SYMBOL_GPL(skb_to_sgvec); /** * skb_cow_data - Check that a socket buffer's data buffers are writable @@ -2856,6 +2959,45 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return elt; } +EXPORT_SYMBOL_GPL(skb_cow_data); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the skb_shared_tx and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + /** * skb_partial_csum_set - set up and verify partial csum values for packet @@ -2884,6 +3026,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->csum_offset = off; return true; } +EXPORT_SYMBOL_GPL(skb_partial_csum_set); void __skb_warn_lro_forwarding(const struct sk_buff *skb) { @@ -2891,42 +3034,4 @@ void __skb_warn_lro_forwarding(const struct sk_buff *skb) pr_warning("%s: received packets cannot be forwarded" " while LRO is enabled\n", skb->dev->name); } - -EXPORT_SYMBOL(___pskb_trim); -EXPORT_SYMBOL(__kfree_skb); -EXPORT_SYMBOL(kfree_skb); -EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(__alloc_skb); -EXPORT_SYMBOL(__netdev_alloc_skb); -EXPORT_SYMBOL(pskb_copy); -EXPORT_SYMBOL(pskb_expand_head); -EXPORT_SYMBOL(skb_checksum); -EXPORT_SYMBOL(skb_clone); -EXPORT_SYMBOL(skb_copy); -EXPORT_SYMBOL(skb_copy_and_csum_bits); -EXPORT_SYMBOL(skb_copy_and_csum_dev); -EXPORT_SYMBOL(skb_copy_bits); -EXPORT_SYMBOL(skb_copy_expand); -EXPORT_SYMBOL(skb_over_panic); -EXPORT_SYMBOL(skb_pad); -EXPORT_SYMBOL(skb_realloc_headroom); -EXPORT_SYMBOL(skb_under_panic); -EXPORT_SYMBOL(skb_dequeue); -EXPORT_SYMBOL(skb_dequeue_tail); -EXPORT_SYMBOL(skb_insert); -EXPORT_SYMBOL(skb_queue_purge); -EXPORT_SYMBOL(skb_queue_head); -EXPORT_SYMBOL(skb_queue_tail); -EXPORT_SYMBOL(skb_unlink); -EXPORT_SYMBOL(skb_append); -EXPORT_SYMBOL(skb_split); -EXPORT_SYMBOL(skb_prepare_seq_read); -EXPORT_SYMBOL(skb_seq_read); -EXPORT_SYMBOL(skb_abort_seq_read); -EXPORT_SYMBOL(skb_find_text); -EXPORT_SYMBOL(skb_append_datato_frags); EXPORT_SYMBOL(__skb_warn_lro_forwarding); - -EXPORT_SYMBOL_GPL(skb_to_sgvec); -EXPORT_SYMBOL_GPL(skb_cow_data); -EXPORT_SYMBOL_GPL(skb_partial_csum_set); diff --git a/net/core/sock.c b/net/core/sock.c index 5f97caa158e8..0620046e4eba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -120,6 +120,7 @@ #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -149,7 +150,7 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , - "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , @@ -164,7 +165,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , - "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , @@ -179,7 +180,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , - "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , @@ -255,11 +256,14 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -static void sock_disable_timestamp(struct sock *sk) +static void sock_disable_timestamp(struct sock *sk, int flag) { - if (sock_flag(sk, SOCK_TIMESTAMP)) { - sock_reset_flag(sk, SOCK_TIMESTAMP); - net_disable_timestamp(); + if (sock_flag(sk, flag)) { + sock_reset_flag(sk, flag); + if (!sock_flag(sk, SOCK_TIMESTAMP) && + !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { + net_disable_timestamp(); + } } } @@ -614,13 +618,38 @@ set_rcvbuf: else sock_set_flag(sk, SOCK_RCVTSTAMPNS); sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } break; + case SO_TIMESTAMPING: + if (val & ~SOF_TIMESTAMPING_MASK) { + ret = EINVAL; + break; + } + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, + val & SOF_TIMESTAMPING_TX_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, + val & SOF_TIMESTAMPING_TX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, + val & SOF_TIMESTAMPING_RX_HARDWARE); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, + val & SOF_TIMESTAMPING_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, + val & SOF_TIMESTAMPING_SYS_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, + val & SOF_TIMESTAMPING_RAW_HARDWARE); + break; + case SO_RCVLOWAT: if (val < 0) val = INT_MAX; @@ -768,6 +797,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); break; + case SO_TIMESTAMPING: + v.val = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_TX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) + v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + break; + case SO_RCVTIMEO: lv=sizeof(struct timeval); if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { @@ -969,7 +1016,8 @@ void sk_free(struct sock *sk) rcu_assign_pointer(sk->sk_filter, NULL); } - sock_disable_timestamp(sk); + sock_disable_timestamp(sk, SOCK_TIMESTAMP); + sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); if (atomic_read(&sk->sk_omem_alloc)) printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", @@ -1255,10 +1303,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) * Generic send/receive buffer handlers */ -static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, - unsigned long header_len, - unsigned long data_len, - int noblock, int *errcode) +struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + unsigned long data_len, int noblock, + int *errcode) { struct sk_buff *skb; gfp_t gfp_mask; @@ -1338,6 +1385,7 @@ failure: *errcode = err; return NULL; } +EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) @@ -1786,7 +1834,7 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) { struct timeval tv; if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); tv = ktime_to_timeval(sk->sk_stamp); if (tv.tv_sec == -1) return -ENOENT; @@ -1802,7 +1850,7 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) { struct timespec ts; if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec(sk->sk_stamp); if (ts.tv_sec == -1) return -ENOENT; @@ -1814,11 +1862,20 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) } EXPORT_SYMBOL(sock_get_timestampns); -void sock_enable_timestamp(struct sock *sk) +void sock_enable_timestamp(struct sock *sk, int flag) { - if (!sock_flag(sk, SOCK_TIMESTAMP)) { - sock_set_flag(sk, SOCK_TIMESTAMP); - net_enable_timestamp(); + if (!sock_flag(sk, flag)) { + sock_set_flag(sk, flag); + /* + * we just set one of the two flags which require net + * time stamping, but time stamping might have been on + * already because of the other one + */ + if (!sock_flag(sk, + flag == SOCK_TIMESTAMP ? + SOCK_TIMESTAMPING_RX_SOFTWARE : + SOCK_TIMESTAMP)) + net_enable_timestamp(); } } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 83d3398559ea..7db1de0497c6 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -11,6 +11,7 @@ #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/init.h> +#include <net/ip.h> #include <net/sock.h> static struct ctl_table net_core_table[] = { diff --git a/net/core/utils.c b/net/core/utils.c index 72e0ebe964a0..83221aee7084 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -22,7 +22,6 @@ #include <linux/net.h> #include <linux/string.h> #include <linux/types.h> -#include <linux/random.h> #include <linux/percpu.h> #include <linux/init.h> #include <net/sock.h> diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 45f95e55f873..7ea557b7c6b1 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -20,6 +20,9 @@ /* We can spread an ack vector across multiple options */ #define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) +/* Estimated minimum average Ack Vector length - used for updating MPS */ +#define DCCPAV_MIN_OPTLEN 16 + #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f2230fc168e1..d6bc47363b1c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -42,9 +42,11 @@ extern int dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) +#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) +#define dccp_debug(format, a...) #endif extern struct inet_hashinfo dccp_hashinfo; @@ -61,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) +/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ +#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) + #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ @@ -95,9 +100,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; -extern int sysctl_dccp_feat_sequence_window; -extern int sysctl_dccp_feat_rx_ccid; -extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -409,23 +411,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); - dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_awh = dp->dccps_gss = seq; - dccp_set_seqno(&dp->dccps_awl, - (dp->dccps_gss - - dccp_msk(sk)->dccpms_sequence_window + 1)); + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ack_pending(const struct sock *sk) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 4152308958ab..b04160a2eea5 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -25,6 +25,11 @@ #include "ccid.h" #include "feat.h" +/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ +unsigned long sysctl_dccp_sequence_window __read_mostly = 100; +int sysctl_dccp_rx_ccid __read_mostly = 2, + sysctl_dccp_tx_ccid __read_mostly = 2; + /* * Feature activation handlers. * @@ -51,8 +56,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { - if (!rx) - dccp_msk(sk)->dccpms_sequence_window = seq_win; + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } return 0; } @@ -194,6 +208,100 @@ static int dccp_feat_default_value(u8 feat_num) return idx < 0 ? 0 : dccp_feat_table[idx].default_value; } +/* + * Debugging and verbose-printing section + */ +static const char *dccp_feat_fname(const u8 feat) +{ + static const char *feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; + + if (feat == DCCPF_SEND_LEV_RATE) + return "Send Loss Event Rate"; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; + + return feature_names[feat]; +} + +static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", + "UNSTABLE", "STABLE" }; + +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_feat_oname(const u8 opt) +{ + switch (opt) { + case DCCPO_CHANGE_L: return "Change_L"; + case DCCPO_CONFIRM_L: return "Confirm_L"; + case DCCPO_CHANGE_R: return "Change_R"; + case DCCPO_CONFIRM_R: return "Confirm_R"; + } + return NULL; +} + +static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) +{ + u8 i, type = dccp_feat_type(feat_num); + + if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) + dccp_pr_debug_cat("(NULL)"); + else if (type == FEAT_SP) + for (i = 0; i < val->sp.len; i++) + dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); + else if (type == FEAT_NN) + dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); + else + dccp_pr_debug_cat("unknown type %u", type); +} + +static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) +{ + u8 type = dccp_feat_type(feat_num); + dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; + + if (type == FEAT_NN) + fval.nn = dccp_decode_value_var(list, len); + dccp_feat_printval(feat_num, &fval); +} + +static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) +{ + dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", + dccp_feat_fname(entry->feat_num)); + dccp_feat_printval(entry->feat_num, &entry->val); + dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], + entry->needs_confirm ? "(Confirm pending)" : ""); +} + +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ + dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ + dccp_feat_printvals(feat, val, len); \ + dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) + +#define dccp_feat_print_fnlist(fn_list) { \ + const struct dccp_feat_entry *___entry; \ + \ + dccp_pr_debug("List Dump:\n"); \ + list_for_each_entry(___entry, fn_list, node) \ + dccp_feat_print_entry(___entry); \ +} +#else /* ! CONFIG_IP_DCCP_DEBUG */ +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) +#define dccp_feat_print_fnlist(fn_list) +#endif + static int __dccp_feat_activate(struct sock *sk, const int idx, const bool is_local, dccp_feat_val const *fval) { @@ -226,6 +334,10 @@ static int __dccp_feat_activate(struct sock *sk, const int idx, /* Location is RX if this is a local-RX or remote-TX feature */ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); + dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", + dccp_feat_fname(dccp_feat_table[idx].feat_num), + fval ? "" : "default ", (unsigned long long)val); + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); } @@ -530,6 +642,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, return -1; } } + dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) return -1; @@ -783,6 +896,7 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp) while (i--) if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) return -1; + dccp_feat_print_fnlist(fn); return 0; } @@ -901,6 +1015,8 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ goto unknown_feature_or_value; + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + /* * Negotiation of NN features: Change R is invalid, so there is no * simultaneous negotiation; hence we do not look up in the list. @@ -1006,6 +1122,8 @@ static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, const bool local = (opt == DCCPO_CONFIRM_R); struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + if (entry == NULL) { /* nothing queued: ignore or handle error */ if (is_mandatory && type == FEAT_UNKNOWN) return DCCP_RESET_CODE_MANDATORY_ERROR; @@ -1115,23 +1233,70 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, return 0; /* ignore FN options in all other states */ } +/** + * dccp_feat_init - Seed feature negotiation with host-specific defaults + * This initialises global defaults, depending on the value of the sysctls. + * These can later be overridden by registering changes via setsockopt calls. + * The last link in the chain is finalise_settings, to make sure that between + * here and the start of actual feature negotiation no inconsistencies enter. + * + * All features not appearing below use either defaults or are otherwise + * later adjusted through dccp_feat_finalise_settings(). + */ int dccp_feat_init(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + u8 on = 1, off = 0; int rc; + struct { + u8 *val; + u8 len; + } tx, rx; + + /* Non-negotiable (NN) features */ + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, + sysctl_dccp_sequence_window); + if (rc) + return rc; + + /* Server-priority (SP) features */ + + /* Advertise that short seqnos are not supported (7.6.1) */ + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); + if (rc) + return rc; - INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ - INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + if (rc) + return rc; + + /* + * We advertise the available list of CCIDs and reorder according to + * preferences, to avoid failure resulting from negotiating different + * singleton values (which always leads to failure). + * These settings can still (later) be overridden via sockopts. + */ + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || + ccid_get_builtin_ccids(&rx.val, &rx.len)) + return -ENOBUFS; + + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + if (rc) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - /* Ack ratio */ - rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dp->dccps_l_ack_ratio); +free_ccid_lists: + kfree(tx.val); + kfree(rx.val); return rc; } -EXPORT_SYMBOL_GPL(dccp_feat_init); - int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) { struct dccp_sock *dp = dccp_sk(sk); @@ -1156,9 +1321,10 @@ int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) goto activation_failed; } if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %u failed in state %u", + DCCP_CRIT("Negotiation of %s %s failed in state %s", cur->is_local ? "local" : "remote", - cur->feat_num, cur->state); + dccp_feat_fname(cur->feat_num), + dccp_feat_sname[cur->state]); goto activation_failed; } fvals[idx][cur->is_local] = &cur->val; @@ -1199,43 +1365,3 @@ activation_failed: dp->dccps_hc_rx_ackvec = NULL; return -1; } - -#ifdef CONFIG_IP_DCCP_DEBUG -const char *dccp_feat_typename(const u8 type) -{ - switch(type) { - case DCCPO_CHANGE_L: return("ChangeL"); - case DCCPO_CONFIRM_L: return("ConfirmL"); - case DCCPO_CHANGE_R: return("ChangeR"); - case DCCPO_CONFIRM_R: return("ConfirmR"); - /* the following case must not appear in feature negotation */ - default: dccp_pr_debug("unknown type %d [BUG!]\n", type); - } - return NULL; -} - -const char *dccp_feat_name(const u8 feat) -{ - static const char *feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} -#endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 9b46e2a7866e..f96721619def 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -100,26 +100,21 @@ struct ccid_dependency { u8 val; }; -#ifdef CONFIG_IP_DCCP_DEBUG -extern const char *dccp_feat_typename(const u8 type); -extern const char *dccp_feat_name(const u8 feat); - -static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) -{ - dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), - dccp_feat_name(feat), feat, val); -} -#else -#define dccp_feat_debug(type, feat, val) -#endif /* CONFIG_IP_DCCP_DEBUG */ +/* + * Sysctls to seed defaults for feature negotiation + */ +extern unsigned long sysctl_dccp_sequence_window; +extern int sysctl_dccp_rx_ccid; +extern int sysctl_dccp_tx_ccid; +extern int dccp_feat_init(struct sock *sk); +extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); -extern int dccp_feat_init(struct sock *sk); /* * Encoding variable-length options and their maximum length. diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 6821ae33dd37..5ca49cec95f5 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); -void dccp_minisock_init(struct dccp_minisock *dmsk) -{ - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; -} - void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - - /* See dccp_v4_conn_request */ - newdmsk->dccpms_sequence_window = req->rcv_wnd; - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); @@ -290,7 +280,6 @@ int dccp_reqsk_init(struct request_sock *req, inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->loc_port = dccp_hdr(skb)->dccph_dport; inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; /* inherit feature negotiation options from listening socket */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 7b1165c21f51..1b08cae9c65b 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,10 +23,6 @@ #include "dccp.h" #include "feat.h" -int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; -int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; - u64 dccp_decode_value_var(const u8 *bf, const u8 len) { u64 value = 0; @@ -502,10 +498,6 @@ int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, *to++ = *val; if (len) memcpy(to, val, len); - - dccp_pr_debug("%s(%s (%d), ...), length %d\n", - dccp_feat_typename(type), - dccp_feat_name(feat), feat, len); return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index 22a618af4893..36bcc00654d3 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * FIXME: this should come from the CCID infrastructure, where, say, - * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets - * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED - * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to - * make it a multiple of 4 + * Leave enough headroom for common DCCP header options. + * This only considers options which may appear on DCCP-Data packets, as + * per table 3 in RFC 4340, 5.8. When running out of space for other + * options (eg. Ack Vector which can take up to 255 bytes), it is better + * to schedule a separate Ack. Thus we leave headroom for the following: + * - 1 byte for Slow Receiver (11.6) + * - 6 bytes for Timestamp (13.1) + * - 10 bytes for Timestamp Echo (13.3) + * - 8 bytes for NDP count (7.7, when activated) + * - 6 bytes for Data Checksum (9.3) + * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) */ - - cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4); + cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -270,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block) const int len = skb->len; if (sk->sk_state == DCCP_PARTOPEN) { - /* See 8.1.5. Handshake Completion */ + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 945b4d5d23b3..314a1b5c033c 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -174,8 +174,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - dccp_minisock_init(&dp->dccps_minisock); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 018e210875e1..a5a1856234e7 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,55 +18,72 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif +/* Boundary values */ +static int zero = 0, + u8_max = 0xFF; +static unsigned long seqw_min = 32; + static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_feat_sequence_window, - .maxlen = sizeof(sysctl_dccp_feat_sequence_window), + .data = &sysctl_dccp_sequence_window, + .maxlen = sizeof(sysctl_dccp_sequence_window), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ }, { .procname = "rx_ccid", - .data = &sysctl_dccp_feat_rx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), + .data = &sysctl_dccp_rx_ccid, + .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "tx_ccid", - .data = &sysctl_dccp_feat_tx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .data = &sysctl_dccp_tx_ccid, + .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sync_ratelimit", diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index cf0e18499297..9647d911f916 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1246,11 +1246,12 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: lock_sock(sk); - if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) { + skb = skb_peek(&scp->other_receive_queue); + if (skb) { amount = skb->len; } else { - struct sk_buff *skb = sk->sk_receive_queue.next; - for(;;) { + skb = sk->sk_receive_queue.next; + for (;;) { if (skb == (struct sk_buff *)&sk->sk_receive_queue) break; @@ -1579,16 +1580,16 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us default: #ifdef CONFIG_NETFILTER { - int val, len; + int ret, len; if(get_user(len, optlen)) return -EFAULT; - val = nf_getsockopt(sk, PF_DECnet, optname, + ret = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); - if (val >= 0) - val = put_user(len, optlen); - return val; + if (ret >= 0) + ret = put_user(len, optlen); + return ret; } #endif case DSO_STREAM: @@ -2071,8 +2072,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, } out: - if (skb) - kfree_skb(skb); + kfree_skb(skb); release_sock(sk); @@ -2112,9 +2112,8 @@ static struct notifier_block dn_dev_notifier = { extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); -static struct packet_type dn_dix_packet_type = { - .type = __constant_htons(ETH_P_DNA_RT), - .dev = NULL, /* All devices */ +static struct packet_type dn_dix_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_DNA_RT), .func = dn_route_rcv, }; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index daf2b98b15fe..1c6a5bb6f0c8 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -684,7 +684,6 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -ENODEV; if ((dn_db = dev->dn_ptr) == NULL) { - int err; dn_db = dn_dev_create(dev, &err); if (!dn_db) return err; @@ -769,7 +768,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); @@ -1322,6 +1322,7 @@ static inline int is_dn_dev(struct net_device *dev) } static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&dev_base_lock) { int i; struct net_device *dev; @@ -1364,6 +1365,7 @@ static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void dn_dev_seq_stop(struct seq_file *seq, void *v) + __releases(&dev_base_lock) { read_unlock(&dev_base_lock); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index c754670b7fca..0cc4394117df 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -124,7 +124,7 @@ int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { .family = PF_DECnet, - .protocol = __constant_htons(ETH_P_DNA_RT), + .protocol = cpu_to_be16(ETH_P_DNA_RT), .gc_thresh = 128, .gc = dn_dst_gc, .check = dn_dst_check, @@ -380,7 +380,6 @@ static int dn_return_short(struct sk_buff *skb) unsigned char *ptr; __le16 *src; __le16 *dst; - __le16 tmp; /* Add back headers */ skb_push(skb, skb->data - skb_network_header(skb)); @@ -399,10 +398,7 @@ static int dn_return_short(struct sk_buff *skb) ptr += 2; *ptr = 0; /* Zero hop count */ - /* Swap source and destination */ - tmp = *src; - *src = *dst; - *dst = tmp; + swap(*src, *dst); skb->pkt_type = PACKET_OUTGOING; dn_rt_finish_output(skb, NULL, NULL); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 69ad9280c693..67054b0d550f 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 965397af9a80..5bcd592ae6dd 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -179,7 +179,7 @@ static int dn_node_address_handler(ctl_table *table, int write, } if (write) { - int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); + len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); if (copy_from_user(addr, buffer, len)) return -EFAULT; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 49211b35725b..c51b55400dc5 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -41,13 +41,13 @@ config NET_DSA_MV88E6XXX_NEED_PPU default n config NET_DSA_MV88E6131 - bool "Marvell 88E6131 ethernet switch chip support" + bool "Marvell 88E6095/6095F/6131 ethernet switch chip support" select NET_DSA_MV88E6XXX select NET_DSA_MV88E6XXX_NEED_PPU select NET_DSA_TAG_DSA ---help--- - This enables support for the Marvell 88E6131 ethernet switch - chip. + This enables support for the Marvell 88E6095/6095F/6131 + ethernet switch chips. config NET_DSA_MV88E6123_61_65 bool "Marvell 88E6123/6161/6165 ethernet switch chip support" diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 33e99462023a..71489f69a42c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1,6 +1,6 @@ /* * net/dsa/dsa.c - Hardware switch handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,12 +67,13 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) /* basic switch operations **************************************************/ static struct dsa_switch * -dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, - struct mii_bus *bus, struct net_device *dev) +dsa_switch_setup(struct dsa_switch_tree *dst, int index, + struct device *parent, struct mii_bus *bus) { + struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_switch_driver *drv; struct dsa_switch *ds; int ret; - struct dsa_switch_driver *drv; char *name; int i; @@ -81,11 +82,12 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, */ drv = dsa_switch_probe(bus, pd->sw_addr, &name); if (drv == NULL) { - printk(KERN_ERR "%s: could not detect attached switch\n", - dev->name); + printk(KERN_ERR "%s[%d]: could not detect attached switch\n", + dst->master_netdev->name, index); return ERR_PTR(-EINVAL); } - printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name); + printk(KERN_INFO "%s[%d]: detected a %s switch\n", + dst->master_netdev->name, index, name); /* @@ -95,18 +97,16 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, if (ds == NULL) return ERR_PTR(-ENOMEM); - ds->pd = pd; - ds->master_netdev = dev; - ds->master_mii_bus = bus; - + ds->dst = dst; + ds->index = index; + ds->pd = dst->pd->chip + index; ds->drv = drv; - ds->tag_protocol = drv->tag_protocol; + ds->master_mii_bus = bus; /* * Validate supplied switch configuration. */ - ds->cpu_port = -1; for (i = 0; i < DSA_MAX_PORTS; i++) { char *name; @@ -115,32 +115,28 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, continue; if (!strcmp(name, "cpu")) { - if (ds->cpu_port != -1) { + if (dst->cpu_switch != -1) { printk(KERN_ERR "multiple cpu ports?!\n"); ret = -EINVAL; goto out; } - ds->cpu_port = i; + dst->cpu_switch = index; + dst->cpu_port = i; + } else if (!strcmp(name, "dsa")) { + ds->dsa_port_mask |= 1 << i; } else { - ds->valid_port_mask |= 1 << i; + ds->phys_port_mask |= 1 << i; } } - if (ds->cpu_port == -1) { - printk(KERN_ERR "no cpu port?!\n"); - ret = -EINVAL; - goto out; - } - /* - * If we use a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point on get - * sent to the tag format's receive function. (Which will - * discard received packets until we set ds->ports[] below.) + * If the CPU connects to this switch, set the switch tree + * tagging protocol to the preferred tagging format of this + * switch. */ - wmb(); - dev->dsa_ptr = (void *)ds; + if (ds->dst->cpu_switch == index) + ds->dst->tag_protocol = drv->tag_protocol; /* @@ -150,7 +146,7 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, if (ret < 0) goto out; - ret = drv->set_addr(ds, dev->dev_addr); + ret = drv->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) goto out; @@ -169,18 +165,18 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, /* * Create network devices for physical switch ports. */ - wmb(); for (i = 0; i < DSA_MAX_PORTS; i++) { struct net_device *slave_dev; - if (!(ds->valid_port_mask & (1 << i))) + if (!(ds->phys_port_mask & (1 << i))) continue; slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]); if (slave_dev == NULL) { - printk(KERN_ERR "%s: can't create dsa slave " - "device for port %d(%s)\n", - dev->name, i, pd->port_names[i]); + printk(KERN_ERR "%s[%d]: can't create dsa " + "slave device for port %d(%s)\n", + dst->master_netdev->name, + index, i, pd->port_names[i]); continue; } @@ -192,7 +188,6 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, out_free: mdiobus_free(ds->slave_mii_bus); out: - dev->dsa_ptr = NULL; kfree(ds); return ERR_PTR(ret); } @@ -212,35 +207,42 @@ static void dsa_switch_destroy(struct dsa_switch *ds) */ bool dsa_uses_dsa_tags(void *dsa_ptr) { - struct dsa_switch *ds = dsa_ptr; + struct dsa_switch_tree *dst = dsa_ptr; - return !!(ds->tag_protocol == htons(ETH_P_DSA)); + return !!(dst->tag_protocol == htons(ETH_P_DSA)); } bool dsa_uses_trailer_tags(void *dsa_ptr) { - struct dsa_switch *ds = dsa_ptr; + struct dsa_switch_tree *dst = dsa_ptr; - return !!(ds->tag_protocol == htons(ETH_P_TRAILER)); + return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); } /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) { - struct dsa_switch *ds; + struct dsa_switch_tree *dst; + int i; + + dst = container_of(ugly, struct dsa_switch_tree, link_poll_work); - ds = container_of(ugly, struct dsa_switch, link_poll_work); + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; - ds->drv->poll_link(ds); - mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ)); + if (ds != NULL && ds->drv->poll_link != NULL) + ds->drv->poll_link(ds); + } + + mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ)); } -static void dsa_link_poll_timer(unsigned long _ds) +static void dsa_link_poll_timer(unsigned long _dst) { - struct dsa_switch *ds = (void *)_ds; + struct dsa_switch_tree *dst = (void *)_dst; - schedule_work(&ds->link_poll_work); + schedule_work(&dst->link_poll_work); } @@ -303,18 +305,14 @@ static int dsa_probe(struct platform_device *pdev) static int dsa_version_printed; struct dsa_platform_data *pd = pdev->dev.platform_data; struct net_device *dev; - struct mii_bus *bus; - struct dsa_switch *ds; + struct dsa_switch_tree *dst; + int i; if (!dsa_version_printed++) printk(KERN_NOTICE "Distributed Switch Architecture " "driver version %s\n", dsa_driver_version); - if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL) - return -EINVAL; - - bus = dev_to_mii_bus(pd->mii_bus); - if (bus == NULL) + if (pd == NULL || pd->netdev == NULL) return -EINVAL; dev = dev_to_net_device(pd->netdev); @@ -326,36 +324,79 @@ static int dsa_probe(struct platform_device *pdev) return -EEXIST; } - ds = dsa_switch_setup(&pdev->dev, pd, bus, dev); - if (IS_ERR(ds)) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (dst == NULL) { dev_put(dev); - return PTR_ERR(ds); + return -ENOMEM; } - if (ds->drv->poll_link != NULL) { - INIT_WORK(&ds->link_poll_work, dsa_link_poll_work); - init_timer(&ds->link_poll_timer); - ds->link_poll_timer.data = (unsigned long)ds; - ds->link_poll_timer.function = dsa_link_poll_timer; - ds->link_poll_timer.expires = round_jiffies(jiffies + HZ); - add_timer(&ds->link_poll_timer); + platform_set_drvdata(pdev, dst); + + dst->pd = pd; + dst->master_netdev = dev; + dst->cpu_switch = -1; + dst->cpu_port = -1; + + for (i = 0; i < pd->nr_chips; i++) { + struct mii_bus *bus; + struct dsa_switch *ds; + + bus = dev_to_mii_bus(pd->chip[i].mii_bus); + if (bus == NULL) { + printk(KERN_ERR "%s[%d]: no mii bus found for " + "dsa switch\n", dev->name, i); + continue; + } + + ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + if (IS_ERR(ds)) { + printk(KERN_ERR "%s[%d]: couldn't create dsa switch " + "instance (error %ld)\n", dev->name, i, + PTR_ERR(ds)); + continue; + } + + dst->ds[i] = ds; + if (ds->drv->poll_link != NULL) + dst->link_poll_needed = 1; } - platform_set_drvdata(pdev, ds); + /* + * If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dev->dsa_ptr = (void *)dst; + + if (dst->link_poll_needed) { + INIT_WORK(&dst->link_poll_work, dsa_link_poll_work); + init_timer(&dst->link_poll_timer); + dst->link_poll_timer.data = (unsigned long)dst; + dst->link_poll_timer.function = dsa_link_poll_timer; + dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); + add_timer(&dst->link_poll_timer); + } return 0; } static int dsa_remove(struct platform_device *pdev) { - struct dsa_switch *ds = platform_get_drvdata(pdev); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i; - if (ds->drv->poll_link != NULL) - del_timer_sync(&ds->link_poll_timer); + if (dst->link_poll_needed) + del_timer_sync(&dst->link_poll_timer); flush_scheduled_work(); - dsa_switch_destroy(ds); + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + dsa_switch_destroy(ds); + } return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7063378a1ebf..41055f33d28a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -1,6 +1,6 @@ /* * net/dsa/dsa_priv.h - Hardware switch handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,42 +19,107 @@ struct dsa_switch { /* - * Configuration data for the platform device that owns - * this dsa switch instance. + * Parent switch tree, and switch index. */ - struct dsa_platform_data *pd; + struct dsa_switch_tree *dst; + int index; /* - * References to network device and mii bus to use. + * Configuration data for this switch. */ - struct net_device *master_netdev; - struct mii_bus *master_mii_bus; + struct dsa_chip_data *pd; /* - * The used switch driver and frame tagging type. + * The used switch driver. */ struct dsa_switch_driver *drv; - __be16 tag_protocol; + + /* + * Reference to mii bus to use. + */ + struct mii_bus *master_mii_bus; /* * Slave mii_bus and devices for the individual ports. */ - int cpu_port; - u32 valid_port_mask; - struct mii_bus *slave_mii_bus; - struct net_device *ports[DSA_MAX_PORTS]; + u32 dsa_port_mask; + u32 phys_port_mask; + struct mii_bus *slave_mii_bus; + struct net_device *ports[DSA_MAX_PORTS]; +}; + +struct dsa_switch_tree { + /* + * Configuration data for the platform device that owns + * this dsa switch tree instance. + */ + struct dsa_platform_data *pd; + + /* + * Reference to network device to use, and which tagging + * protocol to use. + */ + struct net_device *master_netdev; + __be16 tag_protocol; + + /* + * The switch and port to which the CPU is attached. + */ + s8 cpu_switch; + s8 cpu_port; /* * Link state polling. */ - struct work_struct link_poll_work; - struct timer_list link_poll_timer; + int link_poll_needed; + struct work_struct link_poll_work; + struct timer_list link_poll_timer; + + /* + * Data for the individual switch chips. + */ + struct dsa_switch *ds[DSA_MAX_SWITCHES]; }; +static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) +{ + return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port); +} + +static inline u8 dsa_upstream_port(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + /* + * If this is the root switch (i.e. the switch that connects + * to the CPU), return the cpu port number on this switch. + * Else return the (DSA) port number that connects to the + * switch that is one hop closer to the cpu. + */ + if (dst->cpu_switch == ds->index) + return dst->cpu_port; + else + return ds->pd->rtable[dst->cpu_switch]; +} + struct dsa_slave_priv { + /* + * The linux network interface corresponding to this + * switch port. + */ struct net_device *dev; + + /* + * Which switch this port is a part of, and the port index + * for this port. + */ struct dsa_switch *parent; - int port; + u8 port; + + /* + * The phylib phy_device pointer for the PHY connected + * to this port. + */ struct phy_device *phy; }; diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c index 85081ae9fe89..83277f463af7 100644 --- a/net/dsa/mv88e6060.c +++ b/net/dsa/mv88e6060.c @@ -1,6 +1,6 @@ /* * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -81,7 +81,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds) /* * Reset the switch. */ - REG_WRITE(REG_GLOBAL, 0x0A, 0xa130); + REG_WRITE(REG_GLOBAL, 0x0a, 0xa130); /* * Wait up to one second for reset to complete. @@ -128,7 +128,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) * state to Forwarding. Additionally, if this is the CPU * port, enable Ingress and Egress Trailer tagging mode. */ - REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003); + REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003); /* * Port based VLAN map: give each port its own address @@ -138,9 +138,9 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) */ REG_WRITE(addr, 0x06, ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? - ds->valid_port_mask : - (1 << ds->cpu_port))); + (dsa_is_cpu_port(ds, p) ? + ds->phys_port_mask : + (1 << ds->dst->cpu_port))); /* * Port Association Vector: when learning source addresses diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c index ec8c6a0482d3..52faaa21a4d9 100644 --- a/net/dsa/mv88e6123_61_65.c +++ b/net/dsa/mv88e6123_61_65.c @@ -1,6 +1,6 @@ /* * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -98,17 +98,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) return ret; /* - * Configure the cpu port, and configure the cpu port as the - * port to which ingress and egress monitor frames are to be - * sent. + * Configure the upstream port, and configure the upstream + * port as the port to which ingress and egress monitor frames + * are to be sent. */ - REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110)); + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110)); /* * Disable remote management for now, and set the switch's - * DSA device number to zero. + * DSA device number. */ - REG_WRITE(REG_GLOBAL, 0x1c, 0x0000); + REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f); /* * Send all frames with destination addresses matching @@ -133,10 +133,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); /* - * Map all DSA device IDs to the CPU port. + * Program the DSA routing table. */ - for (i = 0; i < 32; i++) - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } /* * Clear all trunk masks. @@ -176,12 +183,18 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) { int addr = REG_PORT(p); + u16 val; /* * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values. + * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * full duplex. */ - REG_WRITE(addr, 0x01, 0x0003); + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + REG_WRITE(addr, 0x01, 0x003e); + else + REG_WRITE(addr, 0x01, 0x0003); /* * Do not limit the period of time that this port can be @@ -192,37 +205,50 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) /* * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock, - * configure the requested (DSA/EDSA) tagging mode if this is - * the CPU port, disable Header mode, enable IGMP/MLD snooping, - * disable VLAN tunneling, determine priority by looking at - * 802.1p and IP priority fields (IP prio has precedence), and - * set STP state to Forwarding. Finally, if this is the CPU - * port, additionally enable forwarding of unknown unicast and - * multicast addresses. - */ - REG_WRITE(addr, 0x04, - (p == ds->cpu_port) ? - (ds->tag_protocol == htons(ETH_P_DSA)) ? - 0x053f : 0x373f : - 0x0433); + * disable Header mode, enable IGMP/MLD snooping, disable VLAN + * tunneling, determine priority by looking at 802.1p and IP + * priority fields (IP prio has precedence), and set STP state + * to Forwarding. + * + * If this is the CPU link, use DSA or EDSA tagging depending + * on which tagging mode was configured. + * + * If this is a link to another switch, use DSA tagging mode. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts and multicasts. + */ + val = 0x0433; + if (dsa_is_cpu_port(ds, p)) { + if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) + val |= 0x3300; + else + val |= 0x0100; + } + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + if (p == dsa_upstream_port(ds)) + val |= 0x000c; + REG_WRITE(addr, 0x04, val); /* * Port Control 1: disable trunking. Also, if this is the * CPU port, enable learn messages to be sent to this port. */ - REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); /* * Port based VLAN map: give each port its own address * database, allow the CPU port to talk to each of the 'real' * ports, and allow each of the 'real' ports to only talk to - * the CPU port. + * the upstream port. */ - REG_WRITE(addr, 0x06, - ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? - ds->valid_port_mask : - (1 << ds->cpu_port))); + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); /* * Default VLAN ID and priority: don't set a default VLAN @@ -394,7 +420,7 @@ static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds) } static struct dsa_switch_driver mv88e6123_61_65_switch_driver = { - .tag_protocol = __constant_htons(ETH_P_EDSA), + .tag_protocol = cpu_to_be16(ETH_P_EDSA), .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6123_61_65_probe, .setup = mv88e6123_61_65_setup, diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c index 374d46a01265..bb2b41bc854e 100644 --- a/net/dsa/mv88e6131.c +++ b/net/dsa/mv88e6131.c @@ -1,6 +1,6 @@ /* - * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support - * Copyright (c) 2008 Marvell Semiconductor + * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,6 +21,8 @@ static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); if (ret >= 0) { ret &= 0xfff0; + if (ret == 0x0950) + return "Marvell 88E6095/88E6095F"; if (ret == 0x1060) return "Marvell 88E6131"; } @@ -36,7 +38,7 @@ static int mv88e6131_switch_reset(struct dsa_switch *ds) /* * Set all ports to the disabled state. */ - for (i = 0; i < 8; i++) { + for (i = 0; i < 11; i++) { ret = REG_READ(REG_PORT(i), 0x04); REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); } @@ -100,17 +102,17 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL, 0x19, 0x8100); /* - * Disable ARP mirroring, and configure the cpu port as the - * port to which ingress and egress monitor frames are to be - * sent. + * Disable ARP mirroring, and configure the upstream port as + * the port to which ingress and egress monitor frames are to + * be sent. */ - REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0); + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0); /* * Disable cascade port functionality, and set the switch's - * DSA device number to zero. + * DSA device number. */ - REG_WRITE(REG_GLOBAL, 0x1c, 0xe000); + REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f)); /* * Send all frames with destination addresses matching @@ -127,16 +129,23 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); /* - * Map all DSA device IDs to the CPU port. + * Program the DSA routing table. */ - for (i = 0; i < 32; i++) - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } /* * Clear all trunk masks. */ for (i = 0; i < 8; i++) - REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff); + REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff); /* * Clear all trunk mappings. @@ -156,12 +165,18 @@ static int mv88e6131_setup_global(struct dsa_switch *ds) static int mv88e6131_setup_port(struct dsa_switch *ds, int p) { int addr = REG_PORT(p); + u16 val; /* * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values. + * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * full duplex. */ - REG_WRITE(addr, 0x01, 0x0003); + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + REG_WRITE(addr, 0x01, 0x003e); + else + REG_WRITE(addr, 0x01, 0x0003); /* * Port Control: disable Core Tag, disable Drop-on-Lock, @@ -169,29 +184,40 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN * tunneling, determine priority by looking at 802.1p and * IP priority fields (IP prio has precedence), and set STP - * state to Forwarding. Finally, if this is the CPU port, - * additionally enable DSA tagging and forwarding of unknown - * unicast addresses. + * state to Forwarding. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts, and enable DSA tagging + * mode. + * + * If this is the link to another switch, use DSA tagging + * mode, but do not enable forwarding of unknown unicasts. */ - REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433); + val = 0x0433; + if (p == dsa_upstream_port(ds)) + val |= 0x0104; + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + REG_WRITE(addr, 0x04, val); /* * Port Control 1: disable trunking. Also, if this is the * CPU port, enable learn messages to be sent to this port. */ - REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); /* * Port based VLAN map: give each port its own address * database, allow the CPU port to talk to each of the 'real' * ports, and allow each of the 'real' ports to only talk to - * the CPU port. + * the upstream port. */ - REG_WRITE(addr, 0x06, - ((p & 0xf) << 12) | - ((p == ds->cpu_port) ? - ds->valid_port_mask : - (1 << ds->cpu_port))); + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); /* * Default VLAN ID and priority: don't set a default VLAN @@ -207,13 +233,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * untagged frames on this port, do a destination address * lookup on received packets as usual, don't send a copy * of all transmitted/received frames on this port to the - * CPU, and configure the CPU port number. Also, if this - * is the CPU port, enable forwarding of unknown multicast - * addresses. + * CPU, and configure the upstream port number. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown multicast addresses. */ - REG_WRITE(addr, 0x08, - ((p == ds->cpu_port) ? 0x00c0 : 0x0080) | - ds->cpu_port); + val = 0x0080 | dsa_upstream_port(ds); + if (p == dsa_upstream_port(ds)) + val |= 0x0040; + REG_WRITE(addr, 0x08, val); /* * Rate Control: disable ingress rate limiting. @@ -268,7 +296,7 @@ static int mv88e6131_setup(struct dsa_switch *ds) if (ret < 0) return ret; - for (i = 0; i < 6; i++) { + for (i = 0; i < 11; i++) { ret = mv88e6131_setup_port(ds, i); if (ret < 0) return ret; @@ -279,7 +307,7 @@ static int mv88e6131_setup(struct dsa_switch *ds) static int mv88e6131_port_to_phy_addr(int port) { - if (port >= 0 && port != 3 && port <= 7) + if (port >= 0 && port <= 11) return port; return -1; } @@ -353,7 +381,7 @@ static int mv88e6131_get_sset_count(struct dsa_switch *ds) } static struct dsa_switch_driver mv88e6131_switch_driver = { - .tag_protocol = __constant_htons(ETH_P_DSA), + .tag_protocol = cpu_to_be16(ETH_P_DSA), .priv_size = sizeof(struct mv88e6xxx_priv_state), .probe = mv88e6131_probe, .setup = mv88e6131_setup, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a68fd79e9eca..ed131181215d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1,6 +1,6 @@ /* * net/dsa/slave.c - Slave device handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; - if (ds->valid_port_mask & (1 << addr)) + if (ds->phys_port_mask & (1 << addr)) return ds->drv->phy_read(ds, addr, reg); return 0xffff; @@ -29,7 +29,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; - if (ds->valid_port_mask & (1 << addr)) + if (ds->phys_port_mask & (1 << addr)) return ds->drv->phy_write(ds, addr, reg, val); return 0; @@ -43,15 +43,24 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x", ds->master_mii_bus->id, ds->pd->sw_addr); - ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev); + ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; } /* slave device handling ****************************************************/ +static int dsa_slave_init(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + dev->iflink = p->parent->dst->master_netdev->ifindex; + + return 0; +} + static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; int err; if (!(master->flags & IFF_UP)) @@ -89,7 +98,7 @@ out: static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; dev_mc_unsync(master, dev); dev_unicast_unsync(master, dev); @@ -107,7 +116,7 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -118,7 +127,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) static void dsa_slave_set_rx_mode(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; dev_mc_sync(master, dev); dev_unicast_sync(master, dev); @@ -127,7 +136,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev) static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->master_netdev; + struct net_device *master = p->parent->dst->master_netdev; struct sockaddr *addr = a; int err; @@ -288,6 +297,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { #ifdef CONFIG_NET_DSA_TAG_DSA static const struct net_device_ops dsa_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = dsa_xmit, @@ -300,6 +310,7 @@ static const struct net_device_ops dsa_netdev_ops = { #endif #ifdef CONFIG_NET_DSA_TAG_EDSA static const struct net_device_ops edsa_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = edsa_xmit, @@ -312,6 +323,7 @@ static const struct net_device_ops edsa_netdev_ops = { #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER static const struct net_device_ops trailer_netdev_ops = { + .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = trailer_xmit, @@ -328,7 +340,7 @@ struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) { - struct net_device *master = ds->master_netdev; + struct net_device *master = ds->dst->master_netdev; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; @@ -343,7 +355,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN); slave_dev->tx_queue_len = 0; - switch (ds->tag_protocol) { + switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): slave_dev->netdev_ops = &dsa_netdev_ops; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index f99a019b939e..8fa25bafe6ca 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,7 +36,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct tagged FROM_CPU DSA tag from 802.1q tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60; + dsa_header[0] = 0x60 | p->parent->index; dsa_header[1] = p->port << 3; /* @@ -57,7 +57,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct untagged FROM_CPU DSA tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40; + dsa_header[0] = 0x40 | p->parent->index; dsa_header[1] = p->port << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; @@ -65,7 +65,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = htons(ETH_P_DSA); - skb->dev = p->parent->master_netdev; + skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); return NETDEV_TX_OK; @@ -78,11 +78,13 @@ out_free: static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *dsa_header; + int source_device; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) goto out_drop; skb = skb_unshare(skb, GFP_ATOMIC); @@ -98,16 +100,24 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, dsa_header = skb->data - 2; /* - * Check that frame type is either TO_CPU or FORWARD, and - * that the source device is zero. + * Check that frame type is either TO_CPU or FORWARD. */ - if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0) + if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) goto out_drop; /* - * Check that the source port is a registered DSA port. + * Determine source device and port. */ + source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. + */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; @@ -175,8 +185,8 @@ out: return 0; } -static struct packet_type dsa_packet_type = { - .type = __constant_htons(ETH_P_DSA), +static struct packet_type dsa_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_DSA), .func = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 328ec957f786..815607bd286f 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_edsa.c - Ethertype DSA tagging - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,7 +45,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x60; + edsa_header[4] = 0x60 | p->parent->index; edsa_header[5] = p->port << 3; /* @@ -70,7 +70,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x40; + edsa_header[4] = 0x40 | p->parent->index; edsa_header[5] = p->port << 3; edsa_header[6] = 0x00; edsa_header[7] = 0x00; @@ -78,7 +78,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = htons(ETH_P_EDSA); - skb->dev = p->parent->master_netdev; + skb->dev = p->parent->dst->master_netdev; dev_queue_xmit(skb); return NETDEV_TX_OK; @@ -91,11 +91,13 @@ out_free: static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *edsa_header; + int source_device; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) goto out_drop; skb = skb_unshare(skb, GFP_ATOMIC); @@ -111,16 +113,24 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, edsa_header = skb->data + 2; /* - * Check that frame type is either TO_CPU or FORWARD, and - * that the source device is zero. + * Check that frame type is either TO_CPU or FORWARD. */ - if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0) + if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0) goto out_drop; /* - * Check that the source port is a registered DSA port. + * Determine source device and port. */ + source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. + */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) goto out_drop; @@ -194,8 +204,8 @@ out: return 0; } -static struct packet_type edsa_packet_type = { - .type = __constant_htons(ETH_P_EDSA), +static struct packet_type edsa_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_EDSA), .func = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b59132878ad1..1c3e30c38b86 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -1,6 +1,6 @@ /* * net/dsa/tag_trailer.c - Trailer tag format handling - * Copyright (c) 2008 Marvell Semiconductor + * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -59,7 +59,7 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev) nskb->protocol = htons(ETH_P_TRAILER); - nskb->dev = p->parent->master_netdev; + nskb->dev = p->parent->dst->master_netdev; dev_queue_xmit(nskb); return NETDEV_TX_OK; @@ -68,12 +68,14 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev) static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct dsa_switch *ds = dev->dsa_ptr; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; u8 *trailer; int source_port; - if (unlikely(ds == NULL)) + if (unlikely(dst == NULL)) goto out_drop; + ds = dst->ds[0]; skb = skb_unshare(skb, GFP_ATOMIC); if (skb == NULL) @@ -111,8 +113,8 @@ out: return 0; } -static struct packet_type trailer_packet_type = { - .type = __constant_htons(ETH_P_TRAILER), +static struct packet_type trailer_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_TRAILER), .func = trailer_rcv, }; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 8789d2bb1b06..6f479fa522c3 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1102,8 +1102,8 @@ drop: return NET_RX_DROP; } -static struct packet_type econet_packet_type = { - .type = __constant_htons(ETH_P_ECONET), +static struct packet_type econet_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_ECONET), .func = econet_rcv, }; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 691268f3a359..b2cf91e4ccaa 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER at boot time after the /proc file system has been mounted. - If you turn on IP forwarding, you will also get the rp_filter, which + If you turn on IP forwarding, you should consider the rp_filter, which automatically rejects incoming packets if the routing table entry for their source address doesn't match the network interface they're arriving on. This has security advantages because it prevents the @@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter - or + and echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter + Note that some distributions enable it in startup scripts. + For details about rp_filter strict and loose mode read + <file:Documentation/networking/ip-sysctl.txt>. + If unsure, say N here. -choice +choice prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" depends on IP_ADVANCED_ROUTER default ASK_IP_FIB_HASH @@ -59,27 +63,29 @@ choice config ASK_IP_FIB_HASH bool "FIB_HASH" ---help--- - Current FIB is very proven and good enough for most users. + Current FIB is very proven and good enough for most users. config IP_FIB_TRIE bool "FIB_TRIE" ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ - + Use new experimental LC-trie as FIB lookup algorithm. + This improves lookup performance if you have a large + number of routes. + + LC-trie is a longest matching prefix lookup algorithm which + performs better than FIB_HASH for large routing tables. + But, it consumes more memory and is more complex. + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, + June 1999 + + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + endchoice config IP_FIB_HASH @@ -191,7 +197,7 @@ config IP_PNP_RARP <file:Documentation/filesystems/nfsroot.txt> for details. # not yet ready.. -# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +# bool ' IP: ARP support' CONFIG_IP_PNP_ARP config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL @@ -361,7 +367,7 @@ config INET_IPCOMP ---help--- Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. - + If unsure, say Y. config INET_XFRM_TUNNEL @@ -415,7 +421,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. - + If unsure, say Y. config INET_TCP_DIAG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..d5aaabbb7cb3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -369,7 +369,6 @@ lookup_protocol: sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; - sk->sk_family = PF_INET; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@ -1253,10 +1252,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, int proto; int id; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); @@ -1264,13 +1263,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!ops || !ops->gro_receive) goto out_unlock; - if (iph->version != 4 || iph->ihl != 5) + if (*(u8 *)iph != 0x45) goto out_unlock; if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb->len || + flush = ntohs(iph->tot_len) != skb_gro_len(skb) || iph->frag_off != htons(IP_DF); id = ntohs(iph->id); @@ -1282,24 +1281,25 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, iph2 = ip_hdr(p); - if (iph->protocol != iph2->protocol || - iph->tos != iph2->tos || - memcmp(&iph->saddr, &iph2->saddr, 8)) { + if ((iph->protocol ^ iph2->protocol) | + (iph->tos ^ iph2->tos) | + (iph->saddr ^ iph2->saddr) | + (iph->daddr ^ iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* All fields must match except length and checksum. */ NAPI_GRO_CB(p)->flush |= - memcmp(&iph->frag_off, &iph2->frag_off, 4) || - (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id; + (iph->ttl ^ iph2->ttl) | + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } NAPI_GRO_CB(skb)->flush |= flush; - __skb_pull(skb, sizeof(*iph)); - skb_reset_transport_header(skb); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); pp = ops->gro_receive(head, skb); @@ -1500,8 +1500,8 @@ static int ipv4_proc_init(void); * IP protocol layer initialiser */ -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), +static struct packet_type ip_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 29a74c01d8de..f11931c18381 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb) * cache. */ - /* Special case: IPv4 duplicate address detection packet (RFC2131) */ - if (sip == 0) { + /* + * Special case: IPv4 duplicate address detection packet (RFC2131) + * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4) + */ + if (sip == 0 || tip == sip) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type(net, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) @@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } @@ -1225,8 +1228,8 @@ void arp_ifdown(struct net_device *dev) * Called once on startup. */ -static struct packet_type arp_packet_type = { - .type = __constant_htons(ETH_P_ARP), +static struct packet_type arp_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 309997edc8a5..126bb911880f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } } ip_mc_up(in_dev); + /* fall through */ + case NETDEV_CHANGEADDR: + if (IN_DEV_ARP_NOTIFY(in_dev)) + arp_send(ARPOP_REQUEST, ETH_P_ARP, + in_dev->ifa_list->ifa_address, + dev, + in_dev->ifa_list->ifa_address, + NULL, dev->dev_addr, NULL); break; case NETDEV_DOWN: ip_mc_down(in_dev); @@ -1208,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); @@ -1439,6 +1448,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), + DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 741e4fa3e474..cafcc49d0993 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fib_res_put(&res); if (no_addr) goto last_resort; - if (rpf) + if (rpf == 1) goto e_inval; fl.oif = dev->ifindex; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4817dea3bc73..f831df500907 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, - info->nlh, GFP_KERNEL); + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc562d29cc46..3f50807237e0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -375,6 +375,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; + ipc.shtx.flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -532,6 +533,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; + ipc.shtx.flags = 0; { struct flowi fl = { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f26ab38680de..22cd19ee44e5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -93,24 +93,40 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct inet_bind_hashbucket *head; struct hlist_node *node; struct inet_bind_bucket *tb; - int ret; + int ret, attempts = 5; struct net *net = sock_net(sk); + int smallest_size = -1, smallest_rover; local_bh_disable(); if (!snum) { int remaining, rover, low, high; +again: inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; - rover = net_random() % remaining + low; + smallest_rover = rover = net_random() % remaining + low; + smallest_size = -1; do { head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == rover) + if (ib_net(tb) == net && tb->port == rover) { + if (tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_rover = rover; + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; + } + } goto next; + } break; next: spin_unlock(&head->lock); @@ -125,14 +141,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) * the top level, not from the 'break;' statement. */ ret = 1; - if (remaining <= 0) + if (remaining <= 0) { + if (smallest_size != -1) { + snum = smallest_rover; + goto have_snum; + } goto fail; - + } /* OK, here is the one we will use. HEAD is * non-NULL and we hold it's mutex. */ snum = rover; } else { +have_snum: head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)]; spin_lock(&head->lock); @@ -145,12 +166,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) tb_found: if (!hlist_empty(&tb->owners)) { if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size == -1) { goto success; } else { ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size != -1 && --attempts >= 0) { + spin_unlock(&head->lock); + goto again; + } goto fail_unlock; + } } } tb_not_found: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6c52e08f786e..eaf3e2c8646a 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) + __releases(&f->lock) { struct inet_frag_queue *q; struct hlist_node *n; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 6a1045da48d2..625cc5f64c94 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, write_pnet(&tb->ib_net, hold_net(net)); tb->port = snum; tb->fastreuse = 0; + tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } @@ -59,8 +60,13 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + atomic_inc(&hashinfo->bsockets); + inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); + tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; } @@ -75,9 +81,12 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; + atomic_dec(&hashinfo->bsockets); + spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); + tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); @@ -444,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, */ inet_bind_bucket_for_each(tb, node, &head->chain) { if (ib_net(tb) == net && tb->port == port) { - WARN_ON(hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) goto next_port; + WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, port, &tw)) goto ok; @@ -523,6 +532,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; + atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0101521f366b..e62510d5ea5a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -164,67 +164,124 @@ static DEFINE_RWLOCK(ipgre_lock); /* Given src, dst and key, find appropriate for input tunnel. */ -static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, +static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, __be32 remote, __be32 local, __be32 key, __be16 gre_proto) { + struct net *net = dev_net(dev); + int link = dev->ifindex; unsigned h0 = HASH(remote); unsigned h1 = HASH(key); - struct ip_tunnel *t; - struct ip_tunnel *t2 = NULL; + struct ip_tunnel *t, *cand = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB)) ? ARPHRD_ETHER : ARPHRD_IPGRE; + int score, cand_score = 4; for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if (local != t->parms.iph.saddr || + remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { - if (remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if (remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } for (t = ign->tunnels_l[h1]; t; t = t->next) { - if (local == t->parms.iph.saddr || - (local == t->parms.iph.daddr && - ipv4_is_multicast(local))) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; - } + if ((local != t->parms.iph.saddr && + (local != t->parms.iph.daddr || + !ipv4_is_multicast(local))) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } for (t = ign->tunnels_wc[h1]; t; t = t->next) { - if (t->parms.i_key == key && t->dev->flags & IFF_UP) { - if (t->dev->type == dev_type) - return t; - if (t->dev->type == ARPHRD_IPGRE && !t2) - t2 = t; + if (t->parms.i_key != key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; } } - if (t2) - return t2; + if (cand != NULL) + return cand; - if (ign->fb_tunnel_dev->flags&IFF_UP) + if (ign->fb_tunnel_dev->flags & IFF_UP) return netdev_priv(ign->fb_tunnel_dev); + return NULL; } @@ -284,6 +341,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; + int link = parms->link; struct ip_tunnel *t, **tp; struct ipgre_net *ign = net_generic(net, ipgre_net_id); @@ -291,6 +349,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && key == t->parms.i_key && + link == t->parms.link && type == t->dev->type) break; @@ -421,7 +480,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) } read_lock(&ipgre_lock); - t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, + t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, flags & GRE_KEY ? *(((__be32 *)p) + (grehlen / 4) - 1) : 0, p[1]); @@ -432,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -518,7 +577,7 @@ static int ipgre_rcv(struct sk_buff *skb) gre_proto = *(__be16 *)(h + 2); read_lock(&ipgre_lock); - if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), + if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { struct net_device_stats *stats = &tunnel->dev->stats; @@ -744,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) #endif if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8ebe86dd72af..3e7e910c7c0f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -935,6 +935,10 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else + /* only the initial fragment is + time stamped */ + ipc->shtx.flags = 0; } if (skb == NULL) goto error; @@ -945,6 +949,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + *skb_tx(skb) = ipc->shtx; /* * Find where to start putting bytes. @@ -1364,6 +1369,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; + ipc.shtx.flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d722013c1cae..90d22ae0a419 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -100,8 +100,8 @@ #define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers - '3' from resolv.h */ -#define NONE __constant_htonl(INADDR_NONE) -#define ANY __constant_htonl(INADDR_ANY) +#define NONE cpu_to_be32(INADDR_NONE) +#define ANY cpu_to_be32(INADDR_ANY) /* * Public IP configuration @@ -406,7 +406,7 @@ static int __init ic_defaults(void) static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { - .type = __constant_htons(ETH_P_RARP), + .type = cpu_to_be16(ETH_P_RARP), .func = ic_rarp_recv, }; @@ -568,7 +568,7 @@ struct bootp_pkt { /* BOOTP packet format */ static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { - .type = __constant_htons(ETH_P_IP), + .type = cpu_to_be16(ETH_P_IP), .func = ic_bootp_recv, }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 5079dfbc6f38..9054139795af 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else @@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = { .priority = 1, }; -static char banner[] __initdata = +static const char banner[] __initconst = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; static void ipip_destroy_tunnels(struct ipip_net *ipn) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 14666449dc1c..13e9dd3012b3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,9 +67,6 @@ #define CONFIG_IP_PIMSM 1 #endif -static struct sock *mroute_socket; - - /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ @@ -80,18 +77,9 @@ static DEFINE_RWLOCK(mrt_lock); * Multicast router control variables */ -static struct vif_device vif_table[MAXVIFS]; /* Devices */ -static int maxvif; - -#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL) - -static int mroute_do_assert; /* Set in PIM assert */ -static int mroute_do_pim; - -static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ +#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL) static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ -static atomic_t cache_resolve_queue_len; /* Size of unresolved */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -107,7 +95,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_cache_report(struct net *net, + struct sk_buff *pkt, vifi_t vifi, int assert); static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); #ifdef CONFIG_IP_PIMSM_V2 @@ -120,9 +109,11 @@ static struct timer_list ipmr_expire_timer; static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) { + struct net *net = dev_net(dev); + dev_close(dev); - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; @@ -148,11 +139,11 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) } static -struct net_device *ipmr_new_tunnel(struct vifctl *v) +struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *dev; - dev = __dev_get_by_name(&init_net, "tunl0"); + dev = __dev_get_by_name(net, "tunl0"); if (dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -181,7 +172,8 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { + if (err == 0 && + (dev = __dev_get_by_name(net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); @@ -209,14 +201,15 @@ failure: #ifdef CONFIG_IP_PIMSM -static int reg_vif_num = -1; - static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net *net = dev_net(dev); + read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num, + IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return 0; @@ -283,16 +276,16 @@ failure: * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(int vifi, int notify) +static int vif_delete(struct net *net, int vifi, int notify) { struct vif_device *v; struct net_device *dev; struct in_device *in_dev; - if (vifi < 0 || vifi >= maxvif) + if (vifi < 0 || vifi >= net->ipv4.maxvif) return -EADDRNOTAVAIL; - v = &vif_table[vifi]; + v = &net->ipv4.vif_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; @@ -304,17 +297,17 @@ static int vif_delete(int vifi, int notify) } #ifdef CONFIG_IP_PIMSM - if (vifi == reg_vif_num) - reg_vif_num = -1; + if (vifi == net->ipv4.mroute_reg_vif_num) + net->ipv4.mroute_reg_vif_num = -1; #endif - if (vifi+1 == maxvif) { + if (vifi+1 == net->ipv4.maxvif) { int tmp; for (tmp=vifi-1; tmp>=0; tmp--) { - if (VIF_EXISTS(tmp)) + if (VIF_EXISTS(net, tmp)) break; } - maxvif = tmp+1; + net->ipv4.maxvif = tmp+1; } write_unlock_bh(&mrt_lock); @@ -333,6 +326,12 @@ static int vif_delete(int vifi, int notify) return 0; } +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + release_net(mfc_net(c)); + kmem_cache_free(mrt_cachep, c); +} + /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ @@ -341,8 +340,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c) { struct sk_buff *skb; struct nlmsgerr *e; + struct net *net = mfc_net(c); - atomic_dec(&cache_resolve_queue_len); + atomic_dec(&net->ipv4.cache_resolve_queue_len); while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { @@ -354,12 +354,12 @@ static void ipmr_destroy_unres(struct mfc_cache *c) e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); - rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); } else kfree_skb(skb); } - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); } @@ -376,7 +376,7 @@ static void ipmr_expire_process(unsigned long dummy) return; } - if (atomic_read(&cache_resolve_queue_len) == 0) + if (mfc_unres_queue == NULL) goto out; now = jiffies; @@ -397,7 +397,7 @@ static void ipmr_expire_process(unsigned long dummy) ipmr_destroy_unres(c); } - if (atomic_read(&cache_resolve_queue_len)) + if (mfc_unres_queue != NULL) mod_timer(&ipmr_expire_timer, jiffies + expires); out: @@ -409,13 +409,15 @@ out: static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) { int vifi; + struct net *net = mfc_net(cache); cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); - for (vifi=0; vifi<maxvif; vifi++) { - if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) { + for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) { + if (VIF_EXISTS(net, vifi) && + ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; @@ -425,16 +427,16 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) } } -static int vif_add(struct vifctl *vifc, int mrtsock) +static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; - struct vif_device *v = &vif_table[vifi]; + struct vif_device *v = &net->ipv4.vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? */ - if (VIF_EXISTS(vifi)) + if (VIF_EXISTS(net, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { @@ -444,7 +446,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ - if (reg_vif_num >= 0) + if (net->ipv4.mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ipmr_reg_vif(); if (!dev) @@ -458,7 +460,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) break; #endif case VIFF_TUNNEL: - dev = ipmr_new_tunnel(vifc); + dev = ipmr_new_tunnel(net, vifc); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); @@ -469,7 +471,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) } break; case 0: - dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); + dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -510,20 +512,22 @@ static int vif_add(struct vifctl *vifc, int mrtsock) v->dev = dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) - reg_vif_num = vifi; + net->ipv4.mroute_reg_vif_num = vifi; #endif - if (vifi+1 > maxvif) - maxvif = vifi+1; + if (vifi+1 > net->ipv4.maxvif) + net->ipv4.maxvif = vifi+1; write_unlock_bh(&mrt_lock); return 0; } -static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) +static struct mfc_cache *ipmr_cache_find(struct net *net, + __be32 origin, + __be32 mcastgrp) { int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - for (c=mfc_cache_array[line]; c; c = c->next) { + for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) { if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) break; } @@ -533,22 +537,24 @@ static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp) /* * Allocate a multicast cache entry */ -static struct mfc_cache *ipmr_cache_alloc(void) +static struct mfc_cache *ipmr_cache_alloc(struct net *net) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c == NULL) return NULL; c->mfc_un.res.minvif = MAXVIFS; + mfc_net_set(c, net); return c; } -static struct mfc_cache *ipmr_cache_alloc_unres(void) +static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c == NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10*HZ; + mfc_net_set(c, net); return c; } @@ -581,7 +587,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); + rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid); } else ip_mr_forward(skb, c, 0); } @@ -594,7 +600,8 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) * Called under mrt_lock. */ -static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +static int ipmr_cache_report(struct net *net, + struct sk_buff *pkt, vifi_t vifi, int assert) { struct sk_buff *skb; const int ihl = ip_hdrlen(pkt); @@ -626,7 +633,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = IGMPMSG_WHOLEPKT; msg->im_mbz = 0; - msg->im_vif = reg_vif_num; + msg->im_vif = net->ipv4.mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -658,7 +665,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) skb->transport_header = skb->network_header; } - if (mroute_socket == NULL) { + if (net->ipv4.mroute_sk == NULL) { kfree_skb(skb); return -EINVAL; } @@ -666,7 +673,8 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) /* * Deliver to mrouted */ - if ((ret = sock_queue_rcv_skb(mroute_socket, skb))<0) { + ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb); + if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); kfree_skb(skb); @@ -680,7 +688,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) */ static int -ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) +ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) { int err; struct mfc_cache *c; @@ -688,7 +696,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) spin_lock_bh(&mfc_unres_lock); for (c=mfc_unres_queue; c; c=c->next) { - if (c->mfc_mcastgrp == iph->daddr && + if (net_eq(mfc_net(c), net) && + c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) break; } @@ -698,8 +707,8 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * Create a new entry if allowable */ - if (atomic_read(&cache_resolve_queue_len) >= 10 || - (c=ipmr_cache_alloc_unres())==NULL) { + if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 || + (c = ipmr_cache_alloc_unres(net)) == NULL) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -716,18 +725,19 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) /* * Reflect first query at mrouted. */ - if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { + err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); kfree_skb(skb); return err; } - atomic_inc(&cache_resolve_queue_len); + atomic_inc(&net->ipv4.cache_resolve_queue_len); c->next = mfc_unres_queue; mfc_unres_queue = c; @@ -753,35 +763,37 @@ ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) * MFC cache manipulation by user space mroute daemon */ -static int ipmr_mfc_delete(struct mfcctl *mfc) +static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) { int line; struct mfc_cache *c, **cp; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + for (cp = &net->ipv4.mfc_cache_array[line]; + (c = *cp) != NULL; cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { write_lock_bh(&mrt_lock); *cp = c->next; write_unlock_bh(&mrt_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); return 0; } } return -ENOENT; } -static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) +static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) { int line; struct mfc_cache *uc, *c, **cp; line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + for (cp = &net->ipv4.mfc_cache_array[line]; + (c = *cp) != NULL; cp = &c->next) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) break; @@ -800,7 +812,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; - c = ipmr_cache_alloc(); + c = ipmr_cache_alloc(net); if (c == NULL) return -ENOMEM; @@ -812,8 +824,8 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) c->mfc_flags |= MFC_STATIC; write_lock_bh(&mrt_lock); - c->next = mfc_cache_array[line]; - mfc_cache_array[line] = c; + c->next = net->ipv4.mfc_cache_array[line]; + net->ipv4.mfc_cache_array[line] = c; write_unlock_bh(&mrt_lock); /* @@ -823,19 +835,21 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) spin_lock_bh(&mfc_unres_lock); for (cp = &mfc_unres_queue; (uc=*cp) != NULL; cp = &uc->next) { - if (uc->mfc_origin == c->mfc_origin && + if (net_eq(mfc_net(uc), net) && + uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { *cp = uc->next; - if (atomic_dec_and_test(&cache_resolve_queue_len)) - del_timer(&ipmr_expire_timer); + atomic_dec(&net->ipv4.cache_resolve_queue_len); break; } } + if (mfc_unres_queue == NULL) + del_timer(&ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (uc) { ipmr_cache_resolve(uc, c); - kmem_cache_free(mrt_cachep, uc); + ipmr_cache_free(uc); } return 0; } @@ -844,16 +858,16 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct sock *sk) +static void mroute_clean_tables(struct net *net) { int i; /* * Shut down all active vif entries */ - for (i=0; i<maxvif; i++) { - if (!(vif_table[i].flags&VIFF_STATIC)) - vif_delete(i, 0); + for (i = 0; i < net->ipv4.maxvif; i++) { + if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) + vif_delete(net, i, 0); } /* @@ -862,7 +876,7 @@ static void mroute_clean_tables(struct sock *sk) for (i=0; i<MFC_LINES; i++) { struct mfc_cache *c, **cp; - cp = &mfc_cache_array[i]; + cp = &net->ipv4.mfc_cache_array[i]; while ((c = *cp) != NULL) { if (c->mfc_flags&MFC_STATIC) { cp = &c->next; @@ -872,22 +886,23 @@ static void mroute_clean_tables(struct sock *sk) *cp = c->next; write_unlock_bh(&mrt_lock); - kmem_cache_free(mrt_cachep, c); + ipmr_cache_free(c); } } - if (atomic_read(&cache_resolve_queue_len) != 0) { - struct mfc_cache *c; + if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) { + struct mfc_cache *c, **cp; spin_lock_bh(&mfc_unres_lock); - while (mfc_unres_queue != NULL) { - c = mfc_unres_queue; - mfc_unres_queue = c->next; - spin_unlock_bh(&mfc_unres_lock); + cp = &mfc_unres_queue; + while ((c = *cp) != NULL) { + if (!net_eq(mfc_net(c), net)) { + cp = &c->next; + continue; + } + *cp = c->next; ipmr_destroy_unres(c); - - spin_lock_bh(&mfc_unres_lock); } spin_unlock_bh(&mfc_unres_lock); } @@ -895,15 +910,17 @@ static void mroute_clean_tables(struct sock *sk) static void mrtsock_destruct(struct sock *sk) { + struct net *net = sock_net(sk); + rtnl_lock(); - if (sk == mroute_socket) { - IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--; + if (sk == net->ipv4.mroute_sk) { + IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; write_lock_bh(&mrt_lock); - mroute_socket = NULL; + net->ipv4.mroute_sk = NULL; write_unlock_bh(&mrt_lock); - mroute_clean_tables(sk); + mroute_clean_tables(net); } rtnl_unlock(); } @@ -920,9 +937,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int int ret; struct vifctl vif; struct mfcctl mfc; + struct net *net = sock_net(sk); if (optname != MRT_INIT) { - if (sk != mroute_socket && !capable(CAP_NET_ADMIN)) + if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -935,7 +953,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -ENOPROTOOPT; rtnl_lock(); - if (mroute_socket) { + if (net->ipv4.mroute_sk) { rtnl_unlock(); return -EADDRINUSE; } @@ -943,15 +961,15 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { write_lock_bh(&mrt_lock); - mroute_socket = sk; + net->ipv4.mroute_sk = sk; write_unlock_bh(&mrt_lock); - IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++; + IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != mroute_socket) + if (sk != net->ipv4.mroute_sk) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -964,9 +982,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(&vif, sk==mroute_socket); + ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); } else { - ret = vif_delete(vif.vifc_vifi, 0); + ret = vif_delete(net, vif.vifc_vifi, 0); } rtnl_unlock(); return ret; @@ -983,9 +1001,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int return -EFAULT; rtnl_lock(); if (optname == MRT_DEL_MFC) - ret = ipmr_mfc_delete(&mfc); + ret = ipmr_mfc_delete(net, &mfc); else - ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk); rtnl_unlock(); return ret; /* @@ -996,7 +1014,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int int v; if (get_user(v,(int __user *)optval)) return -EFAULT; - mroute_do_assert=(v)?1:0; + net->ipv4.mroute_do_assert = (v) ? 1 : 0; return 0; } #ifdef CONFIG_IP_PIMSM @@ -1010,11 +1028,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int rtnl_lock(); ret = 0; - if (v != mroute_do_pim) { - mroute_do_pim = v; - mroute_do_assert = v; + if (v != net->ipv4.mroute_do_pim) { + net->ipv4.mroute_do_pim = v; + net->ipv4.mroute_do_assert = v; #ifdef CONFIG_IP_PIMSM_V2 - if (mroute_do_pim) + if (net->ipv4.mroute_do_pim) ret = inet_add_protocol(&pim_protocol, IPPROTO_PIM); else @@ -1045,6 +1063,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int { int olr; int val; + struct net *net = sock_net(sk); if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM @@ -1066,10 +1085,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int val = 0x0305; #ifdef CONFIG_IP_PIMSM else if (optname == MRT_PIM) - val = mroute_do_pim; + val = net->ipv4.mroute_do_pim; #endif else - val = mroute_do_assert; + val = net->ipv4.mroute_do_assert; if (copy_to_user(optval, &val, olr)) return -EFAULT; return 0; @@ -1085,16 +1104,17 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; + struct net *net = sock_net(sk); switch (cmd) { case SIOCGETVIFCNT: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; - if (vr.vifi >= maxvif) + if (vr.vifi >= net->ipv4.maxvif) return -EINVAL; read_lock(&mrt_lock); - vif=&vif_table[vr.vifi]; - if (VIF_EXISTS(vr.vifi)) { + vif = &net->ipv4.vif_table[vr.vifi]; + if (VIF_EXISTS(net, vr.vifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1112,7 +1132,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; read_lock(&mrt_lock); - c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; @@ -1134,18 +1154,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; + struct net *net = dev_net(dev); struct vif_device *v; int ct; - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), net)) return NOTIFY_DONE; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - v=&vif_table[0]; - for (ct=0; ct<maxvif; ct++,v++) { + v = &net->ipv4.vif_table[0]; + for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { if (v->dev == dev) - vif_delete(ct, 1); + vif_delete(net, ct, 1); } return NOTIFY_DONE; } @@ -1205,8 +1226,9 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { + struct net *net = mfc_net(c); const struct iphdr *iph = ip_hdr(skb); - struct vif_device *vif = &vif_table[vifi]; + struct vif_device *vif = &net->ipv4.vif_table[vifi]; struct net_device *dev; struct rtable *rt; int encap = 0; @@ -1220,9 +1242,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->bytes_out += skb->len; vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); - kfree_skb(skb); - return; + ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT); + goto out_free; } #endif @@ -1233,7 +1254,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) .saddr = vif->local, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { @@ -1242,7 +1263,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { .daddr = iph->daddr, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) goto out_free; } @@ -1306,9 +1327,10 @@ out_free: static int ipmr_find_vif(struct net_device *dev) { + struct net *net = dev_net(dev); int ct; - for (ct=maxvif-1; ct>=0; ct--) { - if (vif_table[ct].dev == dev) + for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) { + if (net->ipv4.vif_table[ct].dev == dev) break; } return ct; @@ -1320,6 +1342,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local { int psend = -1; int vif, ct; + struct net *net = mfc_net(cache); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; @@ -1328,7 +1351,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (vif_table[vif].dev != skb->dev) { + if (net->ipv4.vif_table[vif].dev != skb->dev) { int true_vifi; if (skb->rtable->fl.iif == 0) { @@ -1349,23 +1372,24 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local cache->mfc_un.res.wrong_if++; true_vifi = ipmr_find_vif(skb->dev); - if (true_vifi >= 0 && mroute_do_assert && + if (true_vifi >= 0 && net->ipv4.mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ - (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && + (net->ipv4.mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { cache->mfc_un.res.last_assert = jiffies; - ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; } - vif_table[vif].pkt_in++; - vif_table[vif].bytes_in += skb->len; + net->ipv4.vif_table[vif].pkt_in++; + net->ipv4.vif_table[vif].bytes_in += skb->len; /* * Forward the frame @@ -1405,6 +1429,7 @@ dont_forward: int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; + struct net *net = dev_net(skb->dev); int local = skb->rtable->rt_flags&RTCF_LOCAL; /* Packet is looped back after forward, it should not be @@ -1425,9 +1450,9 @@ int ip_mr_input(struct sk_buff *skb) that we can forward NO IGMP messages. */ read_lock(&mrt_lock); - if (mroute_socket) { + if (net->ipv4.mroute_sk) { nf_reset(skb); - raw_rcv(mroute_socket, skb); + raw_rcv(net->ipv4.mroute_sk, skb); read_unlock(&mrt_lock); return 0; } @@ -1436,7 +1461,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* * No usable cache entry @@ -1456,7 +1481,7 @@ int ip_mr_input(struct sk_buff *skb) vif = ipmr_find_vif(skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(vif, skb); + int err = ipmr_cache_unresolved(net, vif, skb); read_unlock(&mrt_lock); return err; @@ -1487,6 +1512,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) { struct net_device *reg_dev = NULL; struct iphdr *encap; + struct net *net = dev_net(skb->dev); encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* @@ -1501,8 +1527,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) return 1; read_lock(&mrt_lock); - if (reg_vif_num >= 0) - reg_dev = vif_table[reg_vif_num].dev; + if (net->ipv4.mroute_reg_vif_num >= 0) + reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -1537,13 +1563,14 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) int pim_rcv_v1(struct sk_buff * skb) { struct igmphdr *pim; + struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) goto drop; pim = igmp_hdr(skb); - if (!mroute_do_pim || + if (!net->ipv4.mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1583,7 +1610,8 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; - struct net_device *dev = vif_table[c->mfc_parent].dev; + struct net *net = mfc_net(c); + struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; @@ -1599,7 +1627,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -1613,14 +1641,15 @@ rtattr_failure: return -EMSGSIZE; } -int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ipmr_get_route(struct net *net, + struct sk_buff *skb, struct rtmsg *rtm, int nowait) { int err; struct mfc_cache *cache; struct rtable *rt = skb->rtable; read_lock(&mrt_lock); - cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); if (cache == NULL) { struct sk_buff *skb2; @@ -1651,7 +1680,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; iph->version = 0; - err = ipmr_cache_unresolved(vif, skb2); + err = ipmr_cache_unresolved(net, vif, skb2); read_unlock(&mrt_lock); return err; } @@ -1668,17 +1697,19 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif */ struct ipmr_vif_iter { + struct seq_net_private p; int ct; }; -static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, +static struct vif_device *ipmr_vif_seq_idx(struct net *net, + struct ipmr_vif_iter *iter, loff_t pos) { - for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { - if (!VIF_EXISTS(iter->ct)) + for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) { + if (!VIF_EXISTS(net, iter->ct)) continue; if (pos-- == 0) - return &vif_table[iter->ct]; + return &net->ipv4.vif_table[iter->ct]; } return NULL; } @@ -1686,23 +1717,26 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { + struct net *net = seq_file_net(seq); + read_lock(&mrt_lock); - return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) + return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); ++*pos; if (v == SEQ_START_TOKEN) - return ipmr_vif_seq_idx(iter, 0); + return ipmr_vif_seq_idx(net, iter, 0); - while (++iter->ct < maxvif) { - if (!VIF_EXISTS(iter->ct)) + while (++iter->ct < net->ipv4.maxvif) { + if (!VIF_EXISTS(net, iter->ct)) continue; - return &vif_table[iter->ct]; + return &net->ipv4.vif_table[iter->ct]; } return NULL; } @@ -1715,6 +1749,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_net(seq); + if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); @@ -1724,7 +1760,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", - vif - vif_table, + vif - net->ipv4.vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); @@ -1741,8 +1777,8 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ipmr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + return seq_open_net(inode, file, &ipmr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { @@ -1750,23 +1786,26 @@ static const struct file_operations ipmr_vif_fops = { .open = ipmr_vif_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; struct ipmr_mfc_iter { + struct seq_net_private p; struct mfc_cache **cache; int ct; }; -static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) +static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, + struct ipmr_mfc_iter *it, loff_t pos) { struct mfc_cache *mfc; - it->cache = mfc_cache_array; + it->cache = net->ipv4.mfc_cache_array; read_lock(&mrt_lock); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) - for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) + for (mfc = net->ipv4.mfc_cache_array[it->ct]; + mfc; mfc = mfc->next) if (pos-- == 0) return mfc; read_unlock(&mrt_lock); @@ -1774,7 +1813,8 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) it->cache = &mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) - if (pos-- == 0) + if (net_eq(mfc_net(mfc), net) && + pos-- == 0) return mfc; spin_unlock_bh(&mfc_unres_lock); @@ -1786,9 +1826,11 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + it->cache = NULL; it->ct = 0; - return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) + return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } @@ -1796,11 +1838,12 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct mfc_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); ++*pos; if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(seq->private, 0); + return ipmr_mfc_seq_idx(net, seq->private, 0); if (mfc->next) return mfc->next; @@ -1808,10 +1851,10 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (it->cache == &mfc_unres_queue) goto end_of_list; - BUG_ON(it->cache != mfc_cache_array); + BUG_ON(it->cache != net->ipv4.mfc_cache_array); while (++it->ct < MFC_LINES) { - mfc = mfc_cache_array[it->ct]; + mfc = net->ipv4.mfc_cache_array[it->ct]; if (mfc) return mfc; } @@ -1823,6 +1866,8 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) spin_lock_bh(&mfc_unres_lock); mfc = mfc_unres_queue; + while (mfc && !net_eq(mfc_net(mfc), net)) + mfc = mfc->next; if (mfc) return mfc; @@ -1836,16 +1881,18 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) { struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); if (it->cache == &mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mfc_cache_array) + else if (it->cache == net->ipv4.mfc_cache_array) read_unlock(&mrt_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; + struct net *net = seq_file_net(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -1866,9 +1913,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; n < mfc->mfc_un.res.maxvif; n++ ) { - if (VIF_EXISTS(n) - && mfc->mfc_un.res.ttls[n] < 255) - seq_printf(seq, + if (VIF_EXISTS(net, n) && + mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, " %2d:%-3d", n, mfc->mfc_un.res.ttls[n]); } @@ -1892,8 +1939,8 @@ static const struct seq_operations ipmr_mfc_seq_ops = { static int ipmr_mfc_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + return seq_open_net(inode, file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { @@ -1901,7 +1948,7 @@ static const struct file_operations ipmr_mfc_fops = { .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif @@ -1915,6 +1962,65 @@ static struct net_protocol pim_protocol = { /* * Setup for IP multicast routing */ +static int __net_init ipmr_net_init(struct net *net) +{ + int err = 0; + + net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device), + GFP_KERNEL); + if (!net->ipv4.vif_table) { + err = -ENOMEM; + goto fail; + } + + /* Forwarding cache */ + net->ipv4.mfc_cache_array = kcalloc(MFC_LINES, + sizeof(struct mfc_cache *), + GFP_KERNEL); + if (!net->ipv4.mfc_cache_array) { + err = -ENOMEM; + goto fail_mfc_cache; + } + +#ifdef CONFIG_IP_PIMSM + net->ipv4.mroute_reg_vif_num = -1; +#endif + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + goto proc_cache_fail; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +proc_cache_fail: + proc_net_remove(net, "ip_mr_vif"); +proc_vif_fail: + kfree(net->ipv4.mfc_cache_array); +#endif +fail_mfc_cache: + kfree(net->ipv4.vif_table); +fail: + return err; +} + +static void __net_exit ipmr_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ip_mr_cache"); + proc_net_remove(net, "ip_mr_vif"); +#endif + kfree(net->ipv4.mfc_cache_array); + kfree(net->ipv4.vif_table); +} + +static struct pernet_operations ipmr_net_ops = { + .init = ipmr_net_init, + .exit = ipmr_net_exit, +}; int __init ip_mr_init(void) { @@ -1927,26 +2033,20 @@ int __init ip_mr_init(void) if (!mrt_cachep) return -ENOMEM; + err = register_pernet_subsys(&ipmr_net_ops); + if (err) + goto reg_pernet_fail; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); err = register_netdevice_notifier(&ip_mr_notifier); if (err) goto reg_notif_fail; -#ifdef CONFIG_PROC_FS - err = -ENOMEM; - if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops)) - goto proc_vif_fail; - if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops)) - goto proc_cache_fail; -#endif return 0; -#ifdef CONFIG_PROC_FS -proc_cache_fail: - proc_net_remove(&init_net, "ip_mr_vif"); -proc_vif_fail: - unregister_netdevice_notifier(&ip_mr_notifier); -#endif + reg_notif_fail: del_timer(&ipmr_expire_timer); + unregister_pernet_subsys(&ipmr_net_ops); +reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 3816e1dc9295..1833bdbf9805 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -31,7 +31,7 @@ config NF_CONNTRACK_PROC_COMPAT default y help This option enables /proc and sysctl compatibility with the old - layer 3 dependant connection tracking. This is needed to keep + layer 3 dependent connection tracking. This is needed to keep old programs that have not been adapted to the new names working. If unsure, say Y. @@ -95,11 +95,11 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_TTL tristate '"ttl" match support' depends on NETFILTER_ADVANCED - help - This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user - to match packets by their TTL value. - - To compile it as a module, choose M here. If unsure, say N. + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. # `filter', generic and specific targets config IP_NF_FILTER @@ -323,19 +323,13 @@ config IP_NF_TARGET_ECN To compile it as a module, choose M here. If unsure, say N. config IP_NF_TARGET_TTL - tristate 'TTL target support' - depends on IP_NF_MANGLE + tristate '"TTL" target support' depends on NETFILTER_ADVANCED - help - This option adds a `TTL' target, which enables the user to modify - the TTL value of the IP header. - - While it is safe to decrement/lower the TTL, this target also enables - functionality to increment and set the TTL value of the IP header to - arbitrary values. This is EXTREMELY DANGEROUS since you can easily - create immortal packets that loop forever on the network. - - To compile it as a module, choose M here. If unsure, say N. + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. # raw + specific targets config IP_NF_RAW diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 5f9b650d90fc..48111594ee9b 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -51,7 +51,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o # targets obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o @@ -61,7 +60,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o -obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 7ea88b61cb0d..84b9c179df51 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -73,6 +73,40 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, return (ret != 0); } +/* + * Unfortunatly, _b and _mask are not aligned to an int (or long int) + * Some arches dont care, unrolling the loop is a win on them. + * For other arches, we only have a 16bit alignement. + */ +static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) +{ +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + const unsigned long *a = (const unsigned long *)_a; + const unsigned long *b = (const unsigned long *)_b; + const unsigned long *mask = (const unsigned long *)_mask; + unsigned long ret; + + ret = (a[0] ^ b[0]) & mask[0]; + if (IFNAMSIZ > sizeof(unsigned long)) + ret |= (a[1] ^ b[1]) & mask[1]; + if (IFNAMSIZ > 2 * sizeof(unsigned long)) + ret |= (a[2] ^ b[2]) & mask[2]; + if (IFNAMSIZ > 3 * sizeof(unsigned long)) + ret |= (a[3] ^ b[3]) & mask[3]; + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); +#else + unsigned long ret = 0; + const u16 *a = (const u16 *)_a; + const u16 *b = (const u16 *)_b; + const u16 *mask = (const u16 *)_mask; + int i; + + for (i = 0; i < IFNAMSIZ/sizeof(u16); i++) + ret |= (a[i] ^ b[i]) & mask[i]; +#endif + return ret; +} + /* Returns whether packet matches rule or not. */ static inline int arp_packet_match(const struct arphdr *arphdr, struct net_device *dev, @@ -83,7 +117,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, const char *arpptr = (char *)(arphdr + 1); const char *src_devaddr, *tgt_devaddr; __be32 src_ipaddr, tgt_ipaddr; - int i, ret; + long ret; #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) @@ -156,10 +190,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, } /* Look for ifname matches. */ - for (i = 0, ret = 0; i < IFNAMSIZ; i++) { - ret |= (indev[i] ^ arpinfo->iniface[i]) - & arpinfo->iniface_mask[i]; - } + ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -168,10 +199,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, return 0; } - for (i = 0, ret = 0; i < IFNAMSIZ; i++) { - ret |= (outdev[i] ^ arpinfo->outiface[i]) - & arpinfo->outiface_mask[i]; - } + ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", @@ -221,7 +249,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const struct net_device *out, struct xt_table *table) { - static const char nulldevname[IFNAMSIZ]; + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; bool hotdrop = false; @@ -237,9 +265,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - read_lock_bh(&table->lock); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -311,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - read_unlock_bh(&table->lock); + + rcu_read_unlock(); if (hotdrop) return NF_DROP; @@ -714,11 +744,65 @@ static void get_counters(const struct xt_table_info *t, } } -static inline struct xt_counters *alloc_counters(struct xt_table *table) + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + ARPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct arpt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + +static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -728,14 +812,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1075,20 +1175,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. - */ -static inline int add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { @@ -1148,13 +1234,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[smp_processor_id()]; @@ -1163,8 +1250,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); + xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index e091187e864f..6ecfdae7c589 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -48,8 +48,6 @@ static struct static struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), - .private = NULL, .me = THIS_MODULE, .af = NFPROTO_ARP, }; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 432ce9d1c11c..5f22c91c6e15 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -24,6 +24,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/security.h> +#include <linux/net.h> #include <linux/mutex.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -640,6 +641,7 @@ static void __exit ip_queue_fini(void) MODULE_DESCRIPTION("IPv4 packet queue handler"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL); module_init(ip_queue_init); module_exit(ip_queue_fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ef8b6ca068b2..e5294aec967d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -74,6 +74,25 @@ do { \ Hence the start of any table is given by get_table() below. */ +static unsigned long ifname_compare(const char *_a, const char *_b, + const unsigned char *_mask) +{ + const unsigned long *a = (const unsigned long *)_a; + const unsigned long *b = (const unsigned long *)_b; + const unsigned long *mask = (const unsigned long *)_mask; + unsigned long ret; + + ret = (a[0] ^ b[0]) & mask[0]; + if (IFNAMSIZ > sizeof(unsigned long)) + ret |= (a[1] ^ b[1]) & mask[1]; + if (IFNAMSIZ > 2 * sizeof(unsigned long)) + ret |= (a[2] ^ b[2]) & mask[2]; + if (IFNAMSIZ > 3 * sizeof(unsigned long)) + ret |= (a[3] ^ b[3]) & mask[3]; + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + return ret; +} + /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -83,7 +102,6 @@ ip_packet_match(const struct iphdr *ip, const struct ipt_ip *ipinfo, int isfrag) { - size_t i; unsigned long ret; #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) @@ -103,12 +121,7 @@ ip_packet_match(const struct iphdr *ip, return false; } - /* Look for ifname matches; this should unroll nicely. */ - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)indev)[i] - ^ ((const unsigned long *)ipinfo->iniface)[i]) - & ((const unsigned long *)ipinfo->iniface_mask)[i]; - } + ret = ifname_compare(indev, ipinfo->iniface, ipinfo->iniface_mask); if (FWINV(ret != 0, IPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -117,11 +130,7 @@ ip_packet_match(const struct iphdr *ip, return false; } - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)outdev)[i] - ^ ((const unsigned long *)ipinfo->outiface)[i]) - & ((const unsigned long *)ipinfo->outiface_mask)[i]; - } + ret = ifname_compare(outdev, ipinfo->outiface, ipinfo->outiface_mask); if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", @@ -347,10 +356,12 @@ ipt_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV4; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -445,7 +456,7 @@ ipt_do_table(struct sk_buff *skb, } } while (!hotdrop); - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -924,13 +935,68 @@ get_counters(const struct xt_table_info *t, counters, &i); } + +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + + +static inline int +zero_entry_counter(struct ipt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } } static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -939,14 +1005,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1312,27 +1394,6 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1393,13 +1454,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1408,8 +1470,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 27a78fbbd92b..acc44c69eb68 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -464,7 +464,7 @@ static struct xt_target log_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static const struct nf_logger ipt_log_logger ={ +static struct nf_logger ipt_log_logger __read_mostly = { .name = "ipt_LOG", .logfn = &ipt_log_packet, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c deleted file mode 100644 index 6d76aae90cc0..000000000000 --- a/net/ipv4/netfilter/ipt_TTL.c +++ /dev/null @@ -1,97 +0,0 @@ -/* TTL modification target for IP tables - * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_TTL.h> - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); -MODULE_LICENSE("GPL"); - -static unsigned int -ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) -{ - struct iphdr *iph; - const struct ipt_TTL_info *info = par->targinfo; - int new_ttl; - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - iph = ip_hdr(skb); - - switch (info->mode) { - case IPT_TTL_SET: - new_ttl = info->ttl; - break; - case IPT_TTL_INC: - new_ttl = iph->ttl + info->ttl; - if (new_ttl > 255) - new_ttl = 255; - break; - case IPT_TTL_DEC: - new_ttl = iph->ttl - info->ttl; - if (new_ttl < 0) - new_ttl = 0; - break; - default: - new_ttl = iph->ttl; - break; - } - - if (new_ttl != iph->ttl) { - csum_replace2(&iph->check, htons(iph->ttl << 8), - htons(new_ttl << 8)); - iph->ttl = new_ttl; - } - - return XT_CONTINUE; -} - -static bool ttl_tg_check(const struct xt_tgchk_param *par) -{ - const struct ipt_TTL_info *info = par->targinfo; - - if (info->mode > IPT_TTL_MAXMODE) { - printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", - info->mode); - return false; - } - if (info->mode != IPT_TTL_SET && info->ttl == 0) - return false; - return true; -} - -static struct xt_target ttl_tg_reg __read_mostly = { - .name = "TTL", - .family = NFPROTO_IPV4, - .target = ttl_tg, - .targetsize = sizeof(struct ipt_TTL_info), - .table = "mangle", - .checkentry = ttl_tg_check, - .me = THIS_MODULE, -}; - -static int __init ttl_tg_init(void) -{ - return xt_register_target(&ttl_tg_reg); -} - -static void __exit ttl_tg_exit(void) -{ - xt_unregister_target(&ttl_tg_reg); -} - -module_init(ttl_tg_init); -module_exit(ttl_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 18a2826b57c6..d32cc4bb328a 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -379,7 +379,7 @@ static struct xt_target ulog_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_logger ipt_ulog_logger = { +static struct nf_logger ipt_ulog_logger __read_mostly = { .name = "ipt_ULOG", .logfn = ipt_logfn, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c deleted file mode 100644 index 297f1cbf4ff5..000000000000 --- a/net/ipv4/netfilter/ipt_ttl.c +++ /dev/null @@ -1,63 +0,0 @@ -/* IP tables module for matching the value of the TTL - * - * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv4/ipt_ttl.h> -#include <linux/netfilter/x_tables.h> - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); -MODULE_LICENSE("GPL"); - -static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par) -{ - const struct ipt_ttl_info *info = par->matchinfo; - const u8 ttl = ip_hdr(skb)->ttl; - - switch (info->mode) { - case IPT_TTL_EQ: - return ttl == info->ttl; - case IPT_TTL_NE: - return ttl != info->ttl; - case IPT_TTL_LT: - return ttl < info->ttl; - case IPT_TTL_GT: - return ttl > info->ttl; - default: - printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", - info->mode); - return false; - } - - return false; -} - -static struct xt_match ttl_mt_reg __read_mostly = { - .name = "ttl", - .family = NFPROTO_IPV4, - .match = ttl_mt, - .matchsize = sizeof(struct ipt_ttl_info), - .me = THIS_MODULE, -}; - -static int __init ttl_mt_init(void) -{ - return xt_register_match(&ttl_mt_reg); -} - -static void __exit ttl_mt_exit(void) -{ - xt_unregister_match(&ttl_mt_reg); -} - -module_init(ttl_mt_init); -module_exit(ttl_mt_exit); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 52cb6939d093..c30a969724f8 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -56,7 +56,6 @@ static struct static struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .me = THIS_MODULE, .af = AF_INET, }; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 3929d20b9e45..4087614d9519 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -67,7 +67,6 @@ static struct static struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock), .me = THIS_MODULE, .af = AF_INET, }; diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 7f65d18333e3..e5356da1fb54 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -39,7 +39,6 @@ static struct static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_raw.lock), .me = THIS_MODULE, .af = AF_INET, }; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index a52a35f4a584..29ab630f240a 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -60,7 +60,6 @@ static struct static struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(security_table.lock), .me = THIS_MODULE, .af = AF_INET, }; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 4beb04fac588..8b681f24e271 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -120,8 +120,10 @@ static unsigned int ipv4_confirm(unsigned int hooknum, typeof(nf_nat_seq_adjust_hook) seq_adjust; seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) + if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; + } } out: /* We've seen it coming out the other side: confirm it */ diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index a7eb04719044..6348a793936e 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -61,7 +61,6 @@ static struct static struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(nat_table.lock), .me = THIS_MODULE, .af = AF_INET, }; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 182f845de92f..d9521f6f9ed0 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1292,7 +1292,7 @@ static struct nf_conntrack_helper snmp_helper __read_mostly = { .expect_policy = &snmp_exp_policy, .name = "snmp", .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(SNMP_PORT), + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), .tuple.dst.protonum = IPPROTO_UDP, }; @@ -1302,7 +1302,7 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { .expect_policy = &snmp_exp_policy, .name = "snmp_trap", .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(SNMP_TRAP_PORT), + .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), .tuple.dst.protonum = IPPROTO_UDP, }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index eb62e58bff79..cf0cdeeb1db0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,8 +54,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) int orphans, sockets; local_bh_disable(); - orphans = percpu_counter_sum_positive(&tcp_orphan_count), - sockets = percpu_counter_sum_positive(&tcp_sockets_allocated), + orphans = percpu_counter_sum_positive(&tcp_orphan_count); + sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); local_bh_enable(); socket_seq_show(seq); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dff8bc4e0fac..f774651f0a47 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -493,6 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->saddr; ipc.opt = NULL; + ipc.shtx.flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 97f71153584f..5caee609be06 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -151,7 +151,7 @@ static void rt_emergency_hash_rebuild(struct net *net); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .gc = rt_garbage_collect, .check = ipv4_dst_check, .destroy = ipv4_dst_destroy, @@ -2696,7 +2696,7 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, @@ -2779,7 +2779,8 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) return ip_route_output_flow(net, rp, flp, NULL, 0); } -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, +static int rt_fill_info(struct net *net, + struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb->rtable; @@ -2844,8 +2845,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, __be32 dst = rt->rt_dst; if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { - int err = ipmr_get_route(skb, r, nowait); + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2950,7 +2951,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; @@ -2988,7 +2989,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (rt_is_expired(rt)) continue; skb->dst = dst_clone(&rt->u.dst); - if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { dst_release(xchg(&skb->dst, NULL)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 76b148bcb0dc..2451aeb5ac23 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal, old_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } + } + + return max(xmit_size_goal, mss_now); +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (psize > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); @@ -761,8 +801,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (--iovlen >= 0) { int seglen = iov->iov_len; @@ -1007,8 +1045,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } @@ -1045,8 +1082,7 @@ out_err: */ static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags, - int *addr_len) + struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1661,7 +1697,7 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags); goto out; } @@ -2478,23 +2514,23 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct tcphdr *th2; unsigned int thlen; unsigned int flags; - unsigned int total; unsigned int mss = 1; int flush = 1; + int i; - if (!pskb_may_pull(skb, sizeof(*th))) + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - if (!pskb_may_pull(skb, thlen)) + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); - __skb_pull(skb, thlen); + skb_gro_pull(skb, thlen); flags = tcp_flag_word(th); @@ -2504,7 +2540,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) th2 = tcp_hdr(p); - if (th->source != th2->source || th->dest != th2->dest) { + if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -2519,14 +2555,15 @@ found: flush |= flags & TCP_FLAG_CWR; flush |= (flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; - flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); + flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window); + for (i = sizeof(*th); !flush && i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); - total = p->len; mss = skb_shinfo(p)->gso_size; - flush |= skb->len > mss || skb->len <= 0; - flush |= ntohl(th2->seq) + total != ntohl(th->seq); + flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb); + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { mss = 1; @@ -2538,7 +2575,7 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = skb->len < mss; + flush = skb_gro_len(skb) < mss; flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 7eb7636db0d0..3b53fd1af23f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tcp_slow_start(tp); else { bictcp_update(ca, tp->snd_cwnd); - - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= ca->cnt) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, ca->cnt); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4ec5b4e97c4e..e92beb9e55e0 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp) } EXPORT_SYMBOL_GPL(tcp_slow_start); +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +{ + if (tp->snd_cwnd_cnt >= w) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); + /* * TCP Reno congestion control * This is special case used for fallback as well. @@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd++; } } else { - /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ee467ec40c4f..71d5f2f29fa6 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tcp_slow_start(tp); } else { bictcp_update(ca, tp->snd_cwnd); - - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= ca->cnt) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, ca->cnt); } } diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 937549b8a921..26d5c7fc7de5 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt return; /* achieved throughput calculations */ - if (icsk->icsk_ca_state != TCP_CA_Open && - icsk->icsk_ca_state != TCP_CA_Disorder) { + if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) { ca->packetcount = 0; ca->lasttime = now; return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c28976a7e596..2bc8e27a163d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -64,6 +64,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/kernel.h> #include <net/dst.h> #include <net/tcp.h> #include <net/inet_common.h> @@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk) if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) continue; - if (after(received_upto, ack_seq) && - (tcp_is_fack(tp) || - !before(received_upto, - ack_seq + tp->reordering * tp->mss_cache))) { + /* TODO: We would like to get rid of tcp_is_fack(tp) only + * constraint here (see above) but figuring out that at + * least tp->reordering SACK blocks reside between ack_seq + * and received_upto is not easy task to do cheaply with + * the available datastructures. + * + * Whether FACK should check here for tp->reordering segs + * in-between one could argue for either way (it would be + * rather simple to implement as we could count fack_count + * during the walk and do tp->fackets_out - fack_count). + */ + if (after(received_upto, ack_seq)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1794,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { - struct tcp_sack_block tmp; - - tmp = sp[j]; - sp[j] = sp[j + 1]; - sp[j + 1] = tmp; + swap(sp[j], sp[j + 1]); /* Track where the first SACK block goes to */ if (j == first_sack_index) @@ -2453,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } +/* New heuristics: it is possible only after we switched to restart timer + * each time when something is ACKed. Hence, we can detect timed out packets + * during fast retransmit without falling to slow start. + * + * Usefulness of this as is very questionable, since we should know which of + * the segments is the next to timeout which is relatively expensive to find + * in general case unless we add some data structure just for that. The + * current approach certainly won't find the right one too often and when it + * finally does find _something_ it usually marks large part of the window + * right away (because a retransmission with a larger timestamp blocks the + * loop from advancing). -ij + */ +static void tcp_timeout_skbs(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) + return; + + skb = tp->scoreboard_skb_hint; + if (tp->scoreboard_skb_hint == NULL) + skb = tcp_write_queue_head(sk); + + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (!tcp_skb_timedout(sk, skb)) + break; + + tcp_skb_mark_lost(tp, skb); + } + + tp->scoreboard_skb_hint = skb; + + tcp_verify_left_out(tp); +} + /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ @@ -2525,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) tcp_mark_head_lost(sk, sacked_upto); } - /* New heuristics: it is possible only after we switched - * to restart timer each time when something is ACKed. - * Hence, we can detect timed out packets during fast - * retransmit without falling to slow start. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { - struct sk_buff *skb; - - skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); - } + tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2813,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) +static void tcp_mtup_probe_success(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -2841,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { @@ -3178,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 end_seq; u32 acked_pcount; u8 sacked = scb->sacked; @@ -3193,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = 0; - end_seq = tp->snd_una; } else { acked_pcount = tcp_skb_pcount(skb); - end_seq = scb->end_seq; - } - - /* MTU probing checks */ - if (fully_acked && icsk->icsk_mtup.probe_size && - !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { - tcp_mtup_probe_success(sk, skb); } if (sacked & TCPCB_RETRANS) { @@ -3267,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + if (unlikely(icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { + tcp_mtup_probe_success(sk); + } + tcp_ack_update_rtt(sk, flag, seq_rtt); tcp_rearm_rto(sk); if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { + int delta; + /* Non-retransmitted hole got filled? That's reordering */ if (reord < prior_fackets) tcp_update_reordering(sk, tp->fackets_out - reord, 0); - /* No need to care for underflows here because - * the lost_skb_hint gets NULLed if we're past it - * (or something non-trivial happened) - */ - if (tcp_is_fack(tp)) - tp->lost_cnt_hint -= pkts_acked; - else - tp->lost_cnt_hint -= prior_sacked - tp->sacked_out; + delta = tcp_is_fack(tp) ? pkts_acked : + prior_sacked - tp->sacked_out; + tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } tp->fackets_out -= min(pkts_acked, tp->fackets_out); @@ -3396,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; @@ -3572,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) int prior_packets; int frto_cwnd = 0; - /* If the ack is newer than sent or older than previous acks + /* If the ack is older than previous acks * then we can probably ignore it. */ - if (after(ack, tp->snd_nxt)) - goto uninteresting_ack; - if (before(ack, prior_snd_una)) goto old_ack; + /* If the ack includes data we haven't sent yet, discard + * this segment (RFC793 Section 3.9). + */ + if (after(ack, tp->snd_nxt)) + goto invalid_ack; + if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; @@ -3601,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. */ - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; @@ -3670,6 +3686,10 @@ no_queue: tcp_ack_probe(sk); return 1; +invalid_ack: + SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return -1; + old_ack: if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); @@ -3677,8 +3697,7 @@ old_ack: tcp_try_keep_open(sk); } -uninteresting_ack: - SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); return 0; } @@ -3866,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + if (tcp_paws_check(&tp->rx_opt, 0)) tcp_store_ts_recent(tp); } } @@ -3919,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); - return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(sk, skb)); + + return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && + !tcp_disordered_ack(sk, skb); } /* Check segment sequence number for validity. @@ -4079,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } @@ -4134,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. */ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -4144,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static inline void tcp_sack_swap(struct tcp_sack_block *sack1, - struct tcp_sack_block *sack2) -{ - __u32 tmp; - - tmp = sack1->start_seq; - sack1->start_seq = sack2->start_seq; - sack2->start_seq = tmp; - - tmp = sack1->end_seq; - sack1->end_seq = sack2->end_seq; - sack2->end_seq = tmp; -} - static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4172,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) if (tcp_sack_extend(sp, seq, end_seq)) { /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) - tcp_sack_swap(sp, sp - 1); + swap(*sp, *(sp - 1)); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; @@ -4198,7 +4199,6 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -4212,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; } @@ -4233,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) this_sack++; sp++; } - if (num_sacks != tp->rx_opt.num_sacks) { - tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; - } + tp->rx_opt.num_sacks = num_sacks; } /* This one checks to see if we can put data from the @@ -4313,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) TCP_ECN_accept_cwr(tp, skb); - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } + tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -4435,8 +4427,6 @@ drop: /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = 1; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; @@ -5157,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + TCP_SKB_CB(skb)->seq == tp->rcv_nxt && + !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { int tcp_header_len = tp->tcp_header_len; /* Timestamp header prediction: tcp_header_len @@ -5310,8 +5301,8 @@ slow_path: return -res; step5: - if (th->ack) - tcp_ack(sk, skb, FLAG_SLOWPATH); + if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5409,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. */ tp->snd_wnd = ntohs(th->window); - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5510,7 +5501,7 @@ discard: /* PAWS check. */ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && - tcp_paws_check(&tp->rx_opt, 0)) + tcp_paws_reject(&tp->rx_opt, 0)) goto discard_and_undo; if (th->syn) { @@ -5648,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* step 5: check the ACK field */ if (th->ack) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; switch (sk->sk_state) { case TCP_SYN_RECV: @@ -5670,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, - TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cf74c416831a..d0a314879d81 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); - if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { - /* Some OSes (unknown ones, but I see them on web server, which - * contains information interesting only for windows' - * users) do not send their stamp in SYN. It is easy case. - * We simply do not advertise TS support. - */ - tmp_opt.saw_tstamp = 0; - tmp_opt.tstamp_ok = 0; - } tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); @@ -2355,7 +2346,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f67effbb102b..43bbba7926ee 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - paws_reject = tcp_paws_check(&tmp_opt, th->rst); + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_prequeue_init(newtp); - tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); + tcp_init_wl(newtp, treq->rcv_isn); newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; @@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.dsack = 0; - newtp->rx_opt.eff_sacks = 0; - newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; if (sock_flag(newsk, SOCK_KEEPOPEN)) @@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * from another data. */ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); - paws_reject = tcp_paws_check(&tmp_opt, th->rst); + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index da2c3b8794f2..c1f259d2d33b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(sp[this_sack].end_seq); } - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } + tp->rx_opt.dsack = 0; } } @@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned size = 0; + unsigned int eff_sacks; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, size += TCPOLEN_TSTAMP_ALIGNED; } - if (unlikely(tp->rx_opt.eff_sacks)) { + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { const unsigned remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, tp->rx_opt.eff_sacks, + min_t(unsigned, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ - if (unlikely(tcp_urg_mode(tp) && - between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { - th->urg_ptr = htons(tp->snd_up - tcb->seq); - th->urg = 1; + if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { + if (before(tp->snd_up, tcb->seq + 0x10000)) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); + th->urg = 1; + } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { + th->urg_ptr = 0xFFFF; + th->urg = 1; + } } tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); @@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, struct sk_buff *buff; int nsize, old_factor; int nlen; - u16 flags; + u8 flags; BUG_ON(len > skb->len); - tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_verify_left_out(tp); } tcp_adjust_fackets_out(sk, skb, diff); + + if (tp->lost_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq) && + (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) + tp->lost_cnt_hint -= diff; } /* Link BUFF into the send queue. */ @@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) * factor and mss. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); return 0; } @@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -/* Bound MSS / TSO packet size with the half of the window */ -static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) -{ - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); - else - return pktsize; -} - /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ -unsigned int tcp_current_mss(struct sock *sk, int large_allowed) +unsigned int tcp_current_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - u16 xmit_size_goal; - int doing_tso = 0; unsigned header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk)) - doing_tso = 1; - if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) @@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now -= delta; } - xmit_size_goal = mss_now; - - if (doing_tso) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); - } - tp->xmit_size_goal = xmit_size_goal; - return mss_now; } @@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? tp->nonagle : TCP_NAGLE_PUSH))); } @@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { struct sk_buff *buff; int nlen = skb->len - len; - u16 flags; + u8 flags; /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) @@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (limit >= sk->sk_gso_max_size) goto send_now; + /* Middle in queue won't get any more data, full sendable already? */ + if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) + goto send_now; + if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tp->snd_cwnd < 11 || - tp->rx_opt.eff_sacks) + tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; /* Very simple search strategy: just double the MSS. */ - mss_now = tcp_current_mss(sk, 0); + mss_now = tcp_current_mss(sk); probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { @@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); int skb_size, next_skb_size; - u16 flags; skb_size = skb->len; next_skb_size = next_skb->len; - flags = TCP_SKB_CB(skb)->flags; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); @@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Merge over control information. This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. @@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ - cur_mss = tcp_current_mss(sk, 0); + cur_mss = tcp_current_mss(sk); /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the @@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > cur_mss) { if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ + } else { + tcp_init_tso_segs(sk, skb, cur_mss); } tcp_retrans_try_collapse(sk, skb, cur_mss); @@ -2061,7 +2045,7 @@ begin_fwd: goto begin_fwd; } else if (!(sacked & TCPCB_LOST)) { - if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; @@ -2100,7 +2084,7 @@ void tcp_send_fin(struct sock *sk) * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -2325,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk) sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; - tcp_init_wl(tp, tp->write_seq, 0); + tcp_init_wl(tp, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; @@ -2512,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 25524d4e372a..59f5b5e7c566 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n) static ssize_t tcpprobe_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - int error = 0, cnt = 0; + int error = 0; + size_t cnt = 0; - if (!buf || len < 0) + if (!buf) return -EINVAL; while (cnt < len) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 4660b088a8ce..a76513779e2b 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - else { - tp->snd_cwnd_cnt++; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - } + else + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0170e914f1b0..b144a26359bc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { int mib_idx; - if (icsk->icsk_ca_state == TCP_CA_Disorder || - icsk->icsk_ca_state == TCP_CA_Recovery) { - if (tcp_is_sack(tp)) { - if (icsk->icsk_ca_state == TCP_CA_Recovery) - mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; - else - mib_idx = LINUX_MIB_TCPSACKFAILURES; - } else { - if (icsk->icsk_ca_state == TCP_CA_Recovery) - mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; - else - mib_idx = LINUX_MIB_TCPRENOFAILURES; - } + if (icsk->icsk_ca_state == TCP_CA_Disorder) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKFAILURES; + else + mib_idx = LINUX_MIB_TCPRENOFAILURES; + } else if (icsk->icsk_ca_state == TCP_CA_Recovery) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; + else + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else { diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index d08b2e855c22..e9bbff746488 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* In the "non-congestive state", increase cwnd * every rtt. */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } else { /* In the "congestive state", increase cwnd * every other rtt. diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 9ec843a9bbb2..66b6821b984e 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) } else { /* Reno */ - - if (tp->snd_cwnd_cnt < tp->snd_cwnd) - tp->snd_cwnd_cnt++; - - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } + tcp_cong_avoid_ai(tp, tp->snd_cwnd); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c8bee189a193..bda08a09357d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -222,7 +222,7 @@ fail: return error; } -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -596,6 +596,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; + ipc.shtx.flags = 0; if (up->pending) { /* @@ -643,6 +644,9 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->saddr; ipc.oif = sk->sk_bound_dev_if; + err = sock_tx_timestamp(msg, sk, &ipc.shtx); + if (err) + return err; if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) @@ -1180,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk = sknext; } while (sknext); } else - kfree_skb(skb); + consume_skb(skb); spin_unlock(&hslot->lock); return 0; } @@ -1819,6 +1823,7 @@ EXPORT_SYMBOL(udp_lib_getsockopt); EXPORT_SYMBOL(udp_lib_setsockopt); EXPORT_SYMBOL(udp_poll); EXPORT_SYMBOL(udp_lib_get_port); +EXPORT_SYMBOL(ipv4_rcv_saddr_equal); #ifdef CONFIG_PROC_FS EXPORT_SYMBOL(udp_proc_register); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 2ad24ba31f9d..60d918c96a4f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -241,7 +241,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .destroy = xfrm4_dst_destroy, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1220e2c7831e..a8218bc1806a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -40,6 +40,7 @@ #include <linux/errno.h> #include <linux/types.h> +#include <linux/kernel.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> @@ -590,6 +591,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, { struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; + struct net *net = dev_net(idev->dev); int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -606,6 +608,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out2; } + if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) { + err = -EACCES; + goto out2; + } + write_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ @@ -1209,16 +1216,12 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, } break; } else if (minihiscore < miniscore) { - struct ipv6_saddr_score *tmp; - if (hiscore->ifa) in6_ifa_put(hiscore->ifa); in6_ifa_hold(score->ifa); - tmp = hiscore; - hiscore = score; - score = tmp; + swap(hiscore, score); /* restore our iterator */ score->ifa = hiscore->ifa; @@ -1367,40 +1370,6 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add return ifp; } -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) -{ - const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - __be32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; - __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - int sk_ipv6only = ipv6_only_sock(sk); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(sk_rcv_saddr6); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - if (!sk2_rcv_saddr && !sk_ipv6only) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && - !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) - return 1; - - if (addr_type == IPV6_ADDR_MAPPED && - !sk2_ipv6only && - (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr)) - return 1; - - return 0; -} - /* Gets referenced address, destroys ifaddr */ static void addrconf_dad_stop(struct inet6_ifaddr *ifp) @@ -1433,6 +1402,11 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp) void addrconf_dad_failure(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; + + if (net_ratelimit()) + printk(KERN_INFO "%s: IPv6 duplicate address detected!\n", + ifp->idev->dev->name); + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { struct in6_addr addr; @@ -1443,11 +1417,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ipv6_addr_equal(&ifp->addr, &addr)) { /* DAD failed for link-local based on MAC address */ idev->cnf.disable_ipv6 = 1; + + printk(KERN_INFO "%s: IPv6 being disabled!\n", + ifp->idev->dev->name); } } - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); addrconf_dad_stop(ifp); } @@ -2227,10 +2202,24 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) return err; } +static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + int plen, int scope) +{ + struct inet6_ifaddr *ifp; + + ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) static void sit_add_v4_addrs(struct inet6_dev *idev) { - struct inet6_ifaddr * ifp; struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); @@ -2249,14 +2238,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) } if (addr.s6_addr32[3]) { - ifp = ipv6_add_addr(idev, &addr, 128, scope, IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); - } + add_addr(idev, &addr, 128, scope); return; } @@ -2284,15 +2266,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) else plen = 96; - ifp = ipv6_add_addr(idev, &addr, plen, flag, - IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); - } + add_addr(idev, &addr, plen, flag); } } } @@ -2302,7 +2276,6 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) static void init_loopback(struct net_device *dev) { struct inet6_dev *idev; - struct inet6_ifaddr * ifp; /* ::1 */ @@ -2313,14 +2286,7 @@ static void init_loopback(struct net_device *dev) return; } - ifp = ipv6_add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); - } + add_addr(idev, &in6addr_loopback, 128, IFA_HOST); } static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) @@ -2832,11 +2798,6 @@ static void addrconf_dad_timer(unsigned long data) read_unlock_bh(&idev->lock); goto out; } - if (idev->cnf.accept_dad > 1 && idev->cnf.disable_ipv6) { - read_unlock_bh(&idev->lock); - addrconf_dad_failure(ifp); - return; - } spin_lock_bh(&ifp->lock); if (ifp->probes == 0) { /* @@ -3647,7 +3608,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3858,7 +3820,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); @@ -3928,7 +3891,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9c8309ed35cf..61f55386a236 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -276,11 +276,26 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { - v4addr = addr->sin6_addr.s6_addr32[3]; - if (inet_addr_type(net, v4addr) != RTN_LOCAL) { - err = -EADDRNOTAVAIL; + int chk_addr_ret; + + /* Binding to v4-mapped address on a v6-only socket + * makes no sense + */ + if (np->ipv6only) { + err = -EINVAL; goto out; } + + /* Reproduce AF_INET checks to make the bindings consitant */ + v4addr = addr->sin6_addr.s6_addr32[3]; + chk_addr_ret = inet_addr_type(net, v4addr); + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) + goto out; } else { if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; @@ -339,8 +354,11 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - if (addr_type != IPV6_ADDR_ANY) + if (addr_type != IPV6_ADDR_ANY) { sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (addr_type != IPV6_ADDR_MAPPED) + np->ipv6only = 1; + } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->sport = htons(inet->num); @@ -803,24 +821,34 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, int proto; __wsum csum; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*iph)); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); - flush += ntohs(iph->payload_len) != skb->len; + flush += ntohs(iph->payload_len) != skb_gro_len(skb); rcu_read_lock(); - proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr); - iph = ipv6_hdr(skb); - IPV6_GRO_CB(skb)->proto = proto; + proto = iph->nexthdr; ops = rcu_dereference(inet6_protos[proto]); - if (!ops || !ops->gro_receive) - goto out_unlock; + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; flush--; - skb_reset_transport_header(skb); nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { @@ -883,8 +911,8 @@ out_unlock: return err; } -static struct packet_type ipv6_packet_type = { - .type = __constant_htons(ETH_P_IPV6), +static struct packet_type ipv6_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 40f324655e24..d31df0f4bc9a 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -218,8 +218,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (opt) sock_kfree_s(sk, opt, opt->tot_len); pktopt = xchg(&np->pktoptions, NULL); - if (pktopt) - kfree_skb(pktopt); + kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; /* diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3e2970841bd8..9f061d1adbc2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) &ipv6_hdr(ra)->saddr); nlmsg_end(skb, nlh); - err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, - GFP_ATOMIC); - if (err < 0) - goto errout; - + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: @@ -1538,13 +1534,10 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: destination is not a neighbour.\n"); - dst_release(dst); - return; - } - if (!xrlim_allow(dst, 1*HZ)) { - dst_release(dst); - return; + goto release; } + if (!xrlim_allow(dst, 1*HZ)) + goto release; if (dev->addr_len) { read_lock_bh(&neigh->lock); @@ -1570,8 +1563,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ND_PRINTK0(KERN_ERR "ICMPv6 Redirect: %s() failed to allocate an skb.\n", __func__); - dst_release(dst); - return; + goto release; } skb_reserve(buff, LL_RESERVED_SPACE(dev)); @@ -1631,6 +1623,10 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, if (likely(idev != NULL)) in6_dev_put(idev); + return; + +release: + dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 53ea512c4608..29d643bcafa4 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -95,13 +95,13 @@ config IP6_NF_MATCH_OPTS To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_HL - tristate '"hl" match support' + tristate '"hl" hoplimit match support' depends on NETFILTER_ADVANCED - help - HL matching allows you to match packets based on the hop - limit of the packet. - - To compile it as a module, choose M here. If unsure, say N. + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' @@ -130,6 +130,15 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. # The targets +config IP6_NF_TARGET_HL + tristate '"HL" hoplimit target support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + config IP6_NF_TARGET_LOG tristate "LOG target support" default m if NETFILTER_ADVANCED=n @@ -170,23 +179,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_TARGET_HL - tristate 'HL (hoplimit) target support' - depends on IP6_NF_MANGLE - depends on NETFILTER_ADVANCED - help - This option adds a `HL' target, which enables the user to decrement - the hoplimit value of the IPv6 header or set it to a given (lower) - value. - - While it is safe to decrement the hoplimit value, this option also - enables functionality to increment and set the hoplimit value of the - IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since - you can easily create immortal packets that loop forever on the - network. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on NETFILTER_ADVANCED diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 3f17c948eefb..aafbba30c899 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -20,13 +20,11 @@ obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o -obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o # targets -obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 5859c046cbc4..b693f841aeb4 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -643,6 +643,7 @@ static void __exit ip6_queue_fini(void) MODULE_DESCRIPTION("IPv6 packet queue handler"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW); module_init(ip6_queue_init); module_exit(ip6_queue_fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index a33485dc81cb..34af7bb8df5f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -89,6 +89,25 @@ ip6t_ext_hdr(u8 nexthdr) (nexthdr == IPPROTO_DSTOPTS) ); } +static unsigned long ifname_compare(const char *_a, const char *_b, + const unsigned char *_mask) +{ + const unsigned long *a = (const unsigned long *)_a; + const unsigned long *b = (const unsigned long *)_b; + const unsigned long *mask = (const unsigned long *)_mask; + unsigned long ret; + + ret = (a[0] ^ b[0]) & mask[0]; + if (IFNAMSIZ > sizeof(unsigned long)) + ret |= (a[1] ^ b[1]) & mask[1]; + if (IFNAMSIZ > 2 * sizeof(unsigned long)) + ret |= (a[2] ^ b[2]) & mask[2]; + if (IFNAMSIZ > 3 * sizeof(unsigned long)) + ret |= (a[3] ^ b[3]) & mask[3]; + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + return ret; +} + /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -99,7 +118,6 @@ ip6_packet_match(const struct sk_buff *skb, unsigned int *protoff, int *fragoff, bool *hotdrop) { - size_t i; unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); @@ -120,12 +138,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - /* Look for ifname matches; this should unroll nicely. */ - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)indev)[i] - ^ ((const unsigned long *)ip6info->iniface)[i]) - & ((const unsigned long *)ip6info->iniface_mask)[i]; - } + ret = ifname_compare(indev, ip6info->iniface, ip6info->iniface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -134,11 +147,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)outdev)[i] - ^ ((const unsigned long *)ip6info->outiface)[i]) - & ((const unsigned long *)ip6info->outiface_mask)[i]; - } + ret = ifname_compare(outdev, ip6info->outiface, ip6info->outiface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", @@ -373,10 +382,12 @@ ip6t_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV6; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -474,7 +485,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -955,11 +966,64 @@ get_counters(const struct xt_table_info *t, } } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IP6T_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct ip6t_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -968,14 +1032,28 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ - return counters; + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1342,28 +1420,6 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static inline int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1424,13 +1480,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1439,8 +1496,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c deleted file mode 100644 index 27b5adf670a2..000000000000 --- a/net/ipv6/netfilter/ip6t_HL.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Hop Limit modification target for ip6tables - * Maciej Soltysiak <solt@dns.toxicfilms.tv> - * Based on HW's TTL module - * - * This software is distributed under the terms of GNU GPL - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv6/ip6t_HL.h> - -MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field modification target"); -MODULE_LICENSE("GPL"); - -static unsigned int -hl_tg6(struct sk_buff *skb, const struct xt_target_param *par) -{ - struct ipv6hdr *ip6h; - const struct ip6t_HL_info *info = par->targinfo; - int new_hl; - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - ip6h = ipv6_hdr(skb); - - switch (info->mode) { - case IP6T_HL_SET: - new_hl = info->hop_limit; - break; - case IP6T_HL_INC: - new_hl = ip6h->hop_limit + info->hop_limit; - if (new_hl > 255) - new_hl = 255; - break; - case IP6T_HL_DEC: - new_hl = ip6h->hop_limit - info->hop_limit; - if (new_hl < 0) - new_hl = 0; - break; - default: - new_hl = ip6h->hop_limit; - break; - } - - ip6h->hop_limit = new_hl; - - return XT_CONTINUE; -} - -static bool hl_tg6_check(const struct xt_tgchk_param *par) -{ - const struct ip6t_HL_info *info = par->targinfo; - - if (info->mode > IP6T_HL_MAXMODE) { - printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", - info->mode); - return false; - } - if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { - printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " - "make sense with value 0\n"); - return false; - } - return true; -} - -static struct xt_target hl_tg6_reg __read_mostly = { - .name = "HL", - .family = NFPROTO_IPV6, - .target = hl_tg6, - .targetsize = sizeof(struct ip6t_HL_info), - .table = "mangle", - .checkentry = hl_tg6_check, - .me = THIS_MODULE -}; - -static int __init hl_tg6_init(void) -{ - return xt_register_target(&hl_tg6_reg); -} - -static void __exit hl_tg6_exit(void) -{ - xt_unregister_target(&hl_tg6_reg); -} - -module_init(hl_tg6_init); -module_exit(hl_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 37adf5abc51e..7018cac4fddc 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -477,7 +477,7 @@ static struct xt_target log_tg6_reg __read_mostly = { .me = THIS_MODULE, }; -static const struct nf_logger ip6t_logger = { +static struct nf_logger ip6t_logger __read_mostly = { .name = "ip6t_LOG", .logfn = &ip6t_log_packet, .me = THIS_MODULE, diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c deleted file mode 100644 index c964dca1132d..000000000000 --- a/net/ipv6/netfilter/ip6t_hl.c +++ /dev/null @@ -1,68 +0,0 @@ -/* Hop Limit matching module */ - -/* (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> - * Based on HW's ttl module - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/ipv6.h> -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv6/ip6t_hl.h> -#include <linux/netfilter/x_tables.h> - -MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field match"); -MODULE_LICENSE("GPL"); - -static bool hl_mt6(const struct sk_buff *skb, const struct xt_match_param *par) -{ - const struct ip6t_hl_info *info = par->matchinfo; - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - - switch (info->mode) { - case IP6T_HL_EQ: - return ip6h->hop_limit == info->hop_limit; - break; - case IP6T_HL_NE: - return ip6h->hop_limit != info->hop_limit; - break; - case IP6T_HL_LT: - return ip6h->hop_limit < info->hop_limit; - break; - case IP6T_HL_GT: - return ip6h->hop_limit > info->hop_limit; - break; - default: - printk(KERN_WARNING "ip6t_hl: unknown mode %d\n", - info->mode); - return false; - } - - return false; -} - -static struct xt_match hl_mt6_reg __read_mostly = { - .name = "hl", - .family = NFPROTO_IPV6, - .match = hl_mt6, - .matchsize = sizeof(struct ip6t_hl_info), - .me = THIS_MODULE, -}; - -static int __init hl_mt6_init(void) -{ - return xt_register_match(&hl_mt6_reg); -} - -static void __exit hl_mt6_exit(void) -{ - xt_unregister_match(&hl_mt6_reg); -} - -module_init(hl_mt6_init); -module_exit(hl_mt6_exit); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 40d2e36d8fac..ef5a0a32bf8e 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -54,7 +54,6 @@ static struct static struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .me = THIS_MODULE, .af = AF_INET6, }; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index d0b31b259d4d..ab0d398a2ba7 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -60,7 +60,6 @@ static struct static struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock), .me = THIS_MODULE, .af = AF_INET6, }; diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 109fab6f831a..4b792b6ca321 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -38,7 +38,6 @@ static struct static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(packet_raw.lock), .me = THIS_MODULE, .af = AF_INET6, }; diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 20bc52f13e43..0ea37ff15d56 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -59,7 +59,6 @@ static struct static struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, - .lock = __RW_LOCK_UNLOCKED(security_table.lock), .me = THIS_MODULE, .af = AF_INET6, }; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 727b9530448a..e6852f617217 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -26,6 +26,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 72dbb6d1a6b3..41b8a956e1be 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -126,6 +126,10 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: invalid new with type %d ", + type + 128); return false; } atomic_set(&ct->proto.icmp.count, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9c574235c905..1394ddb6e35c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -98,7 +98,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, @@ -117,7 +117,7 @@ static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, .update_pmtu = ip6_rt_blackhole_update_pmtu, @@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5cee2bcbcece..664ab82e03b2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -454,7 +454,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; @@ -658,7 +658,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5b85d45bee8..4b5aa1854260 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -533,8 +533,7 @@ static inline void syn_flood_warning(struct sk_buff *skb) static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (inet6_rsk(req)->pktopts) - kfree_skb(inet6_rsk(req)->pktopts); + kfree_skb(inet6_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG @@ -948,7 +947,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr, + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; @@ -1611,8 +1610,7 @@ ipv6_pktoptions: } } - if (opt_skb) - kfree_skb(opt_skb); + kfree_skb(opt_skb); return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 84b1a296eecb..6842dd2edd5b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -49,6 +49,34 @@ #include <linux/seq_file.h> #include "udp_impl.h" +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +{ + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) + return ipv4_rcv_saddr_equal(sk, sk2); + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + return 0; +} + int udp_v6_get_port(struct sock *sk, unsigned short snum) { return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 97ab068e8ccc..b4b16a43f277 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -272,7 +272,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, .destroy = xfrm6_dst_destroy, diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 0e685b05496e..f417b77fa0e1 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -69,7 +69,7 @@ __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) for (i = 0; i < n; i++) { dst[count[class[i] - 1]++] = src[i]; - src[i] = 0; + src[i] = NULL; } return 0; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index b6e70f92e7fb..1627050e29fd 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1958,13 +1958,13 @@ static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { SOCKOPS_WRAP(ipx_dgram, PF_IPX); -static struct packet_type ipx_8023_packet_type = { - .type = __constant_htons(ETH_P_802_3), +static struct packet_type ipx_8023_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_802_3), .func = ipx_rcv, }; -static struct packet_type ipx_dix_packet_type = { - .type = __constant_htons(ETH_P_IPX), +static struct packet_type ipx_dix_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPX), .func = ipx_rcv, }; @@ -1975,15 +1975,15 @@ static struct notifier_block ipx_dev_notifier = { extern struct datalink_proto *make_EII_client(void); extern void destroy_EII_client(struct datalink_proto *); -static unsigned char ipx_8022_type = 0xE0; -static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; -static char ipx_EII_err_msg[] __initdata = +static const unsigned char ipx_8022_type = 0xE0; +static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; +static const char ipx_EII_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with Ethernet II\n"; -static char ipx_8023_err_msg[] __initdata = +static const char ipx_8023_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with 802.3\n"; -static char ipx_llc_err_msg[] __initdata = +static const char ipx_llc_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with 802.2\n"; -static char ipx_snap_err_msg[] __initdata = +static const char ipx_snap_err_msg[] __initconst = KERN_CRIT "IPX: Unable to register with SNAP\n"; static int __init ipx_init(void) diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index ea319e3ddc18..bf92e1473447 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -149,13 +149,14 @@ int irda_device_is_receiving(struct net_device *dev) IRDA_DEBUG(2, "%s()\n", __func__); - if (!dev->do_ioctl) { + if (!dev->netdev_ops->ndo_do_ioctl) { IRDA_ERROR("%s: do_ioctl not impl. by device driver\n", __func__); return -1; } - ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING); + ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req, + SIOCGRECEIVING); if (ret < 0) return ret; diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 05112be99569..724bcf951b80 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -45,6 +45,16 @@ static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); static void irlan_eth_set_multicast_list( struct net_device *dev); static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); +static const struct net_device_ops irlan_eth_netdev_ops = { + .ndo_open = irlan_eth_open, + .ndo_stop = irlan_eth_close, + .ndo_start_xmit = irlan_eth_xmit, + .ndo_get_stats = irlan_eth_get_stats, + .ndo_set_multicast_list = irlan_eth_set_multicast_list, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + /* * Function irlan_eth_setup (dev) * @@ -53,14 +63,11 @@ static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); */ static void irlan_eth_setup(struct net_device *dev) { - dev->open = irlan_eth_open; - dev->stop = irlan_eth_close; - dev->hard_start_xmit = irlan_eth_xmit; - dev->get_stats = irlan_eth_get_stats; - dev->set_multicast_list = irlan_eth_set_multicast_list; + ether_setup(dev); + + dev->netdev_ops = &irlan_eth_netdev_ops; dev->destructor = free_netdev; - ether_setup(dev); /* * Lets do all queueing in IrTTP instead of this device driver. diff --git a/net/irda/irmod.c b/net/irda/irmod.c index 4c487a883725..303a68d92731 100644 --- a/net/irda/irmod.c +++ b/net/irda/irmod.c @@ -55,8 +55,8 @@ EXPORT_SYMBOL(irda_debug); /* Packet type handler. * Tell the kernel how IrDA packets should be handled. */ -static struct packet_type irda_packet_type = { - .type = __constant_htons(ETH_P_IRDA), +static struct packet_type irda_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IRDA), .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */ }; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index eb8a2a0b6eb7..49e786535dc8 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1171,8 +1171,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_unlock_irqrestore(&list->lock, flags); - if (this) - kfree_skb(this); + kfree_skb(this); } BUG_ON(!this); diff --git a/net/key/af_key.c b/net/key/af_key.c index 7dcbde3ea7d9..643c1be2d02e 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -313,8 +313,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, if (one_sk != NULL) err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); - if (skb2) - kfree_skb(skb2); + kfree_skb(skb2); kfree_skb(skb); return err; } @@ -3573,8 +3572,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb, out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; - if (skb) - kfree_skb(skb); + kfree_skb(skb); return err ? : len; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 56fd85ab358e..febae702685c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1118,11 +1118,11 @@ static const struct proto_ops llc_ui_ops = { .sendpage = sock_no_sendpage, }; -static char llc_proc_err_msg[] __initdata = +static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; -static char llc_sysctl_err_msg[] __initdata = +static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; -static char llc_sock_err_msg[] __initdata = +static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5c6d89c6d51d..3477624a4906 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -332,8 +332,7 @@ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) for (i = 0; i < pdu_pos && i < q_len; i++) { skb = skb_dequeue(&llc->pdu_unack_q); - if (skb) - kfree_skb(skb); + kfree_skb(skb); nbr_acked++; } out: diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 50d5b10e23a2..ff4c0ab96a69 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -147,13 +147,13 @@ void llc_sap_close(struct llc_sap *sap) kfree(sap); } -static struct packet_type llc_packet_type = { - .type = __constant_htons(ETH_P_802_2), +static struct packet_type llc_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_802_2), .func = llc_rcv, }; -static struct packet_type llc_tr_packet_type = { - .type = __constant_htons(ETH_P_TR_802_2), +static struct packet_type llc_tr_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_TR_802_2), .func = llc_rcv, }; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 7d4971aa443f..0e3ab88bb706 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -8,13 +8,15 @@ mac80211-y := \ wep.o \ wpa.o \ scan.o \ - ht.o \ + ht.o agg-tx.o agg-rx.o \ + ibss.o \ mlme.o \ iface.o \ rate.o \ michael.o \ tkip.o \ aes_ccm.o \ + aes_cmac.o \ cfg.o \ rx.o \ spectmgmt.o \ @@ -37,6 +39,8 @@ mac80211-$(CONFIG_MAC80211_MESH) += \ mesh_plink.o \ mesh_hwmp.o +mac80211-$(CONFIG_PM) += pm.o + # objects for PID algorithm rc80211_pid-y := rc80211_pid_algo.o rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c new file mode 100644 index 000000000000..3d097b3d7b62 --- /dev/null +++ b/net/mac80211/aes_cmac.c @@ -0,0 +1,135 @@ +/* + * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP + * Copyright 2008, Jouni Malinen <j@w1.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> + +#include <net/mac80211.h> +#include "key.h" +#include "aes_cmac.h" + +#define AES_BLOCK_SIZE 16 +#define AES_CMAC_KEY_LEN 16 +#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ +#define AAD_LEN 20 + + +static void gf_mulx(u8 *pad) +{ + int i, carry; + + carry = pad[0] & 0x80; + for (i = 0; i < AES_BLOCK_SIZE - 1; i++) + pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7); + pad[AES_BLOCK_SIZE - 1] <<= 1; + if (carry) + pad[AES_BLOCK_SIZE - 1] ^= 0x87; +} + + +static void aes_128_cmac_vector(struct crypto_cipher *tfm, u8 *scratch, + size_t num_elem, + const u8 *addr[], const size_t *len, u8 *mac) +{ + u8 *cbc, *pad; + const u8 *pos, *end; + size_t i, e, left, total_len; + + cbc = scratch; + pad = scratch + AES_BLOCK_SIZE; + + memset(cbc, 0, AES_BLOCK_SIZE); + + total_len = 0; + for (e = 0; e < num_elem; e++) + total_len += len[e]; + left = total_len; + + e = 0; + pos = addr[0]; + end = pos + len[0]; + + while (left >= AES_BLOCK_SIZE) { + for (i = 0; i < AES_BLOCK_SIZE; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + if (left > AES_BLOCK_SIZE) + crypto_cipher_encrypt_one(tfm, cbc, cbc); + left -= AES_BLOCK_SIZE; + } + + memset(pad, 0, AES_BLOCK_SIZE); + crypto_cipher_encrypt_one(tfm, pad, pad); + gf_mulx(pad); + + if (left || total_len == 0) { + for (i = 0; i < left; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + cbc[left] ^= 0x80; + gf_mulx(pad); + } + + for (i = 0; i < AES_BLOCK_SIZE; i++) + pad[i] ^= cbc[i]; + crypto_cipher_encrypt_one(tfm, pad, pad); + memcpy(mac, pad, CMAC_TLEN); +} + + +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic) +{ + const u8 *addr[3]; + size_t len[3]; + u8 zero[CMAC_TLEN]; + + memset(zero, 0, CMAC_TLEN); + addr[0] = aad; + len[0] = AAD_LEN; + addr[1] = data; + len[1] = data_len - CMAC_TLEN; + addr[2] = zero; + len[2] = CMAC_TLEN; + + aes_128_cmac_vector(tfm, scratch, 3, addr, len, mic); +} + + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) +{ + struct crypto_cipher *tfm; + + tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return NULL; + + crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + + return tfm; +} + + +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) +{ + if (tfm) + crypto_free_cipher(tfm); +} diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h new file mode 100644 index 000000000000..0eb9a4831508 --- /dev/null +++ b/net/mac80211/aes_cmac.h @@ -0,0 +1,19 @@ +/* + * Copyright 2008, Jouni Malinen <j@w1.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef AES_CMAC_H +#define AES_CMAC_H + +#include <linux/crypto.h> + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]); +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic); +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); + +#endif /* AES_CMAC_H */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c new file mode 100644 index 000000000000..a95affc94629 --- /dev/null +++ b/net/mac80211/agg-rx.c @@ -0,0 +1,302 @@ +/* + * HT handling + * + * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2007-2008, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ieee80211.h> +#include <net/mac80211.h> +#include "ieee80211_i.h" + +void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason) +{ + struct ieee80211_local *local = sta->local; + struct ieee80211_hw *hw = &local->hw; + int i; + + /* check if TID is in operational state */ + spin_lock_bh(&sta->lock); + if (sta->ampdu_mlme.tid_state_rx[tid] != HT_AGG_STATE_OPERATIONAL) { + spin_unlock_bh(&sta->lock); + return; + } + + sta->ampdu_mlme.tid_state_rx[tid] = + HT_AGG_STATE_REQ_STOP_BA_MSK | + (initiator << HT_AGG_STATE_INITIATOR_SHIFT); + spin_unlock_bh(&sta->lock); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Rx BA session stop requested for %pM tid %u\n", + sta->sta.addr, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP, + &sta->sta, tid, NULL)) + printk(KERN_DEBUG "HW problem - can not stop rx " + "aggregation for tid %d\n", tid); + + /* shutdown timer has not expired */ + if (initiator != WLAN_BACK_TIMER) + del_timer_sync(&sta->ampdu_mlme.tid_rx[tid]->session_timer); + + /* check if this is a self generated aggregation halt */ + if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER) + ieee80211_send_delba(sta->sdata, sta->sta.addr, + tid, 0, reason); + + /* free the reordering buffer */ + for (i = 0; i < sta->ampdu_mlme.tid_rx[tid]->buf_size; i++) { + if (sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]) { + /* release the reordered frames */ + dev_kfree_skb(sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]); + sta->ampdu_mlme.tid_rx[tid]->stored_mpdu_num--; + sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i] = NULL; + } + } + + spin_lock_bh(&sta->lock); + /* free resources */ + kfree(sta->ampdu_mlme.tid_rx[tid]->reorder_buf); + + if (!sta->ampdu_mlme.tid_rx[tid]->shutdown) { + kfree(sta->ampdu_mlme.tid_rx[tid]); + sta->ampdu_mlme.tid_rx[tid] = NULL; + } + + sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_IDLE; + spin_unlock_bh(&sta->lock); +} + +void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, + u16 initiator, u16 reason) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + /* stop HW Rx aggregation. ampdu_action existence + * already verified in session init so we add the BUG_ON */ + BUG_ON(!local->ops->ampdu_action); + + rcu_read_lock(); + + sta = sta_info_get(local, ra); + if (!sta) { + rcu_read_unlock(); + return; + } + + __ieee80211_stop_rx_ba_session(sta, tid, initiator, reason); + + rcu_read_unlock(); +} + +/* + * After accepting the AddBA Request we activated a timer, + * resetting it after each frame that arrives from the originator. + * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed. + */ +static void sta_rx_agg_session_timer_expired(unsigned long data) +{ + /* not an elegant detour, but there is no choice as the timer passes + * only one argument, and various sta_info are needed here, so init + * flow in sta_info_create gives the TID as data, while the timer_to_id + * array gives the sta through container_of */ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid); +#endif + ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->sta.addr, + (u16)*ptid, WLAN_BACK_TIMER, + WLAN_REASON_QSTA_TIMEOUT); +} + +static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, + u8 dialog_token, u16 status, u16 policy, + u16 buf_size, u16 timeout) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 capab; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer " + "for addba resp frame\n", sdata->dev->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; + mgmt->u.action.u.addba_resp.dialog_token = dialog_token; + + capab = (u16)(policy << 1); /* bit 1 aggregation policy */ + capab |= (u16)(tid << 2); /* bit 5:2 TID number */ + capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ + + mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); + mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); + mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + + ieee80211_tx_skb(sdata, skb, 1); +} + +void ieee80211_process_addba_request(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_hw *hw = &local->hw; + struct ieee80211_conf *conf = &hw->conf; + struct tid_ampdu_rx *tid_agg_rx; + u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status; + u8 dialog_token; + int ret = -EOPNOTSUPP; + + /* extract session parameters from addba request frame */ + dialog_token = mgmt->u.action.u.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + start_seq_num = + le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + + capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + + status = WLAN_STATUS_REQUEST_DECLINED; + + /* sanity check for incoming parameters: + * check if configuration can support the BA policy + * and if buffer size does not exceeds max value */ + /* XXX: check own ht delayed BA capability?? */ + if (((ba_policy != 1) + && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) + || (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + status = WLAN_STATUS_INVALID_QOS_PARAM; +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "AddBA Req with bad params from " + "%pM on tid %u. policy %d, buffer size %d\n", + mgmt->sa, tid, ba_policy, + buf_size); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end_no_lock; + } + /* determine default buffer size */ + if (buf_size == 0) { + struct ieee80211_supported_band *sband; + + sband = local->hw.wiphy->bands[conf->channel->band]; + buf_size = IEEE80211_MIN_AMPDU_BUF; + buf_size = buf_size << sband->ht_cap.ampdu_factor; + } + + + /* examine state machine */ + spin_lock_bh(&sta->lock); + + if (sta->ampdu_mlme.tid_state_rx[tid] != HT_AGG_STATE_IDLE) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "unexpected AddBA Req from " + "%pM on tid %u\n", + mgmt->sa, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end; + } + + /* prepare A-MPDU MLME for Rx aggregation */ + sta->ampdu_mlme.tid_rx[tid] = + kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC); + if (!sta->ampdu_mlme.tid_rx[tid]) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "allocate rx mlme to tid %d failed\n", + tid); +#endif + goto end; + } + /* rx timer */ + sta->ampdu_mlme.tid_rx[tid]->session_timer.function = + sta_rx_agg_session_timer_expired; + sta->ampdu_mlme.tid_rx[tid]->session_timer.data = + (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&sta->ampdu_mlme.tid_rx[tid]->session_timer); + + tid_agg_rx = sta->ampdu_mlme.tid_rx[tid]; + + /* prepare reordering buffer */ + tid_agg_rx->reorder_buf = + kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC); + if (!tid_agg_rx->reorder_buf) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "can not allocate reordering buffer " + "to tid %d\n", tid); +#endif + kfree(sta->ampdu_mlme.tid_rx[tid]); + goto end; + } + + if (local->ops->ampdu_action) + ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START, + &sta->sta, tid, &start_seq_num); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (ret) { + kfree(tid_agg_rx->reorder_buf); + kfree(tid_agg_rx); + sta->ampdu_mlme.tid_rx[tid] = NULL; + goto end; + } + + /* change state and send addba resp */ + sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_OPERATIONAL; + tid_agg_rx->dialog_token = dialog_token; + tid_agg_rx->ssn = start_seq_num; + tid_agg_rx->head_seq_num = start_seq_num; + tid_agg_rx->buf_size = buf_size; + tid_agg_rx->timeout = timeout; + tid_agg_rx->stored_mpdu_num = 0; + status = WLAN_STATUS_SUCCESS; +end: + spin_unlock_bh(&sta->lock); + +end_no_lock: + ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, + dialog_token, status, 1, buf_size, timeout); +} diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c new file mode 100644 index 000000000000..1df116d4d6e7 --- /dev/null +++ b/net/mac80211/agg-tx.c @@ -0,0 +1,701 @@ +/* + * HT handling + * + * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2007-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ieee80211.h> +#include <net/mac80211.h> +#include "ieee80211_i.h" +#include "wme.h" + +/** + * DOC: TX aggregation + * + * Aggregation on the TX side requires setting the hardware flag + * %IEEE80211_HW_AMPDU_AGGREGATION as well as, if present, the @ampdu_queues + * hardware parameter to the number of hardware AMPDU queues. If there are no + * hardware queues then the driver will (currently) have to do all frame + * buffering. + * + * When TX aggregation is started by some subsystem (usually the rate control + * algorithm would be appropriate) by calling the + * ieee80211_start_tx_ba_session() function, the driver will be notified via + * its @ampdu_action function, with the %IEEE80211_AMPDU_TX_START action. + * + * In response to that, the driver is later required to call the + * ieee80211_start_tx_ba_cb() (or ieee80211_start_tx_ba_cb_irqsafe()) + * function, which will start the aggregation session. + * + * Similarly, when the aggregation session is stopped by + * ieee80211_stop_tx_ba_session(), the driver's @ampdu_action function will + * be called with the action %IEEE80211_AMPDU_TX_STOP. In this case, the + * call must not fail, and the driver must later call ieee80211_stop_tx_ba_cb() + * (or ieee80211_stop_tx_ba_cb_irqsafe()). + */ + +static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u8 dialog_token, u16 start_seq_num, + u16 agg_size, u16 timeout) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 capab; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer " + "for addba request frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); + + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; + + mgmt->u.action.u.addba_req.dialog_token = dialog_token; + capab = (u16)(1 << 1); /* bit 1 aggregation policy */ + capab |= (u16)(tid << 2); /* bit 5:2 TID number */ + capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ + + mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); + + mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout); + mgmt->u.action.u.addba_req.start_seq_num = + cpu_to_le16(start_seq_num << 4); + + ieee80211_tx_skb(sdata, skb, 1); +} + +void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_bar *bar; + u16 bar_control = 0; + + skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom); + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer for " + "bar frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar)); + memset(bar, 0, sizeof(*bar)); + bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | + IEEE80211_STYPE_BACK_REQ); + memcpy(bar->ra, ra, ETH_ALEN); + memcpy(bar->ta, sdata->dev->dev_addr, ETH_ALEN); + bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL; + bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA; + bar_control |= (u16)(tid << 12); + bar->control = cpu_to_le16(bar_control); + bar->start_seq_num = cpu_to_le16(ssn); + + ieee80211_tx_skb(sdata, skb, 0); +} + +static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator) +{ + struct ieee80211_local *local = sta->local; + int ret; + u8 *state; + + state = &sta->ampdu_mlme.tid_state_tx[tid]; + + if (local->hw.ampdu_queues) { + if (initiator) { + /* + * Stop the AC queue to avoid issues where we send + * unaggregated frames already before the delba. + */ + ieee80211_stop_queue_by_reason(&local->hw, + local->hw.queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + + /* + * Pretend the driver woke the queue, just in case + * it disabled it before the session was stopped. + */ + ieee80211_wake_queue( + &local->hw, local->hw.queues + sta->tid_to_tx_q[tid]); + } + *state = HT_AGG_STATE_REQ_STOP_BA_MSK | + (initiator << HT_AGG_STATE_INITIATOR_SHIFT); + + ret = local->ops->ampdu_action(&local->hw, IEEE80211_AMPDU_TX_STOP, + &sta->sta, tid, NULL); + + /* HW shall not deny going back to legacy */ + if (WARN_ON(ret)) { + *state = HT_AGG_STATE_OPERATIONAL; + } + + return ret; +} + +/* + * After sending add Block Ack request we activated a timer until + * add Block Ack response will arrive from the recipient. + * If this timer expires sta_addba_resp_timer_expired will be executed. + */ +static void sta_addba_resp_timer_expired(unsigned long data) +{ + /* not an elegant detour, but there is no choice as the timer passes + * only one argument, and both sta_info and TID are needed, so init + * flow in sta_info_create gives the TID as data, while the timer_to_id + * array gives the sta through container_of */ + u16 tid = *(u8 *)data; + struct sta_info *sta = container_of((void *)data, + struct sta_info, timer_to_tid[tid]); + u8 *state; + + state = &sta->ampdu_mlme.tid_state_tx[tid]; + + /* check if the TID waits for addBA response */ + spin_lock_bh(&sta->lock); + if (!(*state & HT_ADDBA_REQUESTED_MSK)) { + spin_unlock_bh(&sta->lock); + *state = HT_AGG_STATE_IDLE; +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "timer expired on tid %d but we are not " + "expecting addBA response there", tid); +#endif + return; + } + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid); +#endif + + ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); + spin_unlock_bh(&sta->lock); +} + +static inline int ieee80211_ac_from_tid(int tid) +{ + return ieee802_1d_to_ac[tid & 7]; +} + +int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta; + struct ieee80211_sub_if_data *sdata; + u8 *state; + int i, qn = -1, ret = 0; + u16 start_seq_num; + + if (WARN_ON(!local->ops->ampdu_action)) + return -EINVAL; + + if ((tid >= STA_TID_NUM) || !(hw->flags & IEEE80211_HW_AMPDU_AGGREGATION)) + return -EINVAL; + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Open BA session requested for %pM tid %u\n", + ra, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (hw->ampdu_queues && ieee80211_ac_from_tid(tid) == 0) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "rejecting on voice AC\n"); +#endif + return -EINVAL; + } + + rcu_read_lock(); + + sta = sta_info_get(local, ra); + if (!sta) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Could not find the station\n"); +#endif + ret = -ENOENT; + goto unlock; + } + + /* + * The aggregation code is not prepared to handle + * anything but STA/AP due to the BSSID handling. + * IBSS could work in the code but isn't supported + * by drivers or the standard. + */ + if (sta->sdata->vif.type != NL80211_IFTYPE_STATION && + sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sta->sdata->vif.type != NL80211_IFTYPE_AP) { + ret = -EINVAL; + goto unlock; + } + + spin_lock_bh(&sta->lock); + + sdata = sta->sdata; + + /* we have tried too many times, receiver does not want A-MPDU */ + if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) { + ret = -EBUSY; + goto err_unlock_sta; + } + + state = &sta->ampdu_mlme.tid_state_tx[tid]; + /* check if the TID is not in aggregation flow already */ + if (*state != HT_AGG_STATE_IDLE) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - session is not " + "idle on tid %u\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + ret = -EAGAIN; + goto err_unlock_sta; + } + + if (hw->ampdu_queues) { + spin_lock(&local->queue_stop_reason_lock); + /* reserve a new queue for this session */ + for (i = 0; i < local->hw.ampdu_queues; i++) { + if (local->ampdu_ac_queue[i] < 0) { + qn = i; + local->ampdu_ac_queue[qn] = + ieee80211_ac_from_tid(tid); + break; + } + } + spin_unlock(&local->queue_stop_reason_lock); + + if (qn < 0) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - " + "queue unavailable for tid %d\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + ret = -ENOSPC; + goto err_unlock_sta; + } + + /* + * If we successfully allocate the session, we can't have + * anything going on on the queue this TID maps into, so + * stop it for now. This is a "virtual" stop using the same + * mechanism that drivers will use. + * + * XXX: queue up frames for this session in the sta_info + * struct instead to avoid hitting all other STAs. + */ + ieee80211_stop_queue_by_reason( + &local->hw, hw->queues + qn, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + + /* prepare A-MPDU MLME for Tx aggregation */ + sta->ampdu_mlme.tid_tx[tid] = + kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); + if (!sta->ampdu_mlme.tid_tx[tid]) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "allocate tx mlme to tid %d failed\n", + tid); +#endif + ret = -ENOMEM; + goto err_return_queue; + } + + /* Tx timer */ + sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function = + sta_addba_resp_timer_expired; + sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.data = + (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); + + /* Ok, the Addba frame hasn't been sent yet, but if the driver calls the + * call back right away, it must see that the flow has begun */ + *state |= HT_ADDBA_REQUESTED_MSK; + + start_seq_num = sta->tid_seq[tid]; + + ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START, + &sta->sta, tid, &start_seq_num); + + if (ret) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - HW unavailable for" + " tid %d\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + *state = HT_AGG_STATE_IDLE; + goto err_free; + } + sta->tid_to_tx_q[tid] = qn; + + spin_unlock_bh(&sta->lock); + + /* send an addBA request */ + sta->ampdu_mlme.dialog_token_allocator++; + sta->ampdu_mlme.tid_tx[tid]->dialog_token = + sta->ampdu_mlme.dialog_token_allocator; + sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num; + + ieee80211_send_addba_request(sta->sdata, ra, tid, + sta->ampdu_mlme.tid_tx[tid]->dialog_token, + sta->ampdu_mlme.tid_tx[tid]->ssn, + 0x40, 5000); + /* activate the timer for the recipient's addBA response */ + sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.expires = + jiffies + ADDBA_RESP_INTERVAL; + add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid); +#endif + goto unlock; + + err_free: + kfree(sta->ampdu_mlme.tid_tx[tid]); + sta->ampdu_mlme.tid_tx[tid] = NULL; + err_return_queue: + if (qn >= 0) { + /* We failed, so start queue again right away. */ + ieee80211_wake_queue_by_reason(hw, hw->queues + qn, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + /* give queue back to pool */ + spin_lock(&local->queue_stop_reason_lock); + local->ampdu_ac_queue[qn] = -1; + spin_unlock(&local->queue_stop_reason_lock); + } + err_unlock_sta: + spin_unlock_bh(&sta->lock); + unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(ieee80211_start_tx_ba_session); + +void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta; + u8 *state; + + if (tid >= STA_TID_NUM) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", + tid, STA_TID_NUM); +#endif + return; + } + + rcu_read_lock(); + sta = sta_info_get(local, ra); + if (!sta) { + rcu_read_unlock(); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Could not find station: %pM\n", ra); +#endif + return; + } + + state = &sta->ampdu_mlme.tid_state_tx[tid]; + spin_lock_bh(&sta->lock); + + if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "addBA was not requested yet, state is %d\n", + *state); +#endif + spin_unlock_bh(&sta->lock); + rcu_read_unlock(); + return; + } + + if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK)) + goto out; + + *state |= HT_ADDBA_DRV_READY_MSK; + + if (*state == HT_AGG_STATE_OPERATIONAL) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid); +#endif + if (hw->ampdu_queues) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. + */ + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + } + + out: + spin_unlock_bh(&sta->lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_start_tx_ba_cb); + +void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_hw *hw, + const u8 *ra, u16 tid) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_ra_tid *ra_tid; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_WARNING "%s: Not enough memory, " + "dropping start BA session", skb->dev->name); +#endif + return; + } + ra_tid = (struct ieee80211_ra_tid *) &skb->cb; + memcpy(&ra_tid->ra, ra, ETH_ALEN); + ra_tid->tid = tid; + + skb->pkt_type = IEEE80211_ADDBA_MSG; + skb_queue_tail(&local->skb_queue, skb); + tasklet_schedule(&local->tasklet); +} +EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); + +int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator) +{ + u8 *state; + int ret; + + /* check if the TID is in aggregation */ + state = &sta->ampdu_mlme.tid_state_tx[tid]; + spin_lock_bh(&sta->lock); + + if (*state != HT_AGG_STATE_OPERATIONAL) { + ret = -ENOENT; + goto unlock; + } + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Tx BA session stop requested for %pM tid %u\n", + sta->sta.addr, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator); + + unlock: + spin_unlock_bh(&sta->lock); + return ret; +} + +int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw, + u8 *ra, u16 tid, + enum ieee80211_back_parties initiator) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta; + int ret = 0; + + if (WARN_ON(!local->ops->ampdu_action)) + return -EINVAL; + + if (tid >= STA_TID_NUM) + return -EINVAL; + + rcu_read_lock(); + sta = sta_info_get(local, ra); + if (!sta) { + rcu_read_unlock(); + return -ENOENT; + } + + ret = __ieee80211_stop_tx_ba_session(sta, tid, initiator); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(ieee80211_stop_tx_ba_session); + +void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta; + u8 *state; + + if (tid >= STA_TID_NUM) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", + tid, STA_TID_NUM); +#endif + return; + } + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Stopping Tx BA session for %pM tid %d\n", + ra, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + rcu_read_lock(); + sta = sta_info_get(local, ra); + if (!sta) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Could not find station: %pM\n", ra); +#endif + rcu_read_unlock(); + return; + } + state = &sta->ampdu_mlme.tid_state_tx[tid]; + + /* NOTE: no need to use sta->lock in this state check, as + * ieee80211_stop_tx_ba_session will let only one stop call to + * pass through per sta/tid + */ + if ((*state & HT_AGG_STATE_REQ_STOP_BA_MSK) == 0) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n"); +#endif + rcu_read_unlock(); + return; + } + + if (*state & HT_AGG_STATE_INITIATOR_MSK) + ieee80211_send_delba(sta->sdata, ra, tid, + WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + + spin_lock_bh(&sta->lock); + + if (*state & HT_AGG_STATE_INITIATOR_MSK && + hw->ampdu_queues) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. + */ + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + + *state = HT_AGG_STATE_IDLE; + sta->ampdu_mlme.addba_req_num[tid] = 0; + kfree(sta->ampdu_mlme.tid_tx[tid]); + sta->ampdu_mlme.tid_tx[tid] = NULL; + spin_unlock_bh(&sta->lock); + + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb); + +void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw, + const u8 *ra, u16 tid) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_ra_tid *ra_tid; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_WARNING "%s: Not enough memory, " + "dropping stop BA session", skb->dev->name); +#endif + return; + } + ra_tid = (struct ieee80211_ra_tid *) &skb->cb; + memcpy(&ra_tid->ra, ra, ETH_ALEN); + ra_tid->tid = tid; + + skb->pkt_type = IEEE80211_DELBA_MSG; + skb_queue_tail(&local->skb_queue, skb); + tasklet_schedule(&local->tasklet); +} +EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe); + + +void ieee80211_process_addba_resp(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_hw *hw = &local->hw; + u16 capab; + u16 tid, start_seq_num; + u8 *state; + + capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + + state = &sta->ampdu_mlme.tid_state_tx[tid]; + + spin_lock_bh(&sta->lock); + + if (!(*state & HT_ADDBA_REQUESTED_MSK)) { + spin_unlock_bh(&sta->lock); + return; + } + + if (mgmt->u.action.u.addba_resp.dialog_token != + sta->ampdu_mlme.tid_tx[tid]->dialog_token) { + spin_unlock_bh(&sta->lock); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + return; + } + + del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "switched off addBA timer for tid %d \n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) + == WLAN_STATUS_SUCCESS) { + u8 curstate = *state; + + *state |= HT_ADDBA_RECEIVED_MSK; + + if (hw->ampdu_queues && *state != curstate && + *state == HT_AGG_STATE_OPERATIONAL) { + /* + * Wake up this queue, we stopped it earlier, + * this will in turn wake the entire AC. + */ + ieee80211_wake_queue_by_reason(hw, + hw->queues + sta->tid_to_tx_q[tid], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + } + sta->ampdu_mlme.addba_req_num[tid] = 0; + + if (local->ops->ampdu_action) { + (void)local->ops->ampdu_action(hw, + IEEE80211_AMPDU_TX_RESUME, + &sta->sta, tid, &start_seq_num); + } +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Resuming TX aggregation for tid %d\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + } else { + sta->ampdu_mlme.addba_req_num[tid]++; + ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); + } + spin_unlock_bh(&sta->lock); +} diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9d4e4d846ec1..58693e52d458 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -133,6 +133,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_CCMP: alg = ALG_CCMP; break; + case WLAN_CIPHER_SUITE_AES_CMAC: + alg = ALG_AES_CMAC; + break; default: return -EINVAL; } @@ -275,6 +278,17 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, else params.cipher = WLAN_CIPHER_SUITE_WEP104; break; + case ALG_AES_CMAC: + params.cipher = WLAN_CIPHER_SUITE_AES_CMAC; + seq[0] = key->u.aes_cmac.tx_pn[5]; + seq[1] = key->u.aes_cmac.tx_pn[4]; + seq[2] = key->u.aes_cmac.tx_pn[3]; + seq[3] = key->u.aes_cmac.tx_pn[2]; + seq[4] = key->u.aes_cmac.tx_pn[1]; + seq[5] = key->u.aes_cmac.tx_pn[0]; + params.seq = seq; + params.seq_len = 6; + break; } params.key = key->conf.key; @@ -304,6 +318,22 @@ static int ieee80211_config_default_key(struct wiphy *wiphy, return 0; } +static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, + struct net_device *dev, + u8 key_idx) +{ + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + ieee80211_set_default_mgmt_key(sdata, key_idx); + + rcu_read_unlock(); + + return 0; +} + static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; @@ -311,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled = STATION_INFO_INACTIVE_TIME | STATION_INFO_RX_BYTES | STATION_INFO_TX_BYTES | + STATION_INFO_RX_PACKETS | + STATION_INFO_TX_PACKETS | STATION_INFO_TX_BITRATE; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); sinfo->rx_bytes = sta->rx_bytes; sinfo->tx_bytes = sta->tx_bytes; + sinfo->rx_packets = sta->rx_packets; + sinfo->tx_packets = sta->tx_packets; if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { sinfo->filled |= STATION_INFO_SIGNAL; @@ -417,7 +451,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata, * This is a kludge. beacon interval should really be part * of the beacon information. */ - if (params->interval) { + if (params->interval && (sdata->local->hw.conf.beacon_int != + params->interval)) { sdata->local->hw.conf.beacon_int = params->interval; err = ieee80211_hw_config(sdata->local, IEEE80211_CONF_CHANGE_BEACON_INTERVAL); @@ -493,7 +528,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata, kfree(old); - return ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON); + return ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON | + IEEE80211_IFCC_BEACON_ENABLED); } static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev, @@ -553,7 +589,7 @@ static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev) synchronize_rcu(); kfree(old); - return ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON); + return ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON_ENABLED); } /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ @@ -630,6 +666,10 @@ static void sta_apply_parameters(struct ieee80211_local *local, sta->flags &= ~WLAN_STA_WME; if (params->station_flags & STATION_FLAG_WME) sta->flags |= WLAN_STA_WME; + + sta->flags &= ~WLAN_STA_MFP; + if (params->station_flags & STATION_FLAG_MFP) + sta->flags |= WLAN_STA_MFP; spin_unlock_bh(&sta->lock); } @@ -1141,6 +1181,125 @@ static int ieee80211_set_channel(struct wiphy *wiphy, return ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); } +static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata, + u8 subtype, u8 *ies, size_t ies_len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + switch (subtype) { + case IEEE80211_STYPE_PROBE_REQ >> 4: + if (local->ops->hw_scan) + break; + kfree(ifmgd->ie_probereq); + ifmgd->ie_probereq = ies; + ifmgd->ie_probereq_len = ies_len; + return 0; + case IEEE80211_STYPE_PROBE_RESP >> 4: + kfree(ifmgd->ie_proberesp); + ifmgd->ie_proberesp = ies; + ifmgd->ie_proberesp_len = ies_len; + return 0; + case IEEE80211_STYPE_AUTH >> 4: + kfree(ifmgd->ie_auth); + ifmgd->ie_auth = ies; + ifmgd->ie_auth_len = ies_len; + return 0; + case IEEE80211_STYPE_ASSOC_REQ >> 4: + kfree(ifmgd->ie_assocreq); + ifmgd->ie_assocreq = ies; + ifmgd->ie_assocreq_len = ies_len; + return 0; + case IEEE80211_STYPE_REASSOC_REQ >> 4: + kfree(ifmgd->ie_reassocreq); + ifmgd->ie_reassocreq = ies; + ifmgd->ie_reassocreq_len = ies_len; + return 0; + case IEEE80211_STYPE_DEAUTH >> 4: + kfree(ifmgd->ie_deauth); + ifmgd->ie_deauth = ies; + ifmgd->ie_deauth_len = ies_len; + return 0; + case IEEE80211_STYPE_DISASSOC >> 4: + kfree(ifmgd->ie_disassoc); + ifmgd->ie_disassoc = ies; + ifmgd->ie_disassoc_len = ies_len; + return 0; + } + + return -EOPNOTSUPP; +} + +static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy, + struct net_device *dev, + struct mgmt_extra_ie_params *params) +{ + struct ieee80211_sub_if_data *sdata; + u8 *ies; + size_t ies_len; + int ret = -EOPNOTSUPP; + + if (params->ies) { + ies = kmemdup(params->ies, params->ies_len, GFP_KERNEL); + if (ies == NULL) + return -ENOMEM; + ies_len = params->ies_len; + } else { + ies = NULL; + ies_len = 0; + } + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ret = set_mgmt_extra_ie_sta(sdata, params->subtype, + ies, ies_len); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + if (ret) + kfree(ies); + return ret; +} + +#ifdef CONFIG_PM +static int ieee80211_suspend(struct wiphy *wiphy) +{ + return __ieee80211_suspend(wiphy_priv(wiphy)); +} + +static int ieee80211_resume(struct wiphy *wiphy) +{ + return __ieee80211_resume(wiphy_priv(wiphy)); +} +#else +#define ieee80211_suspend NULL +#define ieee80211_resume NULL +#endif + +static int ieee80211_scan(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_scan_request *req) +{ + struct ieee80211_sub_if_data *sdata; + + if (!netif_running(dev)) + return -ENETDOWN; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + return ieee80211_request_scan(sdata, req); +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1149,6 +1308,7 @@ struct cfg80211_ops mac80211_config_ops = { .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, + .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .add_beacon = ieee80211_add_beacon, .set_beacon = ieee80211_set_beacon, .del_beacon = ieee80211_del_beacon, @@ -1169,4 +1329,8 @@ struct cfg80211_ops mac80211_config_ops = { .change_bss = ieee80211_change_bss, .set_txq_params = ieee80211_set_txq_params, .set_channel = ieee80211_set_channel, + .set_mgmt_extra_ie = ieee80211_set_mgmt_extra_ie, + .suspend = ieee80211_suspend, + .resume = ieee80211_resume, + .scan = ieee80211_scan, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2697a2fe608f..e37f557de3f3 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -57,11 +57,62 @@ DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", local->hw.conf.long_frame_max_tx_count); DEBUGFS_READONLY_FILE(total_ps_buffered, 20, "%d", local->total_ps_buffered); -DEBUGFS_READONLY_FILE(wep_iv, 20, "%#06x", +DEBUGFS_READONLY_FILE(wep_iv, 20, "%#08x", local->wep_iv & 0xffffff); DEBUGFS_READONLY_FILE(rate_ctrl_alg, 100, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "<unset>"); +static ssize_t tsf_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + u64 tsf = 0; + char buf[100]; + + if (local->ops->get_tsf) + tsf = local->ops->get_tsf(local_to_hw(local)); + + snprintf(buf, sizeof(buf), "0x%016llx\n", (unsigned long long) tsf); + + return simple_read_from_buffer(user_buf, count, ppos, buf, 19); +} + +static ssize_t tsf_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + unsigned long long tsf; + char buf[100]; + size_t len; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + buf[len] = '\0'; + + if (strncmp(buf, "reset", 5) == 0) { + if (local->ops->reset_tsf) { + local->ops->reset_tsf(local_to_hw(local)); + printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy)); + } + } else { + tsf = simple_strtoul(buf, NULL, 0); + if (local->ops->set_tsf) { + local->ops->set_tsf(local_to_hw(local), tsf); + printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf); + } + } + + return count; +} + +static const struct file_operations tsf_ops = { + .read = tsf_read, + .write = tsf_write, + .open = mac80211_open_file_generic +}; + /* statistics stuff */ #define DEBUGFS_STATS_FILE(name, buflen, fmt, value...) \ @@ -136,8 +187,6 @@ DEBUGFS_STATS_FILE(multicast_received_frame_count, 20, "%u", local->dot11MulticastReceivedFrameCount); DEBUGFS_STATS_FILE(transmitted_frame_count, 20, "%u", local->dot11TransmittedFrameCount); -DEBUGFS_STATS_FILE(wep_undecryptable_count, 20, "%u", - local->dot11WEPUndecryptableCount); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_FILE(tx_handlers_drop, 20, "%u", local->tx_handlers_drop); @@ -204,6 +253,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(long_retry_limit); DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); + DEBUGFS_ADD(tsf); statsd = debugfs_create_dir("statistics", phyd); local->debugfs.statistics = statsd; @@ -221,7 +271,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(received_fragment_count); DEBUGFS_STATS_ADD(multicast_received_frame_count); DEBUGFS_STATS_ADD(transmitted_frame_count); - DEBUGFS_STATS_ADD(wep_undecryptable_count); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); @@ -258,6 +307,7 @@ void debugfs_hw_del(struct ieee80211_local *local) DEBUGFS_DEL(long_retry_limit); DEBUGFS_DEL(total_ps_buffered); DEBUGFS_DEL(wep_iv); + DEBUGFS_DEL(tsf); DEBUGFS_STATS_DEL(transmitted_fragment_count); DEBUGFS_STATS_DEL(multicast_transmitted_frame_count); @@ -268,7 +318,6 @@ void debugfs_hw_del(struct ieee80211_local *local) DEBUGFS_STATS_DEL(received_fragment_count); DEBUGFS_STATS_DEL(multicast_received_frame_count); DEBUGFS_STATS_DEL(transmitted_frame_count); - DEBUGFS_STATS_DEL(wep_undecryptable_count); DEBUGFS_STATS_DEL(num_scans); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_DEL(tx_handlers_drop); diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 6424ac565ae0..99c752588b30 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -76,6 +76,9 @@ static ssize_t key_algorithm_read(struct file *file, case ALG_CCMP: alg = "CCMP\n"; break; + case ALG_AES_CMAC: + alg = "AES-128-CMAC\n"; + break; default: return 0; } @@ -105,6 +108,12 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); break; + case ALG_AES_CMAC: + tpn = key->u.aes_cmac.tx_pn; + len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", + tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], + tpn[5]); + break; default: return 0; } @@ -142,6 +151,14 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, } len = p - buf; break; + case ALG_AES_CMAC: + rpn = key->u.aes_cmac.rx_pn; + p += scnprintf(p, sizeof(buf)+buf-p, + "%02x%02x%02x%02x%02x%02x\n", + rpn[0], rpn[1], rpn[2], + rpn[3], rpn[4], rpn[5]); + len = p - buf; + break; default: return 0; } @@ -156,13 +173,40 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, char buf[20]; int len; - if (key->conf.alg != ALG_CCMP) + switch (key->conf.alg) { + case ALG_CCMP: + len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); + break; + case ALG_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.replays); + break; + default: return 0; - len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); + } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); +static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + char buf[20]; + int len; + + switch (key->conf.alg) { + case ALG_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.icverrors); + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(icverrors); + static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -222,6 +266,7 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) DEBUGFS_ADD(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); + DEBUGFS_ADD(icverrors); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; @@ -243,6 +288,7 @@ void ieee80211_debugfs_key_remove(struct ieee80211_key *key) DEBUGFS_DEL(tx_spec); DEBUGFS_DEL(rx_spec); DEBUGFS_DEL(replays); + DEBUGFS_DEL(icverrors); DEBUGFS_DEL(key); DEBUGFS_DEL(ifindex); @@ -280,6 +326,35 @@ void ieee80211_debugfs_key_remove_default(struct ieee80211_sub_if_data *sdata) sdata->common_debugfs.default_key = NULL; } +void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + char buf[50]; + struct ieee80211_key *key; + + if (!sdata->debugfsdir) + return; + + /* this is running under the key lock */ + + key = sdata->default_mgmt_key; + if (key) { + sprintf(buf, "../keys/%d", key->debugfs.cnt); + sdata->common_debugfs.default_mgmt_key = + debugfs_create_symlink("default_mgmt_key", + sdata->debugfsdir, buf); + } else + ieee80211_debugfs_key_remove_mgmt_default(sdata); +} + +void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata) + return; + + debugfs_remove(sdata->common_debugfs.default_mgmt_key); + sdata->common_debugfs.default_mgmt_key = NULL; +} + void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta) { diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h index b1a3754ee240..54717b4e1371 100644 --- a/net/mac80211/debugfs_key.h +++ b/net/mac80211/debugfs_key.h @@ -6,6 +6,10 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key); void ieee80211_debugfs_key_remove(struct ieee80211_key *key); void ieee80211_debugfs_key_add_default(struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_remove_default(struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_add_mgmt_default( + struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta); #else @@ -19,6 +23,12 @@ static inline void ieee80211_debugfs_key_add_default( static inline void ieee80211_debugfs_key_remove_default( struct ieee80211_sub_if_data *sdata) {} +static inline void ieee80211_debugfs_key_add_mgmt_default( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata) +{} static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta) {} diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c54219301724..e3420329f4e6 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -94,31 +94,31 @@ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC); IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC); -/* STA/IBSS attributes */ -IEEE80211_IF_FILE(state, u.sta.state, DEC); -IEEE80211_IF_FILE(bssid, u.sta.bssid, MAC); -IEEE80211_IF_FILE(prev_bssid, u.sta.prev_bssid, MAC); -IEEE80211_IF_FILE(ssid_len, u.sta.ssid_len, SIZE); -IEEE80211_IF_FILE(aid, u.sta.aid, DEC); -IEEE80211_IF_FILE(ap_capab, u.sta.ap_capab, HEX); -IEEE80211_IF_FILE(capab, u.sta.capab, HEX); -IEEE80211_IF_FILE(extra_ie_len, u.sta.extra_ie_len, SIZE); -IEEE80211_IF_FILE(auth_tries, u.sta.auth_tries, DEC); -IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC); -IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX); -IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC); -IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC); +/* STA attributes */ +IEEE80211_IF_FILE(state, u.mgd.state, DEC); +IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); +IEEE80211_IF_FILE(prev_bssid, u.mgd.prev_bssid, MAC); +IEEE80211_IF_FILE(ssid_len, u.mgd.ssid_len, SIZE); +IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(ap_capab, u.mgd.ap_capab, HEX); +IEEE80211_IF_FILE(capab, u.mgd.capab, HEX); +IEEE80211_IF_FILE(extra_ie_len, u.mgd.extra_ie_len, SIZE); +IEEE80211_IF_FILE(auth_tries, u.mgd.auth_tries, DEC); +IEEE80211_IF_FILE(assoc_tries, u.mgd.assoc_tries, DEC); +IEEE80211_IF_FILE(auth_algs, u.mgd.auth_algs, HEX); +IEEE80211_IF_FILE(auth_alg, u.mgd.auth_alg, DEC); +IEEE80211_IF_FILE(auth_transaction, u.mgd.auth_transaction, DEC); static ssize_t ieee80211_if_fmt_flags( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n", - sdata->u.sta.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "", - sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", - sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", - sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", + sdata->u.mgd.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", sdata->vif.bss_conf.use_cts_prot ? "CTS prot\n" : ""); } __IEEE80211_IF_FILE(flags); @@ -283,9 +283,11 @@ static void add_files(struct ieee80211_sub_if_data *sdata) #endif break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: add_sta_files(sdata); break; + case NL80211_IFTYPE_ADHOC: + /* XXX */ + break; case NL80211_IFTYPE_AP: add_ap_files(sdata); break; @@ -418,9 +420,11 @@ static void del_files(struct ieee80211_sub_if_data *sdata) #endif break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: del_sta_files(sdata); break; + case NL80211_IFTYPE_ADHOC: + /* XXX */ + break; case NL80211_IFTYPE_AP: del_ap_files(sdata); break; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a2fbe0131312..90230c718b5b 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -67,14 +67,15 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, char buf[100]; struct sta_info *sta = file->private_data; u32 staflags = get_sta_flags(sta); - int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s", + int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s", staflags & WLAN_STA_AUTH ? "AUTH\n" : "", staflags & WLAN_STA_ASSOC ? "ASSOC\n" : "", staflags & WLAN_STA_PS ? "PS\n" : "", staflags & WLAN_STA_AUTHORIZED ? "AUTHORIZED\n" : "", staflags & WLAN_STA_SHORT_PREAMBLE ? "SHORT PREAMBLE\n" : "", staflags & WLAN_STA_WME ? "WME\n" : "", - staflags & WLAN_STA_WDS ? "WDS\n" : ""); + staflags & WLAN_STA_WDS ? "WDS\n" : "", + staflags & WLAN_STA_MFP ? "MFP\n" : ""); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } STA_OPS(flags); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c5c0c5271096..4e3c72f20de7 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -17,8 +17,7 @@ #include <net/wireless.h> #include <net/mac80211.h> #include "ieee80211_i.h" -#include "sta_info.h" -#include "wme.h" +#include "rate.h" void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, struct ieee80211_ht_cap *ht_cap_ie, @@ -95,7 +94,9 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_ht_conf ht; + struct sta_info *sta; u32 changed = 0; bool enable_ht = true, ht_changed; enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; @@ -130,14 +131,25 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, } } - ht_changed = local->hw.conf.ht.enabled != enable_ht || - channel_type != local->hw.conf.ht.channel_type; + ht_changed = conf_is_ht(&local->hw.conf) != enable_ht || + channel_type != local->hw.conf.channel_type; local->oper_channel_type = channel_type; - local->hw.conf.ht.enabled = enable_ht; - if (ht_changed) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_HT); + if (ht_changed) { + /* channel_type change automatically detected */ + ieee80211_hw_config(local, 0); + + rcu_read_lock(); + + sta = sta_info_get(local, ifmgd->bssid); + if (sta) + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_HT_CHANGED); + + rcu_read_unlock(); + + } /* disable HT */ if (!enable_ht) @@ -154,108 +166,22 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, return changed; } -static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, - const u8 *da, u16 tid, - u8 dialog_token, u16 start_seq_num, - u16 agg_size, u16 timeout) +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u16 capab; - - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); - - if (!skb) { - printk(KERN_ERR "%s: failed to allocate buffer " - "for addba request frame\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); - memset(mgmt, 0, 24); - memcpy(mgmt->da, da, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP) - memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); - - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); - - mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; - - mgmt->u.action.u.addba_req.dialog_token = dialog_token; - capab = (u16)(1 << 1); /* bit 1 aggregation policy */ - capab |= (u16)(tid << 2); /* bit 5:2 TID number */ - capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ - - mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); - - mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_req.start_seq_num = - cpu_to_le16(start_seq_num << 4); - - ieee80211_tx_skb(sdata, skb, 0); -} - -static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, - u8 dialog_token, u16 status, u16 policy, - u16 buf_size, u16 timeout) -{ - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u16 capab; - - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + int i; - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer " - "for addba resp frame\n", sdata->dev->name); - return; + for (i = 0; i < STA_TID_NUM; i++) { + __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR); + __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_LEAVE_QBSS); } - - skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); - memset(mgmt, 0, 24); - memcpy(mgmt->da, da, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP) - memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); - - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); - mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; - mgmt->u.action.u.addba_resp.dialog_token = dialog_token; - - capab = (u16)(policy << 1); /* bit 1 aggregation policy */ - capab |= (u16)(tid << 2); /* bit 5:2 TID number */ - capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ - - mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); - mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); - - ieee80211_tx_skb(sdata, skb, 0); } -static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, - const u8 *da, u16 tid, - u16 initiator, u16 reason_code) +void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u16 initiator, u16 reason_code) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u16 params; @@ -273,10 +199,12 @@ static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, memset(mgmt, 0, 24); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP) + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); - else - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); @@ -290,770 +218,7 @@ static void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.delba.params = cpu_to_le16(params); mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); - ieee80211_tx_skb(sdata, skb, 0); -} - -void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn) -{ - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; - struct ieee80211_bar *bar; - u16 bar_control = 0; - - skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom); - if (!skb) { - printk(KERN_ERR "%s: failed to allocate buffer for " - "bar frame\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar)); - memset(bar, 0, sizeof(*bar)); - bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | - IEEE80211_STYPE_BACK_REQ); - memcpy(bar->ra, ra, ETH_ALEN); - memcpy(bar->ta, sdata->dev->dev_addr, ETH_ALEN); - bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL; - bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA; - bar_control |= (u16)(tid << 12); - bar->control = cpu_to_le16(bar_control); - bar->start_seq_num = cpu_to_le16(ssn); - - ieee80211_tx_skb(sdata, skb, 0); -} - -void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, - u16 initiator, u16 reason) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_hw *hw = &local->hw; - struct sta_info *sta; - int ret, i; - - rcu_read_lock(); - - sta = sta_info_get(local, ra); - if (!sta) { - rcu_read_unlock(); - return; - } - - /* check if TID is in operational state */ - spin_lock_bh(&sta->lock); - if (sta->ampdu_mlme.tid_state_rx[tid] - != HT_AGG_STATE_OPERATIONAL) { - spin_unlock_bh(&sta->lock); - rcu_read_unlock(); - return; - } - sta->ampdu_mlme.tid_state_rx[tid] = - HT_AGG_STATE_REQ_STOP_BA_MSK | - (initiator << HT_AGG_STATE_INITIATOR_SHIFT); - spin_unlock_bh(&sta->lock); - - /* stop HW Rx aggregation. ampdu_action existence - * already verified in session init so we add the BUG_ON */ - BUG_ON(!local->ops->ampdu_action); - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Rx BA session stop requested for %pM tid %u\n", - ra, tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - - ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP, - &sta->sta, tid, NULL); - if (ret) - printk(KERN_DEBUG "HW problem - can not stop rx " - "aggregation for tid %d\n", tid); - - /* shutdown timer has not expired */ - if (initiator != WLAN_BACK_TIMER) - del_timer_sync(&sta->ampdu_mlme.tid_rx[tid]->session_timer); - - /* check if this is a self generated aggregation halt */ - if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER) - ieee80211_send_delba(sdata, ra, tid, 0, reason); - - /* free the reordering buffer */ - for (i = 0; i < sta->ampdu_mlme.tid_rx[tid]->buf_size; i++) { - if (sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]) { - /* release the reordered frames */ - dev_kfree_skb(sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i]); - sta->ampdu_mlme.tid_rx[tid]->stored_mpdu_num--; - sta->ampdu_mlme.tid_rx[tid]->reorder_buf[i] = NULL; - } - } - /* free resources */ - kfree(sta->ampdu_mlme.tid_rx[tid]->reorder_buf); - kfree(sta->ampdu_mlme.tid_rx[tid]); - sta->ampdu_mlme.tid_rx[tid] = NULL; - sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_IDLE; - - rcu_read_unlock(); -} - - -/* - * After sending add Block Ack request we activated a timer until - * add Block Ack response will arrive from the recipient. - * If this timer expires sta_addba_resp_timer_expired will be executed. - */ -static void sta_addba_resp_timer_expired(unsigned long data) -{ - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and both sta_info and TID are needed, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u16 tid = *(u8 *)data; - struct sta_info *temp_sta = container_of((void *)data, - struct sta_info, timer_to_tid[tid]); - - struct ieee80211_local *local = temp_sta->local; - struct ieee80211_hw *hw = &local->hw; - struct sta_info *sta; - u8 *state; - - rcu_read_lock(); - - sta = sta_info_get(local, temp_sta->sta.addr); - if (!sta) { - rcu_read_unlock(); - return; - } - - state = &sta->ampdu_mlme.tid_state_tx[tid]; - /* check if the TID waits for addBA response */ - spin_lock_bh(&sta->lock); - if (!(*state & HT_ADDBA_REQUESTED_MSK)) { - spin_unlock_bh(&sta->lock); - *state = HT_AGG_STATE_IDLE; -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "timer expired on tid %d but we are not " - "expecting addBA response there", tid); -#endif - goto timer_expired_exit; - } - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid); -#endif - - /* go through the state check in stop_BA_session */ - *state = HT_AGG_STATE_OPERATIONAL; - spin_unlock_bh(&sta->lock); - ieee80211_stop_tx_ba_session(hw, temp_sta->sta.addr, tid, - WLAN_BACK_INITIATOR); - -timer_expired_exit: - rcu_read_unlock(); -} - -void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr) -{ - struct ieee80211_local *local = sdata->local; - int i; - - for (i = 0; i < STA_TID_NUM; i++) { - ieee80211_stop_tx_ba_session(&local->hw, addr, i, - WLAN_BACK_INITIATOR); - ieee80211_sta_stop_rx_ba_session(sdata, addr, i, - WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_LEAVE_QBSS); - } -} - -int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct sta_info *sta; - struct ieee80211_sub_if_data *sdata; - u16 start_seq_num; - u8 *state; - int ret = 0; - - if ((tid >= STA_TID_NUM) || !(hw->flags & IEEE80211_HW_AMPDU_AGGREGATION)) - return -EINVAL; - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Open BA session requested for %pM tid %u\n", - ra, tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - - rcu_read_lock(); - - sta = sta_info_get(local, ra); - if (!sta) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Could not find the station\n"); -#endif - ret = -ENOENT; - goto exit; - } - - spin_lock_bh(&sta->lock); - - /* we have tried too many times, receiver does not want A-MPDU */ - if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) { - ret = -EBUSY; - goto err_unlock_sta; - } - - state = &sta->ampdu_mlme.tid_state_tx[tid]; - /* check if the TID is not in aggregation flow already */ - if (*state != HT_AGG_STATE_IDLE) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "BA request denied - session is not " - "idle on tid %u\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - ret = -EAGAIN; - goto err_unlock_sta; - } - - /* prepare A-MPDU MLME for Tx aggregation */ - sta->ampdu_mlme.tid_tx[tid] = - kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); - if (!sta->ampdu_mlme.tid_tx[tid]) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_ERR "allocate tx mlme to tid %d failed\n", - tid); -#endif - ret = -ENOMEM; - goto err_unlock_sta; - } - /* Tx timer */ - sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function = - sta_addba_resp_timer_expired; - sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.data = - (unsigned long)&sta->timer_to_tid[tid]; - init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); - - if (hw->ampdu_queues) { - /* create a new queue for this aggregation */ - ret = ieee80211_ht_agg_queue_add(local, sta, tid); - - /* case no queue is available to aggregation - * don't switch to aggregation */ - if (ret) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "BA request denied - " - "queue unavailable for tid %d\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - goto err_unlock_queue; - } - } - sdata = sta->sdata; - - /* Ok, the Addba frame hasn't been sent yet, but if the driver calls the - * call back right away, it must see that the flow has begun */ - *state |= HT_ADDBA_REQUESTED_MSK; - - /* This is slightly racy because the queue isn't stopped */ - start_seq_num = sta->tid_seq[tid]; - - if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START, - &sta->sta, tid, &start_seq_num); - - if (ret) { - /* No need to requeue the packets in the agg queue, since we - * held the tx lock: no packet could be enqueued to the newly - * allocated queue */ - if (hw->ampdu_queues) - ieee80211_ht_agg_queue_remove(local, sta, tid, 0); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "BA request denied - HW unavailable for" - " tid %d\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - *state = HT_AGG_STATE_IDLE; - goto err_unlock_queue; - } - - /* Will put all the packets in the new SW queue */ - if (hw->ampdu_queues) - ieee80211_requeue(local, ieee802_1d_to_ac[tid]); - spin_unlock_bh(&sta->lock); - - /* send an addBA request */ - sta->ampdu_mlme.dialog_token_allocator++; - sta->ampdu_mlme.tid_tx[tid]->dialog_token = - sta->ampdu_mlme.dialog_token_allocator; - sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num; - - - ieee80211_send_addba_request(sta->sdata, ra, tid, - sta->ampdu_mlme.tid_tx[tid]->dialog_token, - sta->ampdu_mlme.tid_tx[tid]->ssn, - 0x40, 5000); - /* activate the timer for the recipient's addBA response */ - sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.expires = - jiffies + ADDBA_RESP_INTERVAL; - add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid); -#endif - goto exit; - -err_unlock_queue: - kfree(sta->ampdu_mlme.tid_tx[tid]); - sta->ampdu_mlme.tid_tx[tid] = NULL; - ret = -EBUSY; -err_unlock_sta: - spin_unlock_bh(&sta->lock); -exit: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(ieee80211_start_tx_ba_session); - -int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw, - u8 *ra, u16 tid, - enum ieee80211_back_parties initiator) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct sta_info *sta; - u8 *state; - int ret = 0; - - if (tid >= STA_TID_NUM) - return -EINVAL; - - rcu_read_lock(); - sta = sta_info_get(local, ra); - if (!sta) { - rcu_read_unlock(); - return -ENOENT; - } - - /* check if the TID is in aggregation */ - state = &sta->ampdu_mlme.tid_state_tx[tid]; - spin_lock_bh(&sta->lock); - - if (*state != HT_AGG_STATE_OPERATIONAL) { - ret = -ENOENT; - goto stop_BA_exit; - } - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Tx BA session stop requested for %pM tid %u\n", - ra, tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - - if (hw->ampdu_queues) - ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]); - - *state = HT_AGG_STATE_REQ_STOP_BA_MSK | - (initiator << HT_AGG_STATE_INITIATOR_SHIFT); - - if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_STOP, - &sta->sta, tid, NULL); - - /* case HW denied going back to legacy */ - if (ret) { - WARN_ON(ret != -EBUSY); - *state = HT_AGG_STATE_OPERATIONAL; - if (hw->ampdu_queues) - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); - goto stop_BA_exit; - } - -stop_BA_exit: - spin_unlock_bh(&sta->lock); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(ieee80211_stop_tx_ba_session); - -void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct sta_info *sta; - u8 *state; - - if (tid >= STA_TID_NUM) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", - tid, STA_TID_NUM); -#endif - return; - } - - rcu_read_lock(); - sta = sta_info_get(local, ra); - if (!sta) { - rcu_read_unlock(); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Could not find station: %pM\n", ra); -#endif - return; - } - - state = &sta->ampdu_mlme.tid_state_tx[tid]; - spin_lock_bh(&sta->lock); - - if (!(*state & HT_ADDBA_REQUESTED_MSK)) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "addBA was not requested yet, state is %d\n", - *state); -#endif - spin_unlock_bh(&sta->lock); - rcu_read_unlock(); - return; - } - - WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK); - - *state |= HT_ADDBA_DRV_READY_MSK; - - if (*state == HT_AGG_STATE_OPERATIONAL) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid); -#endif - if (hw->ampdu_queues) - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); - } - spin_unlock_bh(&sta->lock); - rcu_read_unlock(); -} -EXPORT_SYMBOL(ieee80211_start_tx_ba_cb); - -void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct sta_info *sta; - u8 *state; - int agg_queue; - - if (tid >= STA_TID_NUM) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", - tid, STA_TID_NUM); -#endif - return; - } - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Stopping Tx BA session for %pM tid %d\n", - ra, tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - - rcu_read_lock(); - sta = sta_info_get(local, ra); - if (!sta) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Could not find station: %pM\n", ra); -#endif - rcu_read_unlock(); - return; - } - state = &sta->ampdu_mlme.tid_state_tx[tid]; - - /* NOTE: no need to use sta->lock in this state check, as - * ieee80211_stop_tx_ba_session will let only one stop call to - * pass through per sta/tid - */ - if ((*state & HT_AGG_STATE_REQ_STOP_BA_MSK) == 0) { -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n"); -#endif - rcu_read_unlock(); - return; - } - - if (*state & HT_AGG_STATE_INITIATOR_MSK) - ieee80211_send_delba(sta->sdata, ra, tid, - WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); - - if (hw->ampdu_queues) { - agg_queue = sta->tid_to_tx_q[tid]; - ieee80211_ht_agg_queue_remove(local, sta, tid, 1); - - /* We just requeued the all the frames that were in the - * removed queue, and since we might miss a softirq we do - * netif_schedule_queue. ieee80211_wake_queue is not used - * here as this queue is not necessarily stopped - */ - netif_schedule_queue(netdev_get_tx_queue(local->mdev, - agg_queue)); - } - spin_lock_bh(&sta->lock); - *state = HT_AGG_STATE_IDLE; - sta->ampdu_mlme.addba_req_num[tid] = 0; - kfree(sta->ampdu_mlme.tid_tx[tid]); - sta->ampdu_mlme.tid_tx[tid] = NULL; - spin_unlock_bh(&sta->lock); - - rcu_read_unlock(); -} -EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb); - -void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_hw *hw, - const u8 *ra, u16 tid) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_ra_tid *ra_tid; - struct sk_buff *skb = dev_alloc_skb(0); - - if (unlikely(!skb)) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_WARNING "%s: Not enough memory, " - "dropping start BA session", skb->dev->name); -#endif - return; - } - ra_tid = (struct ieee80211_ra_tid *) &skb->cb; - memcpy(&ra_tid->ra, ra, ETH_ALEN); - ra_tid->tid = tid; - - skb->pkt_type = IEEE80211_ADDBA_MSG; - skb_queue_tail(&local->skb_queue, skb); - tasklet_schedule(&local->tasklet); -} -EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); - -void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_hw *hw, - const u8 *ra, u16 tid) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_ra_tid *ra_tid; - struct sk_buff *skb = dev_alloc_skb(0); - - if (unlikely(!skb)) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_WARNING "%s: Not enough memory, " - "dropping stop BA session", skb->dev->name); -#endif - return; - } - ra_tid = (struct ieee80211_ra_tid *) &skb->cb; - memcpy(&ra_tid->ra, ra, ETH_ALEN); - ra_tid->tid = tid; - - skb->pkt_type = IEEE80211_DELBA_MSG; - skb_queue_tail(&local->skb_queue, skb); - tasklet_schedule(&local->tasklet); -} -EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe); - -/* - * After accepting the AddBA Request we activated a timer, - * resetting it after each frame that arrives from the originator. - * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed. - */ -static void sta_rx_agg_session_timer_expired(unsigned long data) -{ - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and various sta_info are needed here, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); - -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid); -#endif - ieee80211_sta_stop_rx_ba_session(sta->sdata, sta->sta.addr, - (u16)*ptid, WLAN_BACK_TIMER, - WLAN_REASON_QSTA_TIMEOUT); -} - -void ieee80211_process_addba_request(struct ieee80211_local *local, - struct sta_info *sta, - struct ieee80211_mgmt *mgmt, - size_t len) -{ - struct ieee80211_hw *hw = &local->hw; - struct ieee80211_conf *conf = &hw->conf; - struct tid_ampdu_rx *tid_agg_rx; - u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status; - u8 dialog_token; - int ret = -EOPNOTSUPP; - - /* extract session parameters from addba request frame */ - dialog_token = mgmt->u.action.u.addba_req.dialog_token; - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); - start_seq_num = - le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; - - capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); - ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; - tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; - buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; - - status = WLAN_STATUS_REQUEST_DECLINED; - - /* sanity check for incoming parameters: - * check if configuration can support the BA policy - * and if buffer size does not exceeds max value */ - /* XXX: check own ht delayed BA capability?? */ - if (((ba_policy != 1) - && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) - || (buf_size > IEEE80211_MAX_AMPDU_BUF)) { - status = WLAN_STATUS_INVALID_QOS_PARAM; -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "AddBA Req with bad params from " - "%pM on tid %u. policy %d, buffer size %d\n", - mgmt->sa, tid, ba_policy, - buf_size); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - goto end_no_lock; - } - /* determine default buffer size */ - if (buf_size == 0) { - struct ieee80211_supported_band *sband; - - sband = local->hw.wiphy->bands[conf->channel->band]; - buf_size = IEEE80211_MIN_AMPDU_BUF; - buf_size = buf_size << sband->ht_cap.ampdu_factor; - } - - - /* examine state machine */ - spin_lock_bh(&sta->lock); - - if (sta->ampdu_mlme.tid_state_rx[tid] != HT_AGG_STATE_IDLE) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "unexpected AddBA Req from " - "%pM on tid %u\n", - mgmt->sa, tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - goto end; - } - - /* prepare A-MPDU MLME for Rx aggregation */ - sta->ampdu_mlme.tid_rx[tid] = - kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC); - if (!sta->ampdu_mlme.tid_rx[tid]) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_ERR "allocate rx mlme to tid %d failed\n", - tid); -#endif - goto end; - } - /* rx timer */ - sta->ampdu_mlme.tid_rx[tid]->session_timer.function = - sta_rx_agg_session_timer_expired; - sta->ampdu_mlme.tid_rx[tid]->session_timer.data = - (unsigned long)&sta->timer_to_tid[tid]; - init_timer(&sta->ampdu_mlme.tid_rx[tid]->session_timer); - - tid_agg_rx = sta->ampdu_mlme.tid_rx[tid]; - - /* prepare reordering buffer */ - tid_agg_rx->reorder_buf = - kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC); - if (!tid_agg_rx->reorder_buf) { -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_ERR "can not allocate reordering buffer " - "to tid %d\n", tid); -#endif - kfree(sta->ampdu_mlme.tid_rx[tid]); - goto end; - } - memset(tid_agg_rx->reorder_buf, 0, - buf_size * sizeof(struct sk_buff *)); - - if (local->ops->ampdu_action) - ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START, - &sta->sta, tid, &start_seq_num); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - - if (ret) { - kfree(tid_agg_rx->reorder_buf); - kfree(tid_agg_rx); - sta->ampdu_mlme.tid_rx[tid] = NULL; - goto end; - } - - /* change state and send addba resp */ - sta->ampdu_mlme.tid_state_rx[tid] = HT_AGG_STATE_OPERATIONAL; - tid_agg_rx->dialog_token = dialog_token; - tid_agg_rx->ssn = start_seq_num; - tid_agg_rx->head_seq_num = start_seq_num; - tid_agg_rx->buf_size = buf_size; - tid_agg_rx->timeout = timeout; - tid_agg_rx->stored_mpdu_num = 0; - status = WLAN_STATUS_SUCCESS; -end: - spin_unlock_bh(&sta->lock); - -end_no_lock: - ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, - dialog_token, status, 1, buf_size, timeout); -} - -void ieee80211_process_addba_resp(struct ieee80211_local *local, - struct sta_info *sta, - struct ieee80211_mgmt *mgmt, - size_t len) -{ - struct ieee80211_hw *hw = &local->hw; - u16 capab; - u16 tid, start_seq_num; - u8 *state; - - capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); - tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; - - state = &sta->ampdu_mlme.tid_state_tx[tid]; - - spin_lock_bh(&sta->lock); - - if (!(*state & HT_ADDBA_REQUESTED_MSK)) { - spin_unlock_bh(&sta->lock); - return; - } - - if (mgmt->u.action.u.addba_resp.dialog_token != - sta->ampdu_mlme.tid_tx[tid]->dialog_token) { - spin_unlock_bh(&sta->lock); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - return; - } - - del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "switched off addBA timer for tid %d \n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) - == WLAN_STATUS_SUCCESS) { - *state |= HT_ADDBA_RECEIVED_MSK; - sta->ampdu_mlme.addba_req_num[tid] = 0; - - if (*state == HT_AGG_STATE_OPERATIONAL && - local->hw.ampdu_queues) - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); - - if (local->ops->ampdu_action) { - (void)local->ops->ampdu_action(hw, - IEEE80211_AMPDU_TX_RESUME, - &sta->sta, tid, &start_seq_num); - } -#ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "Resuming TX aggregation for tid %d\n", tid); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - spin_unlock_bh(&sta->lock); - } else { - sta->ampdu_mlme.addba_req_num[tid]++; - /* this will allow the state check in stop_BA_session */ - *state = HT_AGG_STATE_OPERATIONAL; - spin_unlock_bh(&sta->lock); - ieee80211_stop_tx_ba_session(hw, sta->sta.addr, tid, - WLAN_BACK_INITIATOR); - } + ieee80211_tx_skb(sdata, skb, 1); } void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c new file mode 100644 index 000000000000..f4becc12904e --- /dev/null +++ b/net/mac80211/ibss.c @@ -0,0 +1,907 @@ +/* + * IBSS mode implementation + * Copyright 2003-2008, Jouni Malinen <j@w1.fi> + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/delay.h> +#include <linux/if_ether.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <net/mac80211.h> +#include <asm/unaligned.h> + +#include "ieee80211_i.h" +#include "rate.h" + +#define IEEE80211_SCAN_INTERVAL (2 * HZ) +#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) +#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) + +#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) +#define IEEE80211_IBSS_MERGE_DELAY 0x400000 +#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) + +#define IEEE80211_IBSS_MAX_STA_ENTRIES 128 + + +static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + u16 auth_alg, auth_transaction, status_code; + + if (len < 24 + 6) + return; + + auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); + auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); + status_code = le16_to_cpu(mgmt->u.auth.status_code); + + /* + * IEEE 802.11 standard does not require authentication in IBSS + * networks and most implementations do not seem to use it. + * However, try to reply to authentication attempts if someone + * has actually implemented this. + */ + if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1) + ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0, + sdata->u.ibss.bssid, 0); +} + +static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + const u8 *bssid, const int beacon_int, + const int freq, + const size_t supp_rates_len, + const u8 *supp_rates, + const u16 capability, u64 tsf) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int res = 0, rates, i, j; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos; + struct ieee80211_supported_band *sband; + union iwreq_data wrqu; + + if (local->ops->reset_tsf) { + /* Reset own TSF to allow time synchronization work. */ + local->ops->reset_tsf(local_to_hw(local)); + } + + if ((ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) && + memcmp(ifibss->bssid, bssid, ETH_ALEN) == 0) + return res; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for probe " + "response\n", sdata->dev->name); + return -ENOMEM; + } + + if (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) { + /* Remove possible STA entries from other IBSS networks. */ + sta_info_flush_delayed(sdata); + } + + memcpy(ifibss->bssid, bssid, ETH_ALEN); + res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID); + if (res) + return res; + + local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10; + + sdata->drop_unencrypted = capability & + WLAN_CAPABILITY_PRIVACY ? 1 : 0; + + res = ieee80211_set_freq(sdata, freq); + + if (res) + return res; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + /* Build IBSS probe response */ + + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 24 + sizeof(mgmt->u.beacon)); + memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_RESP); + memset(mgmt->da, 0xff, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); + mgmt->u.beacon.beacon_int = + cpu_to_le16(local->hw.conf.beacon_int); + mgmt->u.beacon.timestamp = cpu_to_le64(tsf); + mgmt->u.beacon.capab_info = cpu_to_le16(capability); + + pos = skb_put(skb, 2 + ifibss->ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ifibss->ssid_len; + memcpy(pos, ifibss->ssid, ifibss->ssid_len); + + rates = supp_rates_len; + if (rates > 8) + rates = 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = rates; + memcpy(pos, supp_rates, rates); + + if (sband->band == IEEE80211_BAND_2GHZ) { + pos = skb_put(skb, 2 + 1); + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = ieee80211_frequency_to_channel(freq); + } + + pos = skb_put(skb, 2 + 2); + *pos++ = WLAN_EID_IBSS_PARAMS; + *pos++ = 2; + /* FIX: set ATIM window based on scan results */ + *pos++ = 0; + *pos++ = 0; + + if (supp_rates_len > 8) { + rates = supp_rates_len - 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = rates; + memcpy(pos, &supp_rates[8], rates); + } + + ifibss->probe_resp = skb; + + ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON | + IEEE80211_IFCC_BEACON_ENABLED); + + + rates = 0; + for (i = 0; i < supp_rates_len; i++) { + int bitrate = (supp_rates[i] & 0x7f) * 5; + for (j = 0; j < sband->n_bitrates; j++) + if (sband->bitrates[j].bitrate == bitrate) + rates |= BIT(j); + } + + ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates); + + ifibss->flags |= IEEE80211_IBSS_PREV_BSSID_SET; + ifibss->state = IEEE80211_IBSS_MLME_JOINED; + mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); + + memset(&wrqu, 0, sizeof(wrqu)); + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); + + return res; +} + +static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss *bss) +{ + return __ieee80211_sta_join_ibss(sdata, + bss->cbss.bssid, + bss->cbss.beacon_interval, + bss->cbss.channel->center_freq, + bss->supp_rates_len, bss->supp_rates, + bss->cbss.capability, + bss->cbss.tsf); +} + +static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems, + bool beacon) +{ + struct ieee80211_local *local = sdata->local; + int freq; + struct ieee80211_bss *bss; + struct sta_info *sta; + struct ieee80211_channel *channel; + u64 beacon_timestamp, rx_timestamp; + u32 supp_rates = 0; + enum ieee80211_band band = rx_status->band; + + if (elems->ds_params && elems->ds_params_len == 1) + freq = ieee80211_channel_to_frequency(elems->ds_params[0]); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + + if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && + memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) { + supp_rates = ieee80211_sta_get_rates(local, elems, band); + + rcu_read_lock(); + + sta = sta_info_get(local, mgmt->sa); + if (sta) { + u32 prev_rates; + + prev_rates = sta->sta.supp_rates[band]; + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + if (sta->sta.supp_rates[band] != prev_rates) + printk(KERN_DEBUG "%s: updated supp_rates set " + "for %pM based on beacon info (0x%llx | " + "0x%llx -> 0x%llx)\n", + sdata->dev->name, + sta->sta.addr, + (unsigned long long) prev_rates, + (unsigned long long) supp_rates, + (unsigned long long) sta->sta.supp_rates[band]); +#endif + } else + ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); + + rcu_read_unlock(); + } + + bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, + channel, beacon); + if (!bss) + return; + + /* was just updated in ieee80211_bss_info_update */ + beacon_timestamp = bss->cbss.tsf; + + /* check if we need to merge IBSS */ + + /* merge only on beacons (???) */ + if (!beacon) + goto put_bss; + + /* we use a fixed BSSID */ + if (sdata->u.ibss.flags & IEEE80211_IBSS_BSSID_SET) + goto put_bss; + + /* not an IBSS */ + if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS)) + goto put_bss; + + /* different channel */ + if (bss->cbss.channel != local->oper_channel) + goto put_bss; + + /* different SSID */ + if (elems->ssid_len != sdata->u.ibss.ssid_len || + memcmp(elems->ssid, sdata->u.ibss.ssid, + sdata->u.ibss.ssid_len)) + goto put_bss; + + /* same BSSID */ + if (memcmp(bss->cbss.bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) + goto put_bss; + + if (rx_status->flag & RX_FLAG_TSFT) { + /* + * For correct IBSS merging we need mactime; since mactime is + * defined as the time the first data symbol of the frame hits + * the PHY, and the timestamp of the beacon is defined as "the + * time that the data symbol containing the first bit of the + * timestamp is transmitted to the PHY plus the transmitting + * STA's delays through its local PHY from the MAC-PHY + * interface to its interface with the WM" (802.11 11.1.2) + * - equals the time this bit arrives at the receiver - we have + * to take into account the offset between the two. + * + * E.g. at 1 MBit that means mactime is 192 usec earlier + * (=24 bytes * 8 usecs/byte) than the beacon timestamp. + */ + int rate; + + if (rx_status->flag & RX_FLAG_HT) + rate = 65; /* TODO: HT rates */ + else + rate = local->hw.wiphy->bands[band]-> + bitrates[rx_status->rate_idx].bitrate; + + rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate); + } else if (local && local->ops && local->ops->get_tsf) + /* second best option: get current TSF */ + rx_timestamp = local->ops->get_tsf(local_to_hw(local)); + else + /* can't merge without knowing the TSF */ + rx_timestamp = -1LLU; + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "RX beacon SA=%pM BSSID=" + "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", + mgmt->sa, mgmt->bssid, + (unsigned long long)rx_timestamp, + (unsigned long long)beacon_timestamp, + (unsigned long long)(rx_timestamp - beacon_timestamp), + jiffies); +#endif + + /* give slow hardware some time to do the TSF sync */ + if (rx_timestamp < IEEE80211_IBSS_MERGE_DELAY) + goto put_bss; + + if (beacon_timestamp > rx_timestamp) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: beacon TSF higher than " + "local TSF - IBSS merge with BSSID %pM\n", + sdata->dev->name, mgmt->bssid); +#endif + ieee80211_sta_join_ibss(sdata, bss); + ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); + } + + put_bss: + ieee80211_rx_bss_put(local, bss); +} + +/* + * Add a new IBSS station, will also be called by the RX code when, + * in IBSS mode, receiving a frame from a yet-unknown station, hence + * must be callable in atomic context. + */ +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid,u8 *addr, u32 supp_rates) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + int band = local->hw.conf.channel->band; + + /* TODO: Could consider removing the least recently used entry and + * allow new one to be added. */ + if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { + if (net_ratelimit()) { + printk(KERN_DEBUG "%s: No room for a new IBSS STA " + "entry %pM\n", sdata->dev->name, addr); + } + return NULL; + } + + if (compare_ether_addr(bssid, sdata->u.ibss.bssid)) + return NULL; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", + wiphy_name(local->hw.wiphy), addr, sdata->dev->name); +#endif + + sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); + if (!sta) + return NULL; + + set_sta_flags(sta, WLAN_STA_AUTHORIZED); + + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + + rate_control_rate_init(sta); + + if (sta_info_insert(sta)) + return NULL; + + return sta; +} + +static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + int active = 0; + struct sta_info *sta; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sta->sdata == sdata && + time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, + jiffies)) { + active++; + break; + } + } + + rcu_read_unlock(); + + return active; +} + + +static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); + + ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT); + if (ieee80211_sta_active_ibss(sdata)) + return; + + if ((ifibss->flags & IEEE80211_IBSS_BSSID_SET) && + (!(ifibss->flags & IEEE80211_IBSS_AUTO_CHANNEL_SEL))) + return; + + printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " + "IBSS networks with same SSID (merge)\n", sdata->dev->name); + + /* XXX maybe racy? */ + if (sdata->local->scan_req) + return; + + memcpy(sdata->local->int_scan_req.ssids[0].ssid, + ifibss->ssid, IEEE80211_MAX_SSID_LEN); + sdata->local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len; + ieee80211_request_scan(sdata, &sdata->local->int_scan_req); +} + +static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + u8 *pos; + u8 bssid[ETH_ALEN]; + u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; + u16 capability; + int i; + + if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) { + memcpy(bssid, ifibss->bssid, ETH_ALEN); + } else { + /* Generate random, not broadcast, locally administered BSSID. Mix in + * own MAC address to make sure that devices that do not have proper + * random number generator get different BSSID. */ + get_random_bytes(bssid, ETH_ALEN); + for (i = 0; i < ETH_ALEN; i++) + bssid[i] ^= sdata->dev->dev_addr[i]; + bssid[0] &= ~0x01; + bssid[0] |= 0x02; + } + + printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n", + sdata->dev->name, bssid); + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + if (local->hw.conf.beacon_int == 0) + local->hw.conf.beacon_int = 100; + + capability = WLAN_CAPABILITY_IBSS; + + if (sdata->default_key) + capability |= WLAN_CAPABILITY_PRIVACY; + else + sdata->drop_unencrypted = 0; + + pos = supp_rates; + for (i = 0; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + + return __ieee80211_sta_join_ibss(sdata, + bssid, local->hw.conf.beacon_int, + local->hw.conf.channel->center_freq, + sband->n_bitrates, supp_rates, + capability, 0); +} + +static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct ieee80211_bss *bss; + const u8 *bssid = NULL; + int active_ibss; + + if (ifibss->ssid_len == 0) + return -EINVAL; + + active_ibss = ieee80211_sta_active_ibss(sdata); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n", + sdata->dev->name, active_ibss); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (active_ibss) + return 0; + + if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) + bssid = ifibss->bssid; + bss = (void *)cfg80211_get_bss(local->hw.wiphy, NULL, bssid, + ifibss->ssid, ifibss->ssid_len, + WLAN_CAPABILITY_IBSS, + WLAN_CAPABILITY_IBSS); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + if (bss) + printk(KERN_DEBUG " sta_find_ibss: selected %pM current " + "%pM\n", bss->cbss.bssid, ifibss->bssid); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (bss && + (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) || + memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN))) { + int ret; + + printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" + " based on configured SSID\n", + sdata->dev->name, bss->cbss.bssid); + + ret = ieee80211_sta_join_ibss(sdata, bss); + ieee80211_rx_bss_put(local, bss); + return ret; + } else if (bss) + ieee80211_rx_bss_put(local, bss); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG " did not try to join ibss\n"); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + /* Selected IBSS not found in current scan results - try to scan */ + if (ifibss->state == IEEE80211_IBSS_MLME_JOINED && + !ieee80211_sta_active_ibss(sdata)) { + mod_timer(&ifibss->timer, jiffies + + IEEE80211_IBSS_MERGE_INTERVAL); + } else if (time_after(jiffies, local->last_scan_completed + + IEEE80211_SCAN_INTERVAL)) { + printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " + "join\n", sdata->dev->name); + + /* XXX maybe racy? */ + if (local->scan_req) + return -EBUSY; + + memcpy(local->int_scan_req.ssids[0].ssid, + ifibss->ssid, IEEE80211_MAX_SSID_LEN); + local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len; + return ieee80211_request_scan(sdata, &local->int_scan_req); + } else if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) { + int interval = IEEE80211_SCAN_INTERVAL; + + if (time_after(jiffies, ifibss->ibss_join_req + + IEEE80211_IBSS_JOIN_TIMEOUT)) { + if (!(local->oper_channel->flags & + IEEE80211_CHAN_NO_IBSS)) + return ieee80211_sta_create_ibss(sdata); + printk(KERN_DEBUG "%s: IBSS not allowed on" + " %d MHz\n", sdata->dev->name, + local->hw.conf.channel->center_freq); + + /* No IBSS found - decrease scan interval and continue + * scanning. */ + interval = IEEE80211_SCAN_INTERVAL_SLOW; + } + + ifibss->state = IEEE80211_IBSS_MLME_SEARCH; + mod_timer(&ifibss->timer, jiffies + interval); + return 0; + } + + return 0; +} + +static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int tx_last_beacon; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + u8 *pos, *end; + + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED || + len < 24 + 2 || !ifibss->probe_resp) + return; + + if (local->ops->tx_last_beacon) + tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local)); + else + tx_last_beacon = 1; + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM" + " (tx_last_beacon=%d)\n", + sdata->dev->name, mgmt->sa, mgmt->da, + mgmt->bssid, tx_last_beacon); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (!tx_last_beacon) + return; + + if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 && + memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0) + return; + + end = ((u8 *) mgmt) + len; + pos = mgmt->u.probe_req.variable; + if (pos[0] != WLAN_EID_SSID || + pos + 2 + pos[1] > end) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq " + "from %pM\n", + sdata->dev->name, mgmt->sa); +#endif + return; + } + if (pos[1] != 0 && + (pos[1] != ifibss->ssid_len || + memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len) != 0)) { + /* Ignore ProbeReq for foreign SSID */ + return; + } + + /* Reply with ProbeResp */ + skb = skb_copy(ifibss->probe_resp, GFP_KERNEL); + if (!skb) + return; + + resp = (struct ieee80211_mgmt *) skb->data; + memcpy(resp->da, mgmt->sa, ETH_ALEN); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n", + sdata->dev->name, resp->da); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + ieee80211_tx_skb(sdata, skb, 0); +} + +static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN)) + return; /* ignore ProbeResp to foreign address */ + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, + &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); +} + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true); +} + +static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + u16 fc; + + rx_status = (struct ieee80211_rx_status *) skb->cb; + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_REQ: + ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len); + break; + case IEEE80211_STYPE_PROBE_RESP: + ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_BEACON: + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_AUTH: + ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); + break; + } + + kfree_skb(skb); +} + +static void ieee80211_ibss_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, u.ibss.work); + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_ibss *ifibss; + struct sk_buff *skb; + + if (!netif_running(sdata->dev)) + return; + + if (local->sw_scanning || local->hw_scanning) + return; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC)) + return; + ifibss = &sdata->u.ibss; + + while ((skb = skb_dequeue(&ifibss->skb_queue))) + ieee80211_ibss_rx_queued_mgmt(sdata, skb); + + if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request)) + return; + + switch (ifibss->state) { + case IEEE80211_IBSS_MLME_SEARCH: + ieee80211_sta_find_ibss(sdata); + break; + case IEEE80211_IBSS_MLME_JOINED: + ieee80211_sta_merge_ibss(sdata); + break; + default: + WARN_ON(1); + break; + } +} + +static void ieee80211_ibss_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + + set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request); + queue_work(local->hw.workqueue, &ifibss->work); +} + +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + INIT_WORK(&ifibss->work, ieee80211_ibss_work); + setup_timer(&ifibss->timer, ieee80211_ibss_timer, + (unsigned long) sdata); + skb_queue_head_init(&ifibss->skb_queue); + + ifibss->flags |= IEEE80211_IBSS_AUTO_BSSID_SEL | + IEEE80211_IBSS_AUTO_CHANNEL_SEL; +} + +int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + ifibss->flags &= ~IEEE80211_IBSS_PREV_BSSID_SET; + + if (ifibss->ssid_len) + ifibss->flags |= IEEE80211_IBSS_SSID_SET; + else + ifibss->flags &= ~IEEE80211_IBSS_SSID_SET; + + ifibss->ibss_join_req = jiffies; + ifibss->state = IEEE80211_IBSS_MLME_SEARCH; + + return ieee80211_sta_find_ibss(sdata); +} + +int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + + if (ifibss->ssid_len != len || memcmp(ifibss->ssid, ssid, len) != 0) { + memset(ifibss->ssid, 0, sizeof(ifibss->ssid)); + memcpy(ifibss->ssid, ssid, len); + ifibss->ssid_len = len; + } + + return ieee80211_ibss_commit(sdata); +} + +int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + memcpy(ssid, ifibss->ssid, ifibss->ssid_len); + *len = ifibss->ssid_len; + + return 0; +} + +int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (is_valid_ether_addr(bssid)) { + memcpy(ifibss->bssid, bssid, ETH_ALEN); + ifibss->flags |= IEEE80211_IBSS_BSSID_SET; + } else { + memset(ifibss->bssid, 0, ETH_ALEN); + ifibss->flags &= ~IEEE80211_IBSS_BSSID_SET; + } + + if (netif_running(sdata->dev)) { + if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) { + printk(KERN_DEBUG "%s: Failed to config new BSSID to " + "the low-level driver\n", sdata->dev->name); + } + } + + return ieee80211_ibss_commit(sdata); +} + +/* scan finished notification */ +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata = local->scan_sdata; + struct ieee80211_if_ibss *ifibss; + + if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) { + ifibss = &sdata->u.ibss; + if ((!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) || + !ieee80211_sta_active_ibss(sdata)) + ieee80211_sta_find_ibss(sdata); + } +} + +ieee80211_rx_result +ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + u16 fc; + + if (skb->len < 24) + return RX_DROP_MONITOR; + + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_STYPE_BEACON: + memcpy(skb->cb, rx_status, sizeof(*rx_status)); + case IEEE80211_STYPE_PROBE_REQ: + case IEEE80211_STYPE_AUTH: + skb_queue_tail(&sdata->u.ibss.skb_queue, skb); + queue_work(local->hw.workqueue, &sdata->u.ibss.work); + return RX_QUEUED; + } + + return RX_DROP_MONITOR; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f3eec989662b..fbb91f1aebb2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -43,7 +43,7 @@ struct ieee80211_local; /* Required encryption head and tailroom */ #define IEEE80211_ENCRYPT_HEADROOM 8 -#define IEEE80211_ENCRYPT_TAILROOM 12 +#define IEEE80211_ENCRYPT_TAILROOM 18 /* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent * reception of at least three fragmented frames. This limit can be increased @@ -57,6 +57,8 @@ struct ieee80211_local; */ #define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ) +#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) + struct ieee80211_fragment_entry { unsigned long first_frag_time; unsigned int seq; @@ -70,43 +72,36 @@ struct ieee80211_fragment_entry { struct ieee80211_bss { - struct list_head list; - struct ieee80211_bss *hnext; - size_t ssid_len; + /* Yes, this is a hack */ + struct cfg80211_bss cbss; - atomic_t users; - - u8 bssid[ETH_ALEN]; + /* don't want to look up all the time */ + size_t ssid_len; u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 dtim_period; - u16 capability; /* host byte order */ - enum ieee80211_band band; - int freq; - int signal, noise, qual; - u8 *ies; /* all information elements from the last Beacon or Probe - * Response frames; note Beacon frame is not allowed to - * override values from Probe Response */ - size_t ies_len; + bool wmm_used; + + unsigned long last_probe_resp; + #ifdef CONFIG_MAC80211_MESH u8 *mesh_id; size_t mesh_id_len; u8 *mesh_cfg; #endif + #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; - u64 timestamp; - int beacon_int; - - unsigned long last_probe_resp; - unsigned long last_update; - /* during assocation, we save an ERP value from a probe response so + /* + * During assocation, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the * association completes. these fields probably won't be up-to-date - * otherwise, you probably don't want to use them. */ - int has_erp_value; + * otherwise, you probably don't want to use them. + */ + bool has_erp_value; u8 erp_value; }; @@ -244,7 +239,7 @@ struct mesh_preq_queue { u8 flags; }; -/* flags used in struct ieee80211_if_sta.flags */ +/* flags used in struct ieee80211_if_managed.flags */ #define IEEE80211_STA_SSID_SET BIT(0) #define IEEE80211_STA_BSSID_SET BIT(1) #define IEEE80211_STA_PREV_BSSID_SET BIT(2) @@ -258,37 +253,39 @@ struct mesh_preq_queue { #define IEEE80211_STA_AUTO_BSSID_SEL BIT(11) #define IEEE80211_STA_AUTO_CHANNEL_SEL BIT(12) #define IEEE80211_STA_PRIVACY_INVOKED BIT(13) +#define IEEE80211_STA_TKIP_WEP_USED BIT(14) +#define IEEE80211_STA_CSA_RECEIVED BIT(15) +#define IEEE80211_STA_MFP_ENABLED BIT(16) /* flags for MLME request */ #define IEEE80211_STA_REQ_SCAN 0 #define IEEE80211_STA_REQ_DIRECT_PROBE 1 #define IEEE80211_STA_REQ_AUTH 2 #define IEEE80211_STA_REQ_RUN 3 -/* STA/IBSS MLME states */ -enum ieee80211_sta_mlme_state { - IEEE80211_STA_MLME_DISABLED, - IEEE80211_STA_MLME_DIRECT_PROBE, - IEEE80211_STA_MLME_AUTHENTICATE, - IEEE80211_STA_MLME_ASSOCIATE, - IEEE80211_STA_MLME_ASSOCIATED, - IEEE80211_STA_MLME_IBSS_SEARCH, - IEEE80211_STA_MLME_IBSS_JOINED, -}; - /* bitfield of allowed auth algs */ #define IEEE80211_AUTH_ALG_OPEN BIT(0) #define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1) #define IEEE80211_AUTH_ALG_LEAP BIT(2) -struct ieee80211_if_sta { +struct ieee80211_if_managed { struct timer_list timer; + struct timer_list chswitch_timer; struct work_struct work; + struct work_struct chswitch_work; + u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; - enum ieee80211_sta_mlme_state state; size_t ssid_len; - u8 scan_ssid[IEEE80211_MAX_SSID_LEN]; - size_t scan_ssid_len; + + enum { + IEEE80211_STA_MLME_DISABLED, + IEEE80211_STA_MLME_DIRECT_PROBE, + IEEE80211_STA_MLME_AUTHENTICATE, + IEEE80211_STA_MLME_ASSOCIATE, + IEEE80211_STA_MLME_ASSOCIATED, + } state; + u16 aid; u16 ap_capab, capab; u8 *extra_ie; /* to be added to the end of AssocReq */ @@ -315,11 +312,65 @@ struct ieee80211_if_sta { int auth_alg; /* currently used IEEE 802.11 authentication algorithm */ int auth_transaction; + enum { + IEEE80211_MFP_DISABLED, + IEEE80211_MFP_OPTIONAL, + IEEE80211_MFP_REQUIRED + } mfp; /* management frame protection */ + + int wmm_last_param_set; + + /* Extra IE data for management frames */ + u8 *ie_probereq; + size_t ie_probereq_len; + u8 *ie_proberesp; + size_t ie_proberesp_len; + u8 *ie_auth; + size_t ie_auth_len; + u8 *ie_assocreq; + size_t ie_assocreq_len; + u8 *ie_reassocreq; + size_t ie_reassocreq_len; + u8 *ie_deauth; + size_t ie_deauth_len; + u8 *ie_disassoc; + size_t ie_disassoc_len; +}; + +enum ieee80211_ibss_flags { + IEEE80211_IBSS_AUTO_CHANNEL_SEL = BIT(0), + IEEE80211_IBSS_AUTO_BSSID_SEL = BIT(1), + IEEE80211_IBSS_BSSID_SET = BIT(2), + IEEE80211_IBSS_PREV_BSSID_SET = BIT(3), + IEEE80211_IBSS_SSID_SET = BIT(4), +}; + +enum ieee80211_ibss_request { + IEEE80211_IBSS_REQ_RUN = 0, +}; + +struct ieee80211_if_ibss { + struct timer_list timer; + struct work_struct work; + + struct sk_buff_head skb_queue; + + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + + u32 flags; + + u8 bssid[ETH_ALEN]; + + unsigned long request; + unsigned long ibss_join_req; struct sk_buff *probe_resp; /* ProbeResp template for IBSS */ - u32 supp_rates_bits[IEEE80211_NUM_BANDS]; - int wmm_last_param_set; + enum { + IEEE80211_IBSS_MLME_SEARCH, + IEEE80211_IBSS_MLME_JOINED, + } state; }; struct ieee80211_if_mesh { @@ -404,8 +455,10 @@ struct ieee80211_sub_if_data { unsigned int fragment_next; #define NUM_DEFAULT_KEYS 4 - struct ieee80211_key *keys[NUM_DEFAULT_KEYS]; +#define NUM_DEFAULT_MGMT_KEYS 2 + struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; struct ieee80211_key *default_key; + struct ieee80211_key *default_mgmt_key; u16 sequence_number; @@ -423,7 +476,8 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; struct ieee80211_if_vlan vlan; - struct ieee80211_if_sta sta; + struct ieee80211_if_managed mgd; + struct ieee80211_if_ibss ibss; #ifdef CONFIG_MAC80211_MESH struct ieee80211_if_mesh mesh; #endif @@ -477,6 +531,7 @@ struct ieee80211_sub_if_data { } debugfs; struct { struct dentry *default_key; + struct dentry *default_mgmt_key; } common_debugfs; #ifdef CONFIG_MAC80211_MESH @@ -541,11 +596,10 @@ enum { enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_DRIVER, IEEE80211_QUEUE_STOP_REASON_PS, + IEEE80211_QUEUE_STOP_REASON_CSA, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION, }; -/* maximum number of hardware queues we support. */ -#define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES) - struct ieee80211_master_priv { struct ieee80211_local *local; }; @@ -558,9 +612,15 @@ struct ieee80211_local { const struct ieee80211_ops *ops; - unsigned long queue_pool[BITS_TO_LONGS(QD_MAX_QUEUES)]; - unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; + /* AC queue corresponding to each AMPDU queue */ + s8 ampdu_ac_queue[IEEE80211_MAX_AMPDU_QUEUES]; + unsigned int amdpu_ac_stop_refcnt[IEEE80211_MAX_AMPDU_QUEUES]; + + unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES + + IEEE80211_MAX_AMPDU_QUEUES]; + /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; + struct net_device *mdev; /* wmaster# - "master" 802.11 device */ int open_count; int monitors, cooked_mntrs; @@ -568,7 +628,6 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss; unsigned int filter_flags; /* FIF_* */ struct iw_statistics wstats; - u8 wstats_flags; bool tim_in_locked_section; /* see ieee80211_beacon_get() */ int tx_headroom; /* required headroom for hardware/radiotap */ @@ -612,7 +671,9 @@ struct ieee80211_local { struct crypto_blkcipher *wep_rx_tfm; u32 wep_iv; + /* see iface.c */ struct list_head interfaces; + struct mutex iflist_mtx; /* * Key lock, protects sdata's key_list and sta_info's @@ -623,20 +684,18 @@ struct ieee80211_local { /* Scanning and BSS list */ bool sw_scanning, hw_scanning; + struct cfg80211_ssid scan_ssid; + struct cfg80211_scan_request int_scan_req; + struct cfg80211_scan_request *scan_req; + struct ieee80211_channel *scan_channel; int scan_channel_idx; - enum ieee80211_band scan_band; enum { SCAN_SET_CHANNEL, SCAN_SEND_PROBE } scan_state; unsigned long last_scan_completed; struct delayed_work scan_work; struct ieee80211_sub_if_data *scan_sdata; - struct ieee80211_channel *oper_channel, *scan_channel; enum nl80211_channel_type oper_channel_type; - u8 scan_ssid[IEEE80211_MAX_SSID_LEN]; - size_t scan_ssid_len; - struct list_head bss_list; - struct ieee80211_bss *bss_hash[STA_HASH_SIZE]; - spinlock_t bss_lock; + struct ieee80211_channel *oper_channel, *csa_channel; /* SNMP counters */ /* dot11CountersTable */ @@ -649,7 +708,6 @@ struct ieee80211_local { u32 dot11ReceivedFragmentCount; u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; - u32 dot11WEPUndecryptableCount; #ifdef CONFIG_MAC80211_LEDS int tx_led_counter, rx_led_counter; @@ -696,11 +754,14 @@ struct ieee80211_local { unsigned int wmm_acm; /* bit field of ACM bits (BIT(802.1D tag)) */ bool powersave; - int dynamic_ps_timeout; + bool pspolling; struct work_struct dynamic_ps_enable_work; struct work_struct dynamic_ps_disable_work; struct timer_list dynamic_ps_timer; + int user_power_level; /* in dBm */ + int power_constr_level; /* in dBm */ + #ifdef CONFIG_MAC80211_DEBUGFS struct local_debugfsdentries { struct dentry *rcdir; @@ -712,6 +773,7 @@ struct ieee80211_local { struct dentry *long_retry_limit; struct dentry *total_ps_buffered; struct dentry *wep_iv; + struct dentry *tsf; struct dentry *statistics; struct local_debugfsdentries_statsdentries { struct dentry *transmitted_fragment_count; @@ -805,6 +867,7 @@ struct ieee802_11_elems { u8 *country_elem; u8 *pwr_constr_elem; u8 *quiet_elem; /* first quite element */ + u8 *timeout_int; /* length of them, respectively */ u8 ssid_len; @@ -832,6 +895,7 @@ struct ieee802_11_elems { u8 pwr_constr_elem_len; u8 quiet_elem_len; u8 num_of_quiet_elem; /* can be more the one */ + u8 timeout_int_len; }; static inline struct ieee80211_local *hw_to_local( @@ -860,34 +924,43 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u32 changed); void ieee80211_configure_filter(struct ieee80211_local *local); +u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); /* wireless extensions */ extern const struct iw_handler_def ieee80211_iw_handler_def; -/* STA/IBSS code */ +/* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); -void ieee80211_scan_work(struct work_struct *work); -void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - struct ieee80211_rx_status *rx_status); +ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_rx_status *rx_status); +int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata); int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len); int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len); int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid); -void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta); -struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, - u8 *bssid, u8 *addr, u64 supp_rates); +void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata); int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason); int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason); -u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); -u64 ieee80211_sta_get_rates(struct ieee80211_local *local, - struct ieee802_11_elems *elems, - enum ieee80211_band band); -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, - u8 *ssid, size_t ssid_len); +void ieee80211_send_pspoll(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); + +/* IBSS code */ +int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata); +int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len); +int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len); +int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid); +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); +ieee80211_rx_result +ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, + struct ieee80211_rx_status *rx_status); +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid, u8 *addr, u32 supp_rates); /* scan/BSS handling */ +void ieee80211_scan_work(struct work_struct *work); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, - u8 *ssid, size_t ssid_len); + struct cfg80211_scan_request *req); int ieee80211_scan_results(struct ieee80211_local *local, struct iw_request_info *info, char *buf, size_t len); @@ -895,29 +968,28 @@ ieee80211_rx_result ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_rx_status *rx_status); -void ieee80211_rx_bss_list_init(struct ieee80211_local *local); -void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local); int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len); void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); +void ieee80211_scan_failed(struct ieee80211_local *local); int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, - u8 *ssid, size_t ssid_len); + struct cfg80211_scan_request *req); struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - int freq, bool beacon); -struct ieee80211_bss * -ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq, - u8 *ssid, u8 ssid_len); + struct ieee80211_channel *channel, + bool beacon); struct ieee80211_bss * ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq, u8 *ssid, u8 ssid_len); void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss); +void ieee80211_rx_bss_remove(struct ieee80211_sub_if_data *sdata, u8 *bssid, + int freq, u8 *ssid, u8 ssid_len); /* interface handling */ int ieee80211_if_add(struct ieee80211_local *local, const char *name, @@ -943,10 +1015,15 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, struct ieee80211_ht_info *hti, u16 ap_ht_cap_flags); void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn); +void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u16 initiator, u16 reason_code); void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u16 initiator, u16 reason); -void ieee80211_sta_tear_down_BA_sessions(struct ieee80211_sub_if_data *sdata, u8 *addr); +void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason); +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); @@ -959,10 +1036,25 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len); +int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_chswitch_timer(unsigned long data); +void ieee80211_chswitch_work(struct work_struct *work); +void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel_sw_ie *sw_elem, + struct ieee80211_bss *bss); +void ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, + u16 capab_info, u8 *pwr_constr_elem, + u8 pwr_constr_elem_len); + +/* Suspend/resume */ +int __ieee80211_suspend(struct ieee80211_hw *hw); +int __ieee80211_resume(struct ieee80211_hw *hw); /* utility functions/constants */ extern void *mac80211_wiphy_privid; /* for wiphy privid */ @@ -980,17 +1072,39 @@ void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, void ieee802_11_parse_elems(u8 *start, size_t len, struct ieee802_11_elems *elems); int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freq); -u64 ieee80211_mandatory_rates(struct ieee80211_local *local, +u32 ieee80211_mandatory_rates(struct ieee80211_local *local, enum ieee80211_band band); void ieee80211_dynamic_ps_enable_work(struct work_struct *work); void ieee80211_dynamic_ps_disable_work(struct work_struct *work); void ieee80211_dynamic_ps_timer(unsigned long data); +void ieee80211_send_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int powersave); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, enum queue_stop_reason reason); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, enum queue_stop_reason reason); +void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); +void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, + const u8 *bssid, int encrypt); +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + u8 *ssid, size_t ssid_len, + u8 *ie, size_t ie_len); + +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates); +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band); #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b9074824862a..f9f27b9cadbe 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -21,6 +21,23 @@ #include "mesh.h" #include "led.h" +/** + * DOC: Interface list locking + * + * The interface list in each struct ieee80211_local is protected + * three-fold: + * + * (1) modifications may only be done under the RTNL + * (2) modifications and readers are protected against each other by + * the iflist_mtx. + * (3) modifications are done in an RCU manner so atomic readers + * can traverse the list in RCU-safe blocks. + * + * As a consequence, reads (traversals) of the list can be protected + * by either the RTNL, the iflist_mtx or RCU. + */ + + static int ieee80211_change_mtu(struct net_device *dev, int new_mtu) { int meshhdrlen; @@ -219,7 +236,10 @@ static int ieee80211_open(struct net_device *dev) break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: - sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags &= ~IEEE80211_STA_PREV_BSSID_SET; + else + sdata->u.ibss.flags &= ~IEEE80211_IBSS_PREV_BSSID_SET; /* fall through */ default: conf.vif = &sdata->vif; @@ -304,11 +324,10 @@ static int ieee80211_open(struct net_device *dev) * yet be effective. Trigger execution of ieee80211_sta_work * to fix this. */ - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - queue_work(local->hw.workqueue, &ifsta->work); - } + if (sdata->vif.type == NL80211_IFTYPE_STATION) + queue_work(local->hw.workqueue, &sdata->u.mgd.work); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + queue_work(local->hw.workqueue, &sdata->u.ibss.work); netif_tx_start_all_queues(dev); @@ -345,13 +364,24 @@ static int ieee80211_stop(struct net_device *dev) list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sta->sdata == sdata) - ieee80211_sta_tear_down_BA_sessions(sdata, - sta->sta.addr); + ieee80211_sta_tear_down_BA_sessions(sta); } rcu_read_unlock(); /* + * Announce that we are leaving the network, in case we are a + * station interface type. This must be done before removing + * all stations associated with sta_info_flush, otherwise STA + * information will be gone and no announce being done. + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.state != IEEE80211_STA_MLME_DISABLED) + ieee80211_sta_deauthenticate(sdata, + WLAN_REASON_DEAUTH_LEAVING); + } + + /* * Remove all stations associated with this interface. * * This must be done before calling ops->remove_interface() @@ -383,6 +413,8 @@ static int ieee80211_stop(struct net_device *dev) atomic_dec(&local->iff_promiscs); dev_mc_unsync(local->mdev, dev); + del_timer_sync(&local->dynamic_ps_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); /* APs need special treatment */ if (sdata->vif.type == NL80211_IFTYPE_AP) { @@ -434,14 +466,9 @@ static int ieee80211_stop(struct net_device *dev) netif_addr_unlock_bh(local->mdev); break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: - /* Announce that we are leaving the network. */ - if (sdata->u.sta.state != IEEE80211_STA_MLME_DISABLED) - ieee80211_sta_deauthenticate(sdata, - WLAN_REASON_DEAUTH_LEAVING); - - memset(sdata->u.sta.bssid, 0, ETH_ALEN); - del_timer_sync(&sdata->u.sta.timer); + memset(sdata->u.mgd.bssid, 0, ETH_ALEN); + del_timer_sync(&sdata->u.mgd.chswitch_timer); + del_timer_sync(&sdata->u.mgd.timer); /* * If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again @@ -449,7 +476,8 @@ static int ieee80211_stop(struct net_device *dev) * whether the interface is running, which, at this point, * it no longer is. */ - cancel_work_sync(&sdata->u.sta.work); + cancel_work_sync(&sdata->u.mgd.work); + cancel_work_sync(&sdata->u.mgd.chswitch_work); /* * When we get here, the interface is marked down. * Call synchronize_rcu() to wait for the RX path @@ -457,12 +485,22 @@ static int ieee80211_stop(struct net_device *dev) * frames at this very time on another CPU. */ synchronize_rcu(); - skb_queue_purge(&sdata->u.sta.skb_queue); + skb_queue_purge(&sdata->u.mgd.skb_queue); - sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; - kfree(sdata->u.sta.extra_ie); - sdata->u.sta.extra_ie = NULL; - sdata->u.sta.extra_ie_len = 0; + sdata->u.mgd.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED | + IEEE80211_STA_TKIP_WEP_USED); + kfree(sdata->u.mgd.extra_ie); + sdata->u.mgd.extra_ie = NULL; + sdata->u.mgd.extra_ie_len = 0; + /* fall through */ + case NL80211_IFTYPE_ADHOC: + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + memset(sdata->u.ibss.bssid, 0, ETH_ALEN); + del_timer_sync(&sdata->u.ibss.timer); + cancel_work_sync(&sdata->u.ibss.work); + synchronize_rcu(); + skb_queue_purge(&sdata->u.ibss.skb_queue); + } /* fall through */ case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -501,7 +539,7 @@ static int ieee80211_stop(struct net_device *dev) * scan event to userspace -- the scan is incomplete. */ if (local->sw_scanning) - ieee80211_scan_completed(&local->hw); + ieee80211_scan_completed(&local->hw, true); } conf.vif = &sdata->vif; @@ -569,19 +607,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) dev_mc_sync(local->mdev, dev); } -static void ieee80211_if_setup(struct net_device *dev) -{ - ether_setup(dev); - dev->hard_start_xmit = ieee80211_subif_start_xmit; - dev->wireless_handlers = &ieee80211_iw_handler_def; - dev->set_multicast_list = ieee80211_set_multicast_list; - dev->change_mtu = ieee80211_change_mtu; - dev->open = ieee80211_open; - dev->stop = ieee80211_stop; - dev->destructor = free_netdev; - /* we will validate the address ourselves in ->open */ - dev->validate_addr = NULL; -} /* * Called when the netdev is removed or, by the code below, before * the interface type changes. @@ -621,12 +646,20 @@ static void ieee80211_teardown_sdata(struct net_device *dev) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_rmc_free(sdata); break; - case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: - kfree(sdata->u.sta.extra_ie); - kfree(sdata->u.sta.assocreq_ies); - kfree(sdata->u.sta.assocresp_ies); - kfree_skb(sdata->u.sta.probe_resp); + kfree_skb(sdata->u.ibss.probe_resp); + break; + case NL80211_IFTYPE_STATION: + kfree(sdata->u.mgd.extra_ie); + kfree(sdata->u.mgd.assocreq_ies); + kfree(sdata->u.mgd.assocresp_ies); + kfree(sdata->u.mgd.ie_probereq); + kfree(sdata->u.mgd.ie_proberesp); + kfree(sdata->u.mgd.ie_auth); + kfree(sdata->u.mgd.ie_assocreq); + kfree(sdata->u.mgd.ie_reassocreq); + kfree(sdata->u.mgd.ie_deauth); + kfree(sdata->u.mgd.ie_disassoc); break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: @@ -642,6 +675,34 @@ static void ieee80211_teardown_sdata(struct net_device *dev) WARN_ON(flushed); } +static const struct net_device_ops ieee80211_dataif_ops = { + .ndo_open = ieee80211_open, + .ndo_stop = ieee80211_stop, + .ndo_uninit = ieee80211_teardown_sdata, + .ndo_start_xmit = ieee80211_subif_start_xmit, + .ndo_set_multicast_list = ieee80211_set_multicast_list, + .ndo_change_mtu = ieee80211_change_mtu, + .ndo_set_mac_address = eth_mac_addr, +}; + +static const struct net_device_ops ieee80211_monitorif_ops = { + .ndo_open = ieee80211_open, + .ndo_stop = ieee80211_stop, + .ndo_uninit = ieee80211_teardown_sdata, + .ndo_start_xmit = ieee80211_monitor_start_xmit, + .ndo_set_multicast_list = ieee80211_set_multicast_list, + .ndo_change_mtu = ieee80211_change_mtu, + .ndo_set_mac_address = eth_mac_addr, +}; + +static void ieee80211_if_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->netdev_ops = &ieee80211_dataif_ops; + dev->wireless_handlers = &ieee80211_iw_handler_def; + dev->destructor = free_netdev; +} + /* * Helper function to initialise an interface to a specific type. */ @@ -653,7 +714,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, /* and set some type-dependent values */ sdata->vif.type = type; - sdata->dev->hard_start_xmit = ieee80211_subif_start_xmit; + sdata->dev->netdev_ops = &ieee80211_dataif_ops; sdata->wdev.iftype = type; /* only monitor differs */ @@ -665,16 +726,18 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, INIT_LIST_HEAD(&sdata->u.ap.vlans); break; case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: ieee80211_sta_setup_sdata(sdata); break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_setup_sdata(sdata); + break; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) ieee80211_mesh_init_sdata(sdata); break; case NL80211_IFTYPE_MONITOR: sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; - sdata->dev->hard_start_xmit = ieee80211_monitor_start_xmit; + sdata->dev->netdev_ops = &ieee80211_monitorif_ops; sdata->u.mntr_flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; @@ -755,6 +818,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); + ndev->features |= NETIF_F_NETNS_LOCAL; /* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */ sdata = netdev_priv(ndev); @@ -780,15 +844,15 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (ret) goto fail; - ndev->uninit = ieee80211_teardown_sdata; - if (ieee80211_vif_is_mesh(&sdata->vif) && params && params->mesh_id_len) ieee80211_sdata_set_mesh_id(sdata, params->mesh_id_len, params->mesh_id); + mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); + mutex_unlock(&local->iflist_mtx); if (new_dev) *new_dev = ndev; @@ -804,7 +868,10 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) { ASSERT_RTNL(); + mutex_lock(&sdata->local->iflist_mtx); list_del_rcu(&sdata->list); + mutex_unlock(&sdata->local->iflist_mtx); + synchronize_rcu(); unregister_netdevice(sdata->dev); } @@ -820,7 +887,16 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) ASSERT_RTNL(); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { + /* + * we cannot hold the iflist_mtx across unregister_netdevice, + * but we only need to hold it for list modifications to lock + * out readers since we're under the RTNL here as all other + * writers. + */ + mutex_lock(&local->iflist_mtx); list_del(&sdata->list); + mutex_unlock(&local->iflist_mtx); + unregister_netdevice(sdata->dev); } } diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 999f7aa42326..687acf23054d 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -18,6 +18,7 @@ #include "ieee80211_i.h" #include "debugfs_key.h" #include "aes_ccm.h" +#include "aes_cmac.h" /** @@ -47,7 +48,6 @@ */ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; -static const u8 zero_addr[ETH_ALEN]; /* key mutex: used to synchronise todo runners */ static DEFINE_MUTEX(key_mutex); @@ -108,29 +108,18 @@ static void assert_key_lock(void) WARN_ON(!mutex_is_locked(&key_mutex)); } -static const u8 *get_mac_for_key(struct ieee80211_key *key) +static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) { - const u8 *addr = bcast_addr; - - /* - * If we're an AP we won't ever receive frames with a non-WEP - * group key so we tell the driver that by using the zero MAC - * address to indicate a transmit-only key. - */ - if (key->conf.alg != ALG_WEP && - (key->sdata->vif.type == NL80211_IFTYPE_AP || - key->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) - addr = zero_addr; - if (key->sta) - addr = key->sta->sta.addr; + return &key->sta->sta; - return addr; + return NULL; } static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { - const u8 *addr; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_sta *sta; int ret; assert_key_lock(); @@ -139,11 +128,16 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!key->local->ops->set_key) return; - addr = get_mac_for_key(key); + sta = get_sta_for_key(key); + + sdata = key->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); ret = key->local->ops->set_key(local_to_hw(key->local), SET_KEY, - key->sdata->dev->dev_addr, addr, - &key->conf); + &sdata->vif, sta, &key->conf); if (!ret) { spin_lock(&todo_lock); @@ -155,12 +149,13 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) printk(KERN_ERR "mac80211-%s: failed to set key " "(%d, %pM) to hardware (%d)\n", wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, addr, ret); + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) { - const u8 *addr; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_sta *sta; int ret; assert_key_lock(); @@ -176,17 +171,22 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) } spin_unlock(&todo_lock); - addr = get_mac_for_key(key); + sta = get_sta_for_key(key); + sdata = key->sdata; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); ret = key->local->ops->set_key(local_to_hw(key->local), DISABLE_KEY, - key->sdata->dev->dev_addr, addr, - &key->conf); + &sdata->vif, sta, &key->conf); if (ret) printk(KERN_ERR "mac80211-%s: failed to remove key " "(%d, %pM) from hardware (%d)\n", wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, addr, ret); + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); spin_lock(&todo_lock); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; @@ -216,13 +216,38 @@ void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx) spin_unlock_irqrestore(&sdata->local->key_lock, flags); } +static void +__ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx) +{ + struct ieee80211_key *key = NULL; + + if (idx >= NUM_DEFAULT_KEYS && + idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + key = sdata->keys[idx]; + + rcu_assign_pointer(sdata->default_mgmt_key, key); + + if (key) + add_todo(key, KEY_FLAG_TODO_DEFMGMTKEY); +} + +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx) +{ + unsigned long flags; + + spin_lock_irqsave(&sdata->local->key_lock, flags); + __ieee80211_set_default_mgmt_key(sdata, idx); + spin_unlock_irqrestore(&sdata->local->key_lock, flags); +} + static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_key *old, struct ieee80211_key *new) { - int idx, defkey; + int idx, defkey, defmgmtkey; if (new) list_add(&new->list, &sdata->key_list); @@ -238,13 +263,19 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, idx = new->conf.keyidx; defkey = old && sdata->default_key == old; + defmgmtkey = old && sdata->default_mgmt_key == old; if (defkey && !new) __ieee80211_set_default_key(sdata, -1); + if (defmgmtkey && !new) + __ieee80211_set_default_mgmt_key(sdata, -1); rcu_assign_pointer(sdata->keys[idx], new); if (defkey && new) __ieee80211_set_default_key(sdata, new->conf.keyidx); + if (defmgmtkey && new) + __ieee80211_set_default_mgmt_key(sdata, + new->conf.keyidx); } if (old) { @@ -263,7 +294,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, { struct ieee80211_key *key; - BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS); + BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) @@ -292,6 +323,10 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, key->conf.iv_len = CCMP_HDR_LEN; key->conf.icv_len = CCMP_MIC_LEN; break; + case ALG_AES_CMAC: + key->conf.iv_len = 0; + key->conf.icv_len = sizeof(struct ieee80211_mmie); + break; } memcpy(key->conf.key, key_data, key_len); INIT_LIST_HEAD(&key->list); @@ -309,6 +344,19 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, } } + if (alg == ALG_AES_CMAC) { + /* + * Initialize AES key state here as an optimization so that + * it does not need to be initialized for every packet. + */ + key->u.aes_cmac.tfm = + ieee80211_aes_cmac_key_setup(key_data); + if (!key->u.aes_cmac.tfm) { + kfree(key); + return NULL; + } + } + return key; } @@ -352,7 +400,7 @@ void ieee80211_key_link(struct ieee80211_key *key, */ /* same here, the AP could be using QoS */ - ap = sta_info_get(key->local, key->sdata->u.sta.bssid); + ap = sta_info_get(key->local, key->sdata->u.mgd.bssid); if (ap) { if (test_sta_flags(ap, WLAN_STA_WME)) key->conf.flags |= @@ -462,6 +510,8 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) if (key->conf.alg == ALG_CCMP) ieee80211_aes_key_free(key->u.ccmp.tfm); + if (key->conf.alg == ALG_AES_CMAC) + ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); ieee80211_debugfs_key_remove(key); kfree(key); @@ -484,6 +534,7 @@ static void __ieee80211_key_todo(void) list_del_init(&key->todo); todoflags = key->flags & (KEY_FLAG_TODO_ADD_DEBUGFS | KEY_FLAG_TODO_DEFKEY | + KEY_FLAG_TODO_DEFMGMTKEY | KEY_FLAG_TODO_HWACCEL_ADD | KEY_FLAG_TODO_HWACCEL_REMOVE | KEY_FLAG_TODO_DELETE); @@ -501,6 +552,11 @@ static void __ieee80211_key_todo(void) ieee80211_debugfs_key_add_default(key->sdata); work_done = true; } + if (todoflags & KEY_FLAG_TODO_DEFMGMTKEY) { + ieee80211_debugfs_key_remove_mgmt_default(key->sdata); + ieee80211_debugfs_key_add_mgmt_default(key->sdata); + work_done = true; + } if (todoflags & KEY_FLAG_TODO_HWACCEL_ADD) { ieee80211_key_enable_hw_accel(key); work_done = true; @@ -536,6 +592,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) ieee80211_key_lock(); ieee80211_debugfs_key_remove_default(sdata); + ieee80211_debugfs_key_remove_mgmt_default(sdata); spin_lock_irqsave(&sdata->local->key_lock, flags); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 425816e0996c..215d3ef42a4f 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -46,6 +46,8 @@ struct sta_info; * acceleration. * @KEY_FLAG_TODO_DEFKEY: Key is default key and debugfs needs to be updated. * @KEY_FLAG_TODO_ADD_DEBUGFS: Key needs to be added to debugfs. + * @KEY_FLAG_TODO_DEFMGMTKEY: Key is default management key and debugfs needs + * to be updated. */ enum ieee80211_internal_key_flags { KEY_FLAG_UPLOADED_TO_HARDWARE = BIT(0), @@ -54,6 +56,7 @@ enum ieee80211_internal_key_flags { KEY_FLAG_TODO_HWACCEL_REMOVE = BIT(3), KEY_FLAG_TODO_DEFKEY = BIT(4), KEY_FLAG_TODO_ADD_DEBUGFS = BIT(5), + KEY_FLAG_TODO_DEFMGMTKEY = BIT(6), }; struct tkip_ctx { @@ -96,6 +99,16 @@ struct ieee80211_key { u8 tx_crypto_buf[6 * AES_BLOCK_LEN]; u8 rx_crypto_buf[6 * AES_BLOCK_LEN]; } ccmp; + struct { + u8 tx_pn[6]; + u8 rx_pn[6]; + struct crypto_cipher *tfm; + u32 replays; /* dot11RSNAStatsCMACReplays */ + u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_crypto_buf[2 * AES_BLOCK_LEN]; + u8 rx_crypto_buf[2 * AES_BLOCK_LEN]; + } aes_cmac; } u; /* number of times this key has been used */ @@ -114,6 +127,7 @@ struct ieee80211_key { struct dentry *tx_spec; struct dentry *rx_spec; struct dentry *replays; + struct dentry *icverrors; struct dentry *key; struct dentry *ifindex; int cnt; @@ -140,6 +154,8 @@ void ieee80211_key_link(struct ieee80211_key *key, struct sta_info *sta); void ieee80211_key_free(struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx); +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx); void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 24b14363d6e7..f38db4d37e5d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -168,24 +168,67 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed) return 0; memset(&conf, 0, sizeof(conf)); - conf.changed = changed; - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - conf.bssid = sdata->u.sta.bssid; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + conf.bssid = sdata->u.mgd.bssid; + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + conf.bssid = sdata->u.ibss.bssid; else if (sdata->vif.type == NL80211_IFTYPE_AP) conf.bssid = sdata->dev->dev_addr; else if (ieee80211_vif_is_mesh(&sdata->vif)) { - u8 zero[ETH_ALEN] = { 0 }; + static const u8 zero[ETH_ALEN] = { 0 }; conf.bssid = zero; } else { WARN_ON(1); return -EINVAL; } + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_MESH_POINT: + break; + default: + /* do not warn to simplify caller in scan.c */ + changed &= ~IEEE80211_IFCC_BEACON_ENABLED; + if (WARN_ON(changed & IEEE80211_IFCC_BEACON)) + return -EINVAL; + changed &= ~IEEE80211_IFCC_BEACON; + break; + } + + if (changed & IEEE80211_IFCC_BEACON_ENABLED) { + if (local->sw_scanning) { + conf.enable_beacon = false; + } else { + /* + * Beacon should be enabled, but AP mode must + * check whether there is a beacon configured. + */ + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + conf.enable_beacon = + !!rcu_dereference(sdata->u.ap.beacon); + break; + case NL80211_IFTYPE_ADHOC: + conf.enable_beacon = !!sdata->u.ibss.probe_resp; + break; + case NL80211_IFTYPE_MESH_POINT: + conf.enable_beacon = true; + break; + default: + /* not reached */ + WARN_ON(1); + break; + } + } + } + if (WARN_ON(!conf.bssid && (changed & IEEE80211_IFCC_BSSID))) return -EINVAL; + conf.changed = changed; + return local->ops->config_interface(local_to_hw(local), &sdata->vif, &conf); } @@ -208,26 +251,22 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) } if (chan != local->hw.conf.channel || - channel_type != local->hw.conf.ht.channel_type) { + channel_type != local->hw.conf.channel_type) { local->hw.conf.channel = chan; - local->hw.conf.ht.channel_type = channel_type; - switch (channel_type) { - case NL80211_CHAN_NO_HT: - local->hw.conf.ht.enabled = false; - break; - case NL80211_CHAN_HT20: - case NL80211_CHAN_HT40MINUS: - case NL80211_CHAN_HT40PLUS: - local->hw.conf.ht.enabled = true; - break; - } + local->hw.conf.channel_type = channel_type; changed |= IEEE80211_CONF_CHANGE_CHANNEL; } - if (!local->hw.conf.power_level) + if (local->sw_scanning) power = chan->max_power; else - power = min(chan->max_power, local->hw.conf.power_level); + power = local->power_constr_level ? + (chan->max_power - local->power_constr_level) : + chan->max_power; + + if (local->user_power_level) + power = min(power, local->user_power_level); + if (local->hw.conf.power_level != power) { changed |= IEEE80211_CONF_CHANGE_POWER; local->hw.conf.power_level = power; @@ -667,7 +706,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { struct ieee80211_local *local; - int priv_size; + int priv_size, i; struct wiphy *wiphy; /* Ensure 32-byte alignment of our private data and hw private data. @@ -695,6 +734,10 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, return NULL; wiphy->privid = mac80211_wiphy_privid; + wiphy->max_scan_ssids = 4; + /* Yes, putting cfg80211_bss into ieee80211_bss is a hack */ + wiphy->bss_priv_size = sizeof(struct ieee80211_bss) - + sizeof(struct cfg80211_bss); local = wiphy_priv(wiphy); local->hw.wiphy = wiphy; @@ -722,6 +765,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, local->hw.conf.radio_enabled = true; INIT_LIST_HEAD(&local->interfaces); + mutex_init(&local->iflist_mtx); spin_lock_init(&local->key_lock); @@ -736,6 +780,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, setup_timer(&local->dynamic_ps_timer, ieee80211_dynamic_ps_timer, (unsigned long) local); + for (i = 0; i < IEEE80211_MAX_AMPDU_QUEUES; i++) + local->ampdu_ac_queue[i] = -1; + /* using an s8 won't work with more than that */ + BUILD_BUG_ON(IEEE80211_MAX_AMPDU_QUEUES > 127); + sta_info_init(local); tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, @@ -754,6 +803,23 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, } EXPORT_SYMBOL(ieee80211_alloc_hw); +static const struct net_device_ops ieee80211_master_ops = { + .ndo_start_xmit = ieee80211_master_start_xmit, + .ndo_open = ieee80211_master_open, + .ndo_stop = ieee80211_master_stop, + .ndo_set_multicast_list = ieee80211_master_set_multicast_list, + .ndo_select_queue = ieee80211_select_queue, +}; + +static void ieee80211_master_setup(struct net_device *mdev) +{ + mdev->type = ARPHRD_IEEE80211; + mdev->netdev_ops = &ieee80211_master_ops; + mdev->header_ops = &ieee80211_header_ops; + mdev->tx_queue_len = 1000; + mdev->addr_len = ETH_ALEN; +} + int ieee80211_register_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); @@ -761,25 +827,33 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) enum ieee80211_band band; struct net_device *mdev; struct ieee80211_master_priv *mpriv; + int channels, i, j; /* * generic code guarantees at least one band, * set this very early because much code assumes * that hw.conf.channel is assigned */ + channels = 0; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; - if (sband) { + if (sband && !local->oper_channel) { /* init channel we're on */ local->hw.conf.channel = local->oper_channel = local->scan_channel = &sband->channels[0]; - break; } + if (sband) + channels += sband->n_channels; } + local->int_scan_req.n_channels = channels; + local->int_scan_req.channels = kzalloc(sizeof(void *) * channels, GFP_KERNEL); + if (!local->int_scan_req.channels) + return -ENOMEM; + /* if low-level driver supports AP, we also support VLAN */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); @@ -787,9 +861,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 always supports monitor */ local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + result = wiphy_register(local->hw.wiphy); if (result < 0) - return result; + goto fail_wiphy_register; /* * We use the number of queues for feature tests (QoS, HT) internally @@ -803,8 +882,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) hw->ampdu_queues = 0; mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv), - "wmaster%d", ether_setup, - ieee80211_num_queues(hw)); + "wmaster%d", ieee80211_master_setup, + hw->queues); if (!mdev) goto fail_mdev_alloc; @@ -812,17 +891,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) mpriv->local = local; local->mdev = mdev; - ieee80211_rx_bss_list_init(local); - - mdev->hard_start_xmit = ieee80211_master_start_xmit; - mdev->open = ieee80211_master_open; - mdev->stop = ieee80211_master_stop; - mdev->type = ARPHRD_IEEE80211; - mdev->header_ops = &ieee80211_header_ops; - mdev->set_multicast_list = ieee80211_master_set_multicast_list; - local->hw.workqueue = - create_freezeable_workqueue(wiphy_name(local->hw.wiphy)); + create_singlethread_workqueue(wiphy_name(local->hw.wiphy)); if (!local->hw.workqueue) { result = -ENOMEM; goto fail_workqueue; @@ -846,15 +916,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.conf.listen_interval = local->hw.max_listen_interval; - local->wstats_flags |= local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC | - IEEE80211_HW_SIGNAL_DB | - IEEE80211_HW_SIGNAL_DBM) ? - IW_QUAL_QUAL_UPDATED : IW_QUAL_QUAL_INVALID; - local->wstats_flags |= local->hw.flags & IEEE80211_HW_NOISE_DBM ? - IW_QUAL_NOISE_UPDATED : IW_QUAL_NOISE_INVALID; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - local->wstats_flags |= IW_QUAL_DBM; - result = sta_info_start(local); if (result < 0) goto fail_sta_info; @@ -866,6 +927,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy)); + local->mdev->features |= NETIF_F_NETNS_LOCAL; result = register_netdevice(local->mdev); if (result < 0) @@ -887,8 +949,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_wep; } - local->mdev->select_queue = ieee80211_select_queue; - /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) { result = ieee80211_if_add(local, "wlan%d", NULL, @@ -902,6 +962,20 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_init(local); + /* alloc internal scan request */ + i = 0; + local->int_scan_req.ssids = &local->scan_ssid; + local->int_scan_req.n_ssids = 1; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!hw->wiphy->bands[band]) + continue; + for (j = 0; j < hw->wiphy->bands[band]->n_channels; j++) { + local->int_scan_req.channels[i] = + &hw->wiphy->bands[band]->channels[j]; + i++; + } + } + return 0; fail_wep: @@ -920,6 +994,8 @@ fail_workqueue: free_netdev(local->mdev); fail_mdev_alloc: wiphy_unregister(local->hw.wiphy); +fail_wiphy_register: + kfree(local->int_scan_req.channels); return result; } EXPORT_SYMBOL(ieee80211_register_hw); @@ -947,7 +1023,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) rtnl_unlock(); - ieee80211_rx_bss_list_deinit(local); ieee80211_clear_tx_pending(local); sta_info_stop(local); rate_control_deinitialize(local); @@ -965,6 +1040,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) ieee80211_wep_free(local); ieee80211_led_exit(local); free_netdev(local->mdev); + kfree(local->int_scan_req.channels); } EXPORT_SYMBOL(ieee80211_unregister_hw); @@ -972,6 +1048,8 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); + mutex_destroy(&local->iflist_mtx); + wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 82f568e94365..9a3e5de0410a 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -275,16 +275,6 @@ u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata, struct mesh_t & tbl->hash_mask; } -u8 mesh_id_hash(u8 *mesh_id, int mesh_id_len) -{ - if (!mesh_id_len) - return 1; - else if (mesh_id_len == 1) - return (u8) mesh_id[0]; - else - return (u8) (mesh_id[0] + 2 * mesh_id[1]); -} - struct mesh_table *mesh_table_alloc(int size_order) { int i; @@ -442,7 +432,8 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ifmsh->housekeeping = true; queue_work(local->hw.workqueue, &ifmsh->work); - ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON); + ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON | + IEEE80211_IFCC_BEACON_ENABLED); } void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) @@ -476,7 +467,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee802_11_elems elems; struct ieee80211_channel *channel; - u64 supp_rates = 0; + u32 supp_rates = 0; size_t baselen; int freq; enum ieee80211_band band = rx_status->band; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index c197ab545e54..d891d7ddccd7 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -24,15 +24,15 @@ * * * - * @MESH_PATH_ACTIVE: the mesh path is can be used for forwarding - * @MESH_PATH_RESOLVED: the discovery process is running for this mesh path + * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding + * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path * @MESH_PATH_DSN_VALID: the mesh path contains a valid destination sequence * number * @MESH_PATH_FIXED: the mesh path has been manually set and should not be * modified * @MESH_PATH_RESOLVED: the mesh path can has been resolved * - * MESH_PATH_RESOLVED and MESH_PATH_DELETE are used by the mesh path timer to + * MESH_PATH_RESOLVED is used by the mesh path timer to * decide when to stop or cancel the mesh path discovery. */ enum mesh_path_flags { @@ -196,7 +196,6 @@ struct mesh_rmc { /* Public interfaces */ /* Various */ -u8 mesh_id_hash(u8 *mesh_id, int mesh_id_len); int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, struct ieee80211_sub_if_data *sdata); @@ -236,14 +235,13 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata); /* Mesh plinks */ -void mesh_neighbour_update(u8 *hw_addr, u64 rates, +void mesh_neighbour_update(u8 *hw_addr, u32 rates, struct ieee80211_sub_if_data *sdata, bool add); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); void mesh_plink_broken(struct sta_info *sta); void mesh_plink_deactivate(struct sta_info *sta); int mesh_plink_open(struct sta_info *sta); -int mesh_plink_close(struct sta_info *sta); void mesh_plink_block(struct sta_info *sta); void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 71fe60961230..60b35accda91 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -58,7 +58,6 @@ static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae) #define PERR_IE_DST_ADDR(x) (x + 2) #define PERR_IE_DST_DSN(x) u32_field_get(x, 8, 0); -#define TU_TO_EXP_TIME(x) (jiffies + msecs_to_jiffies(x * 1024 / 1000)) #define MSEC_TO_TU(x) (x*1000/1024) #define DSN_GT(x, y) ((long) (y) - (long) (x) < 0) #define DSN_LT(x, y) ((long) (x) - (long) (y) < 0) @@ -149,7 +148,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, pos += ETH_ALEN; memcpy(pos, &dst_dsn, 4); - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_tx_skb(sdata, skb, 1); return 0; } @@ -198,7 +197,7 @@ int mesh_path_error_tx(u8 *dst, __le32 dst_dsn, u8 *ra, pos += ETH_ALEN; memcpy(pos, &dst_dsn, 4); - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_tx_skb(sdata, skb, 1); return 0; } @@ -759,7 +758,7 @@ enddiscovery: } /** - * ieee80211s_lookup_nexthop - put the appropriate next hop on a mesh frame + * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame * * @skb: 802.11 frame to be sent * @sdata: network subif the frame will be sent through diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 1159bdb4119c..a8bbdeca013a 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -93,7 +93,7 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta) * on it in the lifecycle management section! */ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata, - u8 *hw_addr, u64 rates) + u8 *hw_addr, u32 rates) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; @@ -218,11 +218,11 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, memcpy(pos, &reason, 2); } - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_tx_skb(sdata, skb, 1); return 0; } -void mesh_neighbour_update(u8 *hw_addr, u64 rates, struct ieee80211_sub_if_data *sdata, +void mesh_neighbour_update(u8 *hw_addr, u32 rates, struct ieee80211_sub_if_data *sdata, bool peer_accepting_plinks) { struct ieee80211_local *local = sdata->local; @@ -361,36 +361,6 @@ void mesh_plink_block(struct sta_info *sta) spin_unlock_bh(&sta->lock); } -int mesh_plink_close(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - __le16 llid, plid, reason; - - mpl_dbg("Mesh plink: closing link with %pM\n", sta->sta.addr); - spin_lock_bh(&sta->lock); - sta->reason = cpu_to_le16(MESH_LINK_CANCELLED); - reason = sta->reason; - - if (sta->plink_state == PLINK_LISTEN || - sta->plink_state == PLINK_BLOCKED) { - mesh_plink_fsm_restart(sta); - spin_unlock_bh(&sta->lock); - return 0; - } else if (sta->plink_state == PLINK_ESTAB) { - __mesh_plink_deactivate(sta); - /* The timer should not be running */ - mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata)); - } else if (!mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata))) - sta->ignore_plink_timer = true; - - sta->plink_state = PLINK_HOLDING; - llid = sta->llid; - plid = sta->plid; - spin_unlock_bh(&sta->lock); - mesh_plink_frame_tx(sta->sdata, PLINK_CLOSE, sta->sta.addr, llid, - plid, reason); - return 0; -} void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) @@ -477,7 +447,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m spin_lock_bh(&sta->lock); } else if (!sta) { /* ftype == PLINK_OPEN */ - u64 rates; + u32 rates; if (!mesh_plink_free_count(sdata)) { mpl_dbg("Mesh plink error: no more free plinks\n"); rcu_read_unlock(); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2b890af01ba4..841b8450b3de 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1,6 +1,6 @@ /* * BSS client mode implementation - * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> + * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> @@ -15,11 +15,8 @@ #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> -#include <linux/wireless.h> -#include <linux/random.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> -#include <net/iw_handler.h> #include <net/mac80211.h> #include <asm/unaligned.h> @@ -35,15 +32,6 @@ #define IEEE80211_MONITORING_INTERVAL (2 * HZ) #define IEEE80211_PROBE_INTERVAL (60 * HZ) #define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ) -#define IEEE80211_SCAN_INTERVAL (2 * HZ) -#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) -#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) - -#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) -#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) - -#define IEEE80211_IBSS_MAX_STA_ENTRIES 128 - /* utils */ static int ecw2cw(int ecw) @@ -55,10 +43,10 @@ static u8 *ieee80211_bss_get_ie(struct ieee80211_bss *bss, u8 ie) { u8 *end, *pos; - pos = bss->ies; + pos = bss->cbss.information_elements; if (pos == NULL) return NULL; - end = pos + bss->ies_len; + end = pos + bss->cbss.len_information_elements; while (pos + 1 < end) { if (pos + 2 + pos[1] > end) @@ -73,7 +61,7 @@ static u8 *ieee80211_bss_get_ie(struct ieee80211_bss *bss, u8 ie) static int ieee80211_compatible_rates(struct ieee80211_bss *bss, struct ieee80211_supported_band *sband, - u64 *rates) + u32 *rates) { int i, j, count; *rates = 0; @@ -92,160 +80,40 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss, return count; } -/* also used by mesh code */ -u64 ieee80211_sta_get_rates(struct ieee80211_local *local, - struct ieee802_11_elems *elems, - enum ieee80211_band band) -{ - struct ieee80211_supported_band *sband; - struct ieee80211_rate *bitrates; - size_t num_rates; - u64 supp_rates; - int i, j; - sband = local->hw.wiphy->bands[band]; - - if (!sband) { - WARN_ON(1); - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - } - - bitrates = sband->bitrates; - num_rates = sband->n_bitrates; - supp_rates = 0; - for (i = 0; i < elems->supp_rates_len + - elems->ext_supp_rates_len; i++) { - u8 rate = 0; - int own_rate; - if (i < elems->supp_rates_len) - rate = elems->supp_rates[i]; - else if (elems->ext_supp_rates) - rate = elems->ext_supp_rates - [i - elems->supp_rates_len]; - own_rate = 5 * (rate & 0x7f); - for (j = 0; j < num_rates; j++) - if (bitrates[j].bitrate == own_rate) - supp_rates |= BIT(j); - } - return supp_rates; -} - /* frame sending functions */ -/* also used by scanning code */ -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, - u8 *ssid, size_t ssid_len) +static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u8 *pos, *supp_rates, *esupp_rates = NULL; - int i; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for probe " - "request\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); - memset(mgmt, 0, 24); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_REQ); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - if (dst) { - memcpy(mgmt->da, dst, ETH_ALEN); - memcpy(mgmt->bssid, dst, ETH_ALEN); - } else { - memset(mgmt->da, 0xff, ETH_ALEN); - memset(mgmt->bssid, 0xff, ETH_ALEN); - } - pos = skb_put(skb, 2 + ssid_len); - *pos++ = WLAN_EID_SSID; - *pos++ = ssid_len; - memcpy(pos, ssid, ssid_len); - - supp_rates = skb_put(skb, 2); - supp_rates[0] = WLAN_EID_SUPP_RATES; - supp_rates[1] = 0; - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - - for (i = 0; i < sband->n_bitrates; i++) { - struct ieee80211_rate *rate = &sband->bitrates[i]; - if (esupp_rates) { - pos = skb_put(skb, 1); - esupp_rates[1]++; - } else if (supp_rates[1] == 8) { - esupp_rates = skb_put(skb, 3); - esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES; - esupp_rates[1] = 1; - pos = &esupp_rates[2]; - } else { - pos = skb_put(skb, 1); - supp_rates[1]++; - } - *pos = rate->bitrate / 5; - } - - ieee80211_tx_skb(sdata, skb, 0); + if (ies) + memcpy(skb_put(skb, ies_len), ies, ies_len); } -static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - int transaction, u8 *extra, size_t extra_len, - int encrypt) +static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 6 + extra_len); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for auth " - "frame\n", sdata->dev->name); - return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); - memset(mgmt, 0, 24 + 6); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_AUTH); - if (encrypt) - mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg); - mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); - ifsta->auth_transaction = transaction + 1; - mgmt->u.auth.status_code = cpu_to_le16(0); - if (extra) - memcpy(skb_put(skb, extra_len), extra, extra_len); - - ieee80211_tx_skb(sdata, skb, encrypt); -} - -static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u8 *pos, *ies, *ht_ie; + u8 *pos, *ies, *ht_ie, *e_ies; int i, len, count, rates_len, supp_rates_len; u16 capab; struct ieee80211_bss *bss; int wmm = 0; struct ieee80211_supported_band *sband; - u64 rates = 0; + u32 rates = 0; + size_t e_ies_len; + + if (ifmgd->flags & IEEE80211_IBSS_PREV_BSSID_SET) { + e_ies = sdata->u.mgd.ie_reassocreq; + e_ies_len = sdata->u.mgd.ie_reassocreq_len; + } else { + e_ies = sdata->u.mgd.ie_assocreq; + e_ies_len = sdata->u.mgd.ie_assocreq_len; + } skb = dev_alloc_skb(local->hw.extra_tx_headroom + - sizeof(*mgmt) + 200 + ifsta->extra_ie_len + - ifsta->ssid_len); + sizeof(*mgmt) + 200 + ifmgd->extra_ie_len + + ifmgd->ssid_len + e_ies_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for assoc " "frame\n", sdata->dev->name); @@ -255,7 +123,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - capab = ifsta->capab; + capab = ifmgd->capab; if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) { if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) @@ -264,11 +132,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (bss) { - if (bss->capability & WLAN_CAPABILITY_PRIVACY) + if (bss->cbss.capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; if (bss->wmm_used) wmm = 1; @@ -279,7 +147,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, * b-only mode) */ rates_len = ieee80211_compatible_rates(bss, sband, &rates); - if ((bss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && + if ((bss->cbss.capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; @@ -291,18 +159,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN); memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN); - if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) { + if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) { skb_put(skb, 10); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ); mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab); mgmt->u.reassoc_req.listen_interval = cpu_to_le16(local->hw.conf.listen_interval); - memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid, + memcpy(mgmt->u.reassoc_req.current_ap, ifmgd->prev_bssid, ETH_ALEN); } else { skb_put(skb, 4); @@ -314,10 +182,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } /* SSID */ - ies = pos = skb_put(skb, 2 + ifsta->ssid_len); + ies = pos = skb_put(skb, 2 + ifmgd->ssid_len); *pos++ = WLAN_EID_SSID; - *pos++ = ifsta->ssid_len; - memcpy(pos, ifsta->ssid, ifsta->ssid_len); + *pos++ = ifmgd->ssid_len; + memcpy(pos, ifmgd->ssid, ifmgd->ssid_len); /* add all rates which were marked to be used above */ supp_rates_len = rates_len; @@ -372,12 +240,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } } - if (ifsta->extra_ie) { - pos = skb_put(skb, ifsta->extra_ie_len); - memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len); + if (ifmgd->extra_ie) { + pos = skb_put(skb, ifmgd->extra_ie_len); + memcpy(pos, ifmgd->extra_ie, ifmgd->extra_ie_len); } - if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) { + if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) { pos = skb_put(skb, 9); *pos++ = WLAN_EID_VENDOR_SPECIFIC; *pos++ = 7; /* len */ @@ -391,10 +259,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, } /* wmm support is a must to HT */ - if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) && + /* + * IEEE802.11n does not allow TKIP/WEP as pairwise + * ciphers in HT mode. We still associate in non-ht + * mode (11a/b/g) if any one of these ciphers is + * configured as pairwise. + */ + if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) && sband->ht_cap.ht_supported && (ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) && - ht_ie[1] >= sizeof(struct ieee80211_ht_info)) { + ht_ie[1] >= sizeof(struct ieee80211_ht_info) && + (!(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))) { struct ieee80211_ht_info *ht_info = (struct ieee80211_ht_info *)(ht_ie + 2); u16 cap = sband->ht_cap.cap; @@ -429,11 +304,13 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); } - kfree(ifsta->assocreq_ies); - ifsta->assocreq_ies_len = (skb->data + skb->len) - ies; - ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL); - if (ifsta->assocreq_ies) - memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len); + add_extra_ies(skb, e_ies, e_ies_len); + + kfree(ifmgd->assocreq_ies); + ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies; + ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL); + if (ifmgd->assocreq_ies) + memcpy(ifmgd->assocreq_ies, ies, ifmgd->assocreq_ies_len); ieee80211_tx_skb(sdata, skb, 0); } @@ -443,11 +320,22 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; + u8 *ies; + size_t ies_len; + + if (stype == IEEE80211_STYPE_DEAUTH) { + ies = sdata->u.mgd.ie_deauth; + ies_len = sdata->u.mgd.ie_deauth_len; + } else { + ies = sdata->u.mgd.ie_disassoc; + ies_len = sdata->u.mgd.ie_disassoc_len; + } - skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt)); + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + + ies_len); if (!skb) { printk(KERN_DEBUG "%s: failed to allocate buffer for " "deauth/disassoc frame\n", sdata->dev->name); @@ -457,40 +345,53 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); - memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN); memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); skb_put(skb, 2); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); - ieee80211_tx_skb(sdata, skb, 0); + add_extra_ies(skb, ies, ies_len); + + ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED); } -/* MLME */ -static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, - struct ieee80211_bss *bss) +void ieee80211_send_pspoll(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; - int i, have_higher_than_11mbit = 0; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_pspoll *pspoll; + struct sk_buff *skb; + u16 fc; - /* cf. IEEE 802.11 9.2.12 */ - for (i = 0; i < bss->supp_rates_len; i++) - if ((bss->supp_rates[i] & 0x7f) * 5 > 110) - have_higher_than_11mbit = 1; + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for " + "pspoll frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); - if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && - have_higher_than_11mbit) - sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; - else - sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; + pspoll = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*pspoll)); + memset(pspoll, 0, sizeof(*pspoll)); + fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM; + pspoll->frame_control = cpu_to_le16(fc); + pspoll->aid = cpu_to_le16(ifmgd->aid); - ieee80211_set_wmm_default(sdata); + /* aid in PS-Poll has its two MSBs each set to 1 */ + pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); + + memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); + memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN); + + ieee80211_tx_skb(sdata, skb, 0); } +/* MLME */ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_if_sta *ifsta, + struct ieee80211_if_managed *ifmgd, u8 *wmm_param, size_t wmm_param_len) { struct ieee80211_tx_queue_params params; @@ -498,7 +399,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, int count; u8 *pos; - if (!(ifsta->flags & IEEE80211_STA_WMM_ENABLED)) + if (!(ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) return; if (!wmm_param) @@ -507,18 +408,15 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return; count = wmm_param[6] & 0x0f; - if (count == ifsta->wmm_last_param_set) + if (count == ifmgd->wmm_last_param_set) return; - ifsta->wmm_last_param_set = count; + ifmgd->wmm_last_param_set = count; pos = wmm_param + 8; left = wmm_param_len - 8; memset(¶ms, 0, sizeof(params)); - if (!local->ops->conf_tx) - return; - local->wmm_acm = 0; for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; @@ -526,26 +424,26 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, int queue; switch (aci) { - case 1: + case 1: /* AC_BK */ queue = 3; if (acm) - local->wmm_acm |= BIT(0) | BIT(3); + local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ break; - case 2: + case 2: /* AC_VI */ queue = 1; if (acm) - local->wmm_acm |= BIT(4) | BIT(5); + local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ break; - case 3: + case 3: /* AC_VO */ queue = 0; if (acm) - local->wmm_acm |= BIT(6) | BIT(7); + local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ break; - case 0: + case 0: /* AC_BE */ default: queue = 2; if (acm) - local->wmm_acm |= BIT(1) | BIT(2); + local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ break; } @@ -559,21 +457,41 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, local->mdev->name, queue, aci, acm, params.aifs, params.cw_min, params.cw_max, params.txop); #endif - /* TODO: handle ACM (block TX, fallback to next lowest allowed - * AC for now) */ - if (local->ops->conf_tx(local_to_hw(local), queue, ¶ms)) { + if (local->ops->conf_tx && + local->ops->conf_tx(local_to_hw(local), queue, ¶ms)) { printk(KERN_DEBUG "%s: failed to set TX queue " "parameters for queue %d\n", local->mdev->name, queue); } } } +static bool ieee80211_check_tim(struct ieee802_11_elems *elems, u16 aid) +{ + u8 mask; + u8 index, indexn1, indexn2; + struct ieee80211_tim_ie *tim = (struct ieee80211_tim_ie *) elems->tim; + + aid &= 0x3fff; + index = aid / 8; + mask = 1 << (aid & 7); + + indexn1 = tim->bitmap_ctrl & 0xfe; + indexn2 = elems->tim_len + indexn1 - 4; + + if (index < indexn1 || index > indexn2) + return false; + + index -= indexn1; + + return !!(tim->virtual_map[index] & mask); +} + static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; #endif u32 changed = 0; bool use_protection; @@ -596,7 +514,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n", sdata->dev->name, use_protection ? "enabled" : "disabled", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_cts_prot = use_protection; @@ -610,7 +528,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, " (BSSID=%pM)\n", sdata->dev->name, use_short_preamble ? "short" : "long", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_short_preamble = use_short_preamble; @@ -624,7 +542,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, " (BSSID=%pM)\n", sdata->dev->name, use_short_slot ? "short" : "long", - ifsta->bssid); + ifmgd->bssid); } #endif bss_conf->use_short_slot = use_short_slot; @@ -634,57 +552,57 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, return changed; } -static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata) { union iwreq_data wrqu; + memset(&wrqu, 0, sizeof(wrqu)); - if (ifsta->flags & IEEE80211_STA_ASSOCIATED) - memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN); + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) + memcpy(wrqu.ap_addr.sa_data, sdata->u.mgd.bssid, ETH_ALEN); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); } -static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; char *buf; size_t len; int i; union iwreq_data wrqu; - if (!ifsta->assocreq_ies && !ifsta->assocresp_ies) + if (!ifmgd->assocreq_ies && !ifmgd->assocresp_ies) return; - buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len + - ifsta->assocresp_ies_len), GFP_KERNEL); + buf = kmalloc(50 + 2 * (ifmgd->assocreq_ies_len + + ifmgd->assocresp_ies_len), GFP_KERNEL); if (!buf) return; len = sprintf(buf, "ASSOCINFO("); - if (ifsta->assocreq_ies) { + if (ifmgd->assocreq_ies) { len += sprintf(buf + len, "ReqIEs="); - for (i = 0; i < ifsta->assocreq_ies_len; i++) { + for (i = 0; i < ifmgd->assocreq_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocreq_ies[i]); + ifmgd->assocreq_ies[i]); } } - if (ifsta->assocresp_ies) { - if (ifsta->assocreq_ies) + if (ifmgd->assocresp_ies) { + if (ifmgd->assocreq_ies) len += sprintf(buf + len, " "); len += sprintf(buf + len, "RespIEs="); - for (i = 0; i < ifsta->assocresp_ies_len; i++) { + for (i = 0; i < ifmgd->assocresp_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocresp_ies[i]); + ifmgd->assocresp_ies[i]); } } len += sprintf(buf + len, ")"); if (len > IW_CUSTOM_MAX) { len = sprintf(buf, "ASSOCRESPIE="); - for (i = 0; i < ifsta->assocresp_ies_len; i++) { + for (i = 0; i < ifmgd->assocresp_ies_len; i++) { len += sprintf(buf + len, "%02x", - ifsta->assocresp_ies[i]); + ifmgd->assocresp_ies[i]); } } @@ -699,40 +617,37 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata, static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, u32 bss_info_changed) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_conf *conf = &local_to_hw(local)->conf; struct ieee80211_bss *bss; bss_info_changed |= BSS_CHANGED_ASSOC; - ifsta->flags |= IEEE80211_STA_ASSOCIATED; + ifmgd->flags |= IEEE80211_STA_ASSOCIATED; - if (sdata->vif.type != NL80211_IFTYPE_STATION) - return; - - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, conf->channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (bss) { /* set timing information */ - sdata->vif.bss_conf.beacon_int = bss->beacon_int; - sdata->vif.bss_conf.timestamp = bss->timestamp; + sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval; + sdata->vif.bss_conf.timestamp = bss->cbss.tsf; sdata->vif.bss_conf.dtim_period = bss->dtim_period; bss_info_changed |= ieee80211_handle_bss_capability(sdata, - bss->capability, bss->has_erp_value, bss->erp_value); + bss->cbss.capability, bss->has_erp_value, bss->erp_value); ieee80211_rx_bss_put(local, bss); } - ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET; - memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN); - ieee80211_sta_send_associnfo(sdata, ifsta); + ifmgd->flags |= IEEE80211_STA_PREV_BSSID_SET; + memcpy(ifmgd->prev_bssid, sdata->u.mgd.bssid, ETH_ALEN); + ieee80211_sta_send_associnfo(sdata); - ifsta->last_probe = jiffies; + ifmgd->last_probe = jiffies; ieee80211_led_assoc(local, 1); sdata->vif.bss_conf.assoc = 1; @@ -745,72 +660,90 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, ieee80211_bss_info_change_notify(sdata, bss_info_changed); if (local->powersave) { - if (local->dynamic_ps_timeout > 0) + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) && + local->hw.conf.dynamic_ps_timeout > 0) { mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(local->dynamic_ps_timeout)); - else { + msecs_to_jiffies( + local->hw.conf.dynamic_ps_timeout)); + } else { + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + ieee80211_send_nullfunc(local, sdata, 1); conf->flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, - IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } } netif_tx_start_all_queues(sdata->dev); netif_carrier_on(sdata->dev); - ieee80211_sta_send_apinfo(sdata, ifsta); + ieee80211_sta_send_apinfo(sdata); } -static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata) { - ifsta->direct_probe_tries++; - if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->direct_probe_tries++; + if (ifmgd->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) { printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); + + /* + * Most likely AP is not in the range so remove the + * bss information associated to the AP + */ + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, + sdata->local->hw.conf.channel->center_freq, + ifmgd->ssid, ifmgd->ssid_len); return; } printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n", - sdata->dev->name, ifsta->bssid, - ifsta->direct_probe_tries); + sdata->dev->name, ifmgd->bssid, + ifmgd->direct_probe_tries); - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; + ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; - set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request); + set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifmgd->request); /* Direct probe is sent to broadcast address as some APs * will not answer to direct packet in unassociated state. */ ieee80211_send_probe_req(sdata, NULL, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len, NULL, 0); - mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT); } -static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata) { - ifsta->auth_tries++; - if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->auth_tries++; + if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) { printk(KERN_DEBUG "%s: authentication with AP %pM" " timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, + sdata->local->hw.conf.channel->center_freq, + ifmgd->ssid, ifmgd->ssid_len); return; } - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; printk(KERN_DEBUG "%s: authenticate with AP %pM\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); - ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0); + ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0, + ifmgd->bssid, 0); + ifmgd->auth_transaction = 2; - mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT); } /* @@ -818,32 +751,33 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata, * if self disconnected or a reason code from the AP. */ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, bool deauth, - bool self_disconnected, u16 reason) + bool deauth, bool self_disconnected, + u16 reason) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *sta; u32 changed = 0, config_changed = 0; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; } if (deauth) { - ifsta->direct_probe_tries = 0; - ifsta->auth_tries = 0; + ifmgd->direct_probe_tries = 0; + ifmgd->auth_tries = 0; } - ifsta->assoc_scan_tries = 0; - ifsta->assoc_tries = 0; + ifmgd->assoc_scan_tries = 0; + ifmgd->assoc_tries = 0; netif_tx_stop_all_queues(sdata->dev); netif_carrier_off(sdata->dev); - ieee80211_sta_tear_down_BA_sessions(sdata, sta->sta.addr); + ieee80211_sta_tear_down_BA_sessions(sta); if (self_disconnected) { if (deauth) @@ -854,23 +788,28 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, IEEE80211_STYPE_DISASSOC, reason); } - ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; + ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED; changed |= ieee80211_reset_erp_info(sdata); ieee80211_led_assoc(local, 0); changed |= BSS_CHANGED_ASSOC; sdata->vif.bss_conf.assoc = false; - ieee80211_sta_send_apinfo(sdata, ifsta); + ieee80211_sta_send_apinfo(sdata); - if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) - ifsta->state = IEEE80211_STA_MLME_DISABLED; + if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) { + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, + sdata->local->hw.conf.channel->center_freq, + ifmgd->ssid, ifmgd->ssid_len); + } rcu_read_unlock(); - local->hw.conf.ht.enabled = false; + /* channel(_type) changes are handled by ieee80211_hw_config */ local->oper_channel_type = NL80211_CHAN_NO_HT; - config_changed |= IEEE80211_CONF_CHANGE_HT; + + local->power_constr_level = 0; del_timer_sync(&local->dynamic_ps_timer); cancel_work_sync(&local->dynamic_ps_enable_work); @@ -885,7 +824,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; @@ -906,27 +845,27 @@ static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata) return 1; } -static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; int bss_privacy; int wep_privacy; int privacy_invoked; - if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL)) + if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL)) return 0; - bss = ieee80211_rx_bss_get(local, ifsta->bssid, + bss = ieee80211_rx_bss_get(local, ifmgd->bssid, local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); + ifmgd->ssid, ifmgd->ssid_len); if (!bss) return 0; - bss_privacy = !!(bss->capability & WLAN_CAPABILITY_PRIVACY); + bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY); wep_privacy = !!ieee80211_sta_wep_configured(sdata); - privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED); + privacy_invoked = !!(ifmgd->flags & IEEE80211_STA_PRIVACY_INVOKED); ieee80211_rx_bss_put(local, bss); @@ -936,38 +875,42 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata, return 1; } -static void ieee80211_associate(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_associate(struct ieee80211_sub_if_data *sdata) { - ifsta->assoc_tries++; - if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->assoc_tries++; + if (ifmgd->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) { printk(KERN_DEBUG "%s: association with AP %pM" " timed out\n", - sdata->dev->name, ifsta->bssid); - ifsta->state = IEEE80211_STA_MLME_DISABLED; - ieee80211_sta_send_apinfo(sdata, ifsta); + sdata->dev->name, ifmgd->bssid); + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + ieee80211_sta_send_apinfo(sdata); + ieee80211_rx_bss_remove(sdata, ifmgd->bssid, + sdata->local->hw.conf.channel->center_freq, + ifmgd->ssid, ifmgd->ssid_len); return; } - ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE; printk(KERN_DEBUG "%s: associate with AP %pM\n", - sdata->dev->name, ifsta->bssid); - if (ieee80211_privacy_mismatch(sdata, ifsta)) { + sdata->dev->name, ifmgd->bssid); + if (ieee80211_privacy_mismatch(sdata)) { printk(KERN_DEBUG "%s: mismatch in privacy configuration and " "mixed-cell disabled - abort association\n", sdata->dev->name); - ifsta->state = IEEE80211_STA_MLME_DISABLED; + ifmgd->state = IEEE80211_STA_MLME_DISABLED; return; } - ieee80211_send_assoc(sdata, ifsta); + ieee80211_send_assoc(sdata); - mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT); + mod_timer(&ifmgd->timer, jiffies + IEEE80211_ASSOC_TIMEOUT); } -static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_associated(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *sta; int disassoc; @@ -977,38 +920,40 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, * for better APs. */ /* TODO: remove expired BSSes */ - ifsta->state = IEEE80211_STA_MLME_ASSOCIATED; + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATED; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); disassoc = 1; } else { disassoc = 0; if (time_after(jiffies, sta->last_rx + IEEE80211_MONITORING_INTERVAL)) { - if (ifsta->flags & IEEE80211_STA_PROBEREQ_POLL) { + if (ifmgd->flags & IEEE80211_STA_PROBEREQ_POLL) { printk(KERN_DEBUG "%s: No ProbeResp from " "current AP %pM - assume out of " "range\n", - sdata->dev->name, ifsta->bssid); + sdata->dev->name, ifmgd->bssid); disassoc = 1; } else - ieee80211_send_probe_req(sdata, ifsta->bssid, - ifsta->ssid, - ifsta->ssid_len); - ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL; + ieee80211_send_probe_req(sdata, ifmgd->bssid, + ifmgd->ssid, + ifmgd->ssid_len, + NULL, 0); + ifmgd->flags ^= IEEE80211_STA_PROBEREQ_POLL; } else { - ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL; - if (time_after(jiffies, ifsta->last_probe + + ifmgd->flags &= ~IEEE80211_STA_PROBEREQ_POLL; + if (time_after(jiffies, ifmgd->last_probe + IEEE80211_PROBE_INTERVAL)) { - ifsta->last_probe = jiffies; - ieee80211_send_probe_req(sdata, ifsta->bssid, - ifsta->ssid, - ifsta->ssid_len); + ifmgd->last_probe = jiffies; + ieee80211_send_probe_req(sdata, ifmgd->bssid, + ifmgd->ssid, + ifmgd->ssid_len, + NULL, 0); } } } @@ -1016,25 +961,25 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (disassoc) - ieee80211_set_disassoc(sdata, ifsta, true, true, + ieee80211_set_disassoc(sdata, true, true, WLAN_REASON_PREV_AUTH_NOT_VALID); else - mod_timer(&ifsta->timer, jiffies + + mod_timer(&ifmgd->timer, jiffies + IEEE80211_MONITORING_INTERVAL); } -static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name); - ifsta->flags |= IEEE80211_STA_AUTHENTICATED; - ieee80211_associate(sdata, ifsta); + ifmgd->flags |= IEEE80211_STA_AUTHENTICATED; + ieee80211_associate(sdata); } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { @@ -1045,50 +990,37 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); if (!elems.challenge) return; - ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2, - elems.challenge_len + 2, 1); + ieee80211_send_auth(sdata, 3, sdata->u.mgd.auth_alg, + elems.challenge - 2, elems.challenge_len + 2, + sdata->u.mgd.bssid, 1); + sdata->u.mgd.auth_transaction = 4; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 auth_alg, auth_transaction, status_code; - if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE && - sdata->vif.type != NL80211_IFTYPE_ADHOC) + if (ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE) return; if (len < 24 + 6) return; - if (sdata->vif.type != NL80211_IFTYPE_ADHOC && - memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0) return; - if (sdata->vif.type != NL80211_IFTYPE_ADHOC && - memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { - /* - * IEEE 802.11 standard does not require authentication in IBSS - * networks and most implementations do not seem to use it. - * However, try to reply to authentication attempts if someone - * has actually implemented this. - */ - if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1) - return; - ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0); - } - - if (auth_alg != ifsta->auth_alg || - auth_transaction != ifsta->auth_transaction) + if (auth_alg != ifmgd->auth_alg || + auth_transaction != ifmgd->auth_transaction) return; if (status_code != WLAN_STATUS_SUCCESS) { @@ -1097,15 +1029,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, const int num_algs = ARRAY_SIZE(algs); int i, pos; algs[0] = algs[1] = algs[2] = 0xff; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN) algs[0] = WLAN_AUTH_OPEN; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) algs[1] = WLAN_AUTH_SHARED_KEY; - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP) algs[2] = WLAN_AUTH_LEAP; - if (ifsta->auth_alg == WLAN_AUTH_OPEN) + if (ifmgd->auth_alg == WLAN_AUTH_OPEN) pos = 0; - else if (ifsta->auth_alg == WLAN_AUTH_SHARED_KEY) + else if (ifmgd->auth_alg == WLAN_AUTH_SHARED_KEY) pos = 1; else pos = 2; @@ -1113,105 +1045,105 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, pos++; if (pos >= num_algs) pos = 0; - if (algs[pos] == ifsta->auth_alg || + if (algs[pos] == ifmgd->auth_alg || algs[pos] == 0xff) continue; if (algs[pos] == WLAN_AUTH_SHARED_KEY && !ieee80211_sta_wep_configured(sdata)) continue; - ifsta->auth_alg = algs[pos]; + ifmgd->auth_alg = algs[pos]; break; } } return; } - switch (ifsta->auth_alg) { + switch (ifmgd->auth_alg) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: - ieee80211_auth_completed(sdata, ifsta); + ieee80211_auth_completed(sdata); break; case WLAN_AUTH_SHARED_KEY: - if (ifsta->auth_transaction == 4) - ieee80211_auth_completed(sdata, ifsta); + if (ifmgd->auth_transaction == 4) + ieee80211_auth_completed(sdata); else - ieee80211_auth_challenge(sdata, ifsta, mgmt, len); + ieee80211_auth_challenge(sdata, mgmt, len); break; } } static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; if (len < 24 + 2) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN)) return; reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); - if (ifsta->flags & IEEE80211_STA_AUTHENTICATED) + if (ifmgd->flags & IEEE80211_STA_AUTHENTICATED) printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE || - ifsta->state == IEEE80211_STA_MLME_ASSOCIATE || - ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; - mod_timer(&ifsta->timer, jiffies + + if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE || + ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; + mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); } - ieee80211_set_disassoc(sdata, ifsta, true, false, 0); - ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED; + ieee80211_set_disassoc(sdata, true, false, 0); + ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED; } static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; if (len < 24 + 2) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN)) return; reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); - if (ifsta->flags & IEEE80211_STA_ASSOCIATED) + if (ifmgd->flags & IEEE80211_STA_ASSOCIATED) printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n", sdata->dev->name, reason_code); - if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { - ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; - mod_timer(&ifsta->timer, jiffies + + if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) { + ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE; + mod_timer(&ifmgd->timer, jiffies + IEEE80211_RETRY_AUTH_INTERVAL); } - ieee80211_set_disassoc(sdata, ifsta, false, false, reason_code); + ieee80211_set_disassoc(sdata, false, false, reason_code); } static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len, int reassoc) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct sta_info *sta; - u64 rates, basic_rates; + u32 rates, basic_rates; u16 capab_info, status_code, aid; struct ieee802_11_elems elems; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; @@ -1224,13 +1156,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, /* AssocResp and ReassocResp have identical structure, so process both * of them in this function. */ - if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE) + if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE) return; if (len < 24 + 6) return; - if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) + if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0) return; capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); @@ -1242,13 +1174,31 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, sdata->dev->name, reassoc ? "Rea" : "A", mgmt->sa, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); + pos = mgmt->u.assoc_resp.variable; + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + + if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && + elems.timeout_int && elems.timeout_int_len == 5 && + elems.timeout_int[0] == WLAN_TIMEOUT_ASSOC_COMEBACK) { + u32 tu, ms; + tu = get_unaligned_le32(elems.timeout_int + 1); + ms = tu * 1024 / 1000; + printk(KERN_DEBUG "%s: AP rejected association temporarily; " + "comeback duration %u TU (%u ms)\n", + sdata->dev->name, tu, ms); + if (ms > IEEE80211_ASSOC_TIMEOUT) + mod_timer(&ifmgd->timer, + jiffies + msecs_to_jiffies(ms)); + return; + } + if (status_code != WLAN_STATUS_SUCCESS) { printk(KERN_DEBUG "%s: AP denied association (code=%d)\n", sdata->dev->name, status_code); /* if this was a reassociation, ensure we try a "full" * association next time. This works around some broken APs * which do not correctly reject reassociation requests. */ - ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; + ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET; return; } @@ -1257,9 +1207,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, "set\n", sdata->dev->name, aid); aid &= ~(BIT(15) | BIT(14)); - pos = mgmt->u.assoc_resp.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); - if (!elems.supp_rates) { printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n", sdata->dev->name); @@ -1267,40 +1214,29 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } printk(KERN_DEBUG "%s: associated\n", sdata->dev->name); - ifsta->aid = aid; - ifsta->ap_capab = capab_info; + ifmgd->aid = aid; + ifmgd->ap_capab = capab_info; - kfree(ifsta->assocresp_ies); - ifsta->assocresp_ies_len = len - (pos - (u8 *) mgmt); - ifsta->assocresp_ies = kmalloc(ifsta->assocresp_ies_len, GFP_KERNEL); - if (ifsta->assocresp_ies) - memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len); + kfree(ifmgd->assocresp_ies); + ifmgd->assocresp_ies_len = len - (pos - (u8 *) mgmt); + ifmgd->assocresp_ies = kmalloc(ifmgd->assocresp_ies_len, GFP_KERNEL); + if (ifmgd->assocresp_ies) + memcpy(ifmgd->assocresp_ies, pos, ifmgd->assocresp_ies_len); rcu_read_lock(); /* Add STA entry for the AP */ - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { - struct ieee80211_bss *bss; - newsta = true; - sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC); + sta = sta_info_alloc(sdata, ifmgd->bssid, GFP_ATOMIC); if (!sta) { printk(KERN_DEBUG "%s: failed to alloc STA entry for" " the AP\n", sdata->dev->name); rcu_read_unlock(); return; } - bss = ieee80211_rx_bss_get(local, ifsta->bssid, - local->hw.conf.channel->center_freq, - ifsta->ssid, ifsta->ssid_len); - if (bss) { - sta->last_signal = bss->signal; - sta->last_qual = bss->qual; - sta->last_noise = bss->noise; - ieee80211_rx_bss_put(local, bss); - } /* update new sta with its last rx activity */ sta->last_rx = jiffies; @@ -1367,7 +1303,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, else sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; - if (elems.ht_cap_elem) + /* If TKIP/WEP is used, no need to parse AP's HT capabilities */ + if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) ieee80211_ht_cap_ie_to_sta_ht_cap(sband, elems.ht_cap_elem, &sta->sta.ht_cap); @@ -1375,6 +1312,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, rate_control_rate_init(sta); + if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) + set_sta_flags(sta, WLAN_STA_MFP); + if (elems.wmm_param) set_sta_flags(sta, WLAN_STA_WME); @@ -1391,11 +1331,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (elems.wmm_param) - ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, + ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param, elems.wmm_param_len); if (elems.ht_info_elem && elems.wmm_param && - (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) + (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) && + !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem, ap_ht_cap_flags); @@ -1403,136 +1344,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * ieee80211_set_associated() will tell the driver */ bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; - ieee80211_set_associated(sdata, ifsta, changed); + ieee80211_set_associated(sdata, changed); - ieee80211_associated(sdata, ifsta); + ieee80211_associated(sdata); } -static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - struct ieee80211_bss *bss) -{ - struct ieee80211_local *local = sdata->local; - int res, rates, i, j; - struct sk_buff *skb; - struct ieee80211_mgmt *mgmt; - u8 *pos; - struct ieee80211_supported_band *sband; - union iwreq_data wrqu; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for probe " - "response\n", sdata->dev->name); - return -ENOMEM; - } - - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - - /* Remove possible STA entries from other IBSS networks. */ - sta_info_flush_delayed(sdata); - - if (local->ops->reset_tsf) { - /* Reset own TSF to allow time synchronization work. */ - local->ops->reset_tsf(local_to_hw(local)); - } - memcpy(ifsta->bssid, bss->bssid, ETH_ALEN); - res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID); - if (res) - return res; - - local->hw.conf.beacon_int = bss->beacon_int >= 10 ? bss->beacon_int : 10; - - sdata->drop_unencrypted = bss->capability & - WLAN_CAPABILITY_PRIVACY ? 1 : 0; - - res = ieee80211_set_freq(sdata, bss->freq); - - if (res) - return res; - - /* Build IBSS probe response */ - - skb_reserve(skb, local->hw.extra_tx_headroom); - - mgmt = (struct ieee80211_mgmt *) - skb_put(skb, 24 + sizeof(mgmt->u.beacon)); - memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_PROBE_RESP); - memset(mgmt->da, 0xff, ETH_ALEN); - memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); - memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); - mgmt->u.beacon.beacon_int = - cpu_to_le16(local->hw.conf.beacon_int); - mgmt->u.beacon.timestamp = cpu_to_le64(bss->timestamp); - mgmt->u.beacon.capab_info = cpu_to_le16(bss->capability); - - pos = skb_put(skb, 2 + ifsta->ssid_len); - *pos++ = WLAN_EID_SSID; - *pos++ = ifsta->ssid_len; - memcpy(pos, ifsta->ssid, ifsta->ssid_len); - - rates = bss->supp_rates_len; - if (rates > 8) - rates = 8; - pos = skb_put(skb, 2 + rates); - *pos++ = WLAN_EID_SUPP_RATES; - *pos++ = rates; - memcpy(pos, bss->supp_rates, rates); - - if (bss->band == IEEE80211_BAND_2GHZ) { - pos = skb_put(skb, 2 + 1); - *pos++ = WLAN_EID_DS_PARAMS; - *pos++ = 1; - *pos++ = ieee80211_frequency_to_channel(bss->freq); - } - - pos = skb_put(skb, 2 + 2); - *pos++ = WLAN_EID_IBSS_PARAMS; - *pos++ = 2; - /* FIX: set ATIM window based on scan results */ - *pos++ = 0; - *pos++ = 0; - - if (bss->supp_rates_len > 8) { - rates = bss->supp_rates_len - 8; - pos = skb_put(skb, 2 + rates); - *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = rates; - memcpy(pos, &bss->supp_rates[8], rates); - } - - ifsta->probe_resp = skb; - - ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON); - - - rates = 0; - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; - for (i = 0; i < bss->supp_rates_len; i++) { - int bitrate = (bss->supp_rates[i] & 0x7f) * 5; - for (j = 0; j < sband->n_bitrates; j++) - if (sband->bitrates[j].bitrate == bitrate) - rates |= BIT(j); - } - ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates; - - ieee80211_sta_def_wmm_params(sdata, bss); - - ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED; - mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); - - ieee80211_led_assoc(local, true); - - memset(&wrqu, 0, sizeof(wrqu)); - memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN); - wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); - - return res; -} - static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, @@ -1543,11 +1360,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int freq; struct ieee80211_bss *bss; - struct sta_info *sta; struct ieee80211_channel *channel; - u64 beacon_timestamp, rx_timestamp; - u64 supp_rates = 0; - enum ieee80211_band band = rx_status->band; if (elems->ds_params && elems->ds_params_len == 1) freq = ieee80211_channel_to_frequency(elems->ds_params[0]); @@ -1559,112 +1372,16 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && - memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) { - supp_rates = ieee80211_sta_get_rates(local, elems, band); - - rcu_read_lock(); - - sta = sta_info_get(local, mgmt->sa); - if (sta) { - u64 prev_rates; - - prev_rates = sta->sta.supp_rates[band]; - /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(local, band); - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - if (sta->sta.supp_rates[band] != prev_rates) - printk(KERN_DEBUG "%s: updated supp_rates set " - "for %pM based on beacon info (0x%llx | " - "0x%llx -> 0x%llx)\n", - sdata->dev->name, - sta->sta.addr, - (unsigned long long) prev_rates, - (unsigned long long) supp_rates, - (unsigned long long) sta->sta.supp_rates[band]); -#endif - } else { - ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); - } - - rcu_read_unlock(); - } - bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, - freq, beacon); + channel, beacon); if (!bss) return; - /* was just updated in ieee80211_bss_info_update */ - beacon_timestamp = bss->timestamp; - - /* - * In STA mode, the remaining parameters should not be overridden - * by beacons because they're not necessarily accurate there. - */ - if (sdata->vif.type != NL80211_IFTYPE_ADHOC && - bss->last_probe_resp && beacon) { - ieee80211_rx_bss_put(local, bss); - return; - } - - /* check if we need to merge IBSS */ - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && beacon && - bss->capability & WLAN_CAPABILITY_IBSS && - bss->freq == local->oper_channel->center_freq && - elems->ssid_len == sdata->u.sta.ssid_len && - memcmp(elems->ssid, sdata->u.sta.ssid, - sdata->u.sta.ssid_len) == 0) { - if (rx_status->flag & RX_FLAG_TSFT) { - /* in order for correct IBSS merging we need mactime - * - * since mactime is defined as the time the first data - * symbol of the frame hits the PHY, and the timestamp - * of the beacon is defined as "the time that the data - * symbol containing the first bit of the timestamp is - * transmitted to the PHY plus the transmitting STA’s - * delays through its local PHY from the MAC-PHY - * interface to its interface with the WM" - * (802.11 11.1.2) - equals the time this bit arrives at - * the receiver - we have to take into account the - * offset between the two. - * e.g: at 1 MBit that means mactime is 192 usec earlier - * (=24 bytes * 8 usecs/byte) than the beacon timestamp. - */ - int rate; - if (rx_status->flag & RX_FLAG_HT) { - rate = 65; /* TODO: HT rates */ - } else { - rate = local->hw.wiphy->bands[band]-> - bitrates[rx_status->rate_idx].bitrate; - } - rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate); - } else if (local && local->ops && local->ops->get_tsf) - /* second best option: get current TSF */ - rx_timestamp = local->ops->get_tsf(local_to_hw(local)); - else - /* can't merge without knowing the TSF */ - rx_timestamp = -1LLU; -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "RX beacon SA=%pM BSSID=" - "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", - mgmt->sa, mgmt->bssid, - (unsigned long long)rx_timestamp, - (unsigned long long)beacon_timestamp, - (unsigned long long)(rx_timestamp - beacon_timestamp), - jiffies); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - if (beacon_timestamp > rx_timestamp) { -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: beacon TSF higher than " - "local TSF - IBSS merge with BSSID %pM\n", - sdata->dev->name, mgmt->bssid); -#endif - ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss); - ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates); - } + if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) && + (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN) == 0)) { + struct ieee80211_channel_sw_ie *sw_elem = + (struct ieee80211_channel_sw_ie *)elems->ch_switch_elem; + ieee80211_process_chanswitch(sdata, sw_elem, bss); } ieee80211_rx_bss_put(local, bss); @@ -1678,7 +1395,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, { size_t baselen; struct ieee802_11_elems elems; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN)) return; /* ignore ProbeResp to foreign address */ @@ -1694,25 +1410,24 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* direct probe may be part of the association flow */ if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE, - &ifsta->request)) { + &sdata->u.mgd.request)) { printk(KERN_DEBUG "%s direct probe responded\n", sdata->dev->name); - ieee80211_authenticate(sdata, ifsta); + ieee80211_authenticate(sdata); } } - static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; size_t baselen; struct ieee802_11_elems elems; struct ieee80211_local *local = sdata->local; u32 changed = 0; - bool erp_valid; + bool erp_valid, directed_tim; u8 erp_value = 0; /* Process beacon from the current BSS */ @@ -1726,15 +1441,43 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_STATION) return; - ifsta = &sdata->u.sta; - if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED) || - memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) + ifmgd = &sdata->u.mgd; + + if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED) || + memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0) + return; + + if (rx_status->freq != local->hw.conf.channel->center_freq) return; - ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, + ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param, elems.wmm_param_len); + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { + directed_tim = ieee80211_check_tim(&elems, ifmgd->aid); + + if (directed_tim) { + if (local->hw.conf.dynamic_ps_timeout > 0) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + ieee80211_send_nullfunc(local, sdata, 0); + } else { + local->pspolling = true; + + /* + * Here is assumed that the driver will be + * able to send ps-poll frame and receive a + * response even though power save mode is + * enabled, but some drivers might require + * to disable power save here. This needs + * to be investigated. + */ + ieee80211_send_pspoll(local, sdata); + } + } + } if (elems.erp_info && elems.erp_info_len >= 1) { erp_valid = true; @@ -1747,14 +1490,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, erp_valid, erp_value); - if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param) { + if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param && + !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) { struct sta_info *sta; struct ieee80211_supported_band *sband; u16 ap_ht_cap_flags; rcu_read_lock(); - sta = sta_info_get(local, ifsta->bssid); + sta = sta_info_get(local, ifmgd->bssid); if (!sta) { rcu_read_unlock(); return; @@ -1778,92 +1522,28 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, * for the BSSID we are associated to */ regulatory_hint_11d(local->hw.wiphy, elems.country_elem, elems.country_elem_len); - } - - ieee80211_bss_info_change_notify(sdata, changed); -} - -static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta, - struct ieee80211_mgmt *mgmt, - size_t len, - struct ieee80211_rx_status *rx_status) -{ - struct ieee80211_local *local = sdata->local; - int tx_last_beacon; - struct sk_buff *skb; - struct ieee80211_mgmt *resp; - u8 *pos, *end; - - if (sdata->vif.type != NL80211_IFTYPE_ADHOC || - ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED || - len < 24 + 2 || !ifsta->probe_resp) - return; - - if (local->ops->tx_last_beacon) - tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local)); - else - tx_last_beacon = 1; - -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM" - " (tx_last_beacon=%d)\n", - sdata->dev->name, mgmt->sa, mgmt->da, - mgmt->bssid, tx_last_beacon); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - if (!tx_last_beacon) - return; - - if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0 && - memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0) - return; - - end = ((u8 *) mgmt) + len; - pos = mgmt->u.probe_req.variable; - if (pos[0] != WLAN_EID_SSID || - pos + 2 + pos[1] > end) { -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq " - "from %pM\n", - sdata->dev->name, mgmt->sa); -#endif - return; + /* TODO: IBSS also needs this */ + if (elems.pwr_constr_elem) + ieee80211_handle_pwr_constr(sdata, + le16_to_cpu(mgmt->u.probe_resp.capab_info), + elems.pwr_constr_elem, + elems.pwr_constr_elem_len); } - if (pos[1] != 0 && - (pos[1] != ifsta->ssid_len || - memcmp(pos + 2, ifsta->ssid, ifsta->ssid_len) != 0)) { - /* Ignore ProbeReq for foreign SSID */ - return; - } - - /* Reply with ProbeResp */ - skb = skb_copy(ifsta->probe_resp, GFP_KERNEL); - if (!skb) - return; - resp = (struct ieee80211_mgmt *) skb->data; - memcpy(resp->da, mgmt->sa, ETH_ALEN); -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n", - sdata->dev->name, resp->da); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_bss_info_change_notify(sdata, changed); } -void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - struct ieee80211_rx_status *rx_status) +ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_rx_status *rx_status) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta; struct ieee80211_mgmt *mgmt; u16 fc; if (skb->len < 24) - goto fail; - - ifsta = &sdata->u.sta; + return RX_DROP_MONITOR; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); @@ -1878,113 +1558,68 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff * case IEEE80211_STYPE_REASSOC_RESP: case IEEE80211_STYPE_DEAUTH: case IEEE80211_STYPE_DISASSOC: - skb_queue_tail(&ifsta->skb_queue, skb); - queue_work(local->hw.workqueue, &ifsta->work); - return; + skb_queue_tail(&sdata->u.mgd.skb_queue, skb); + queue_work(local->hw.workqueue, &sdata->u.mgd.work); + return RX_QUEUED; } - fail: - kfree_skb(skb); + return RX_DROP_MONITOR; } static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status; - struct ieee80211_if_sta *ifsta; struct ieee80211_mgmt *mgmt; u16 fc; - ifsta = &sdata->u.sta; - rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); switch (fc & IEEE80211_FCTL_STYPE) { - case IEEE80211_STYPE_PROBE_REQ: - ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt, skb->len, - rx_status); - break; case IEEE80211_STYPE_PROBE_RESP: - ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, rx_status); + ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, + rx_status); break; case IEEE80211_STYPE_BEACON: - ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status); + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); break; case IEEE80211_STYPE_AUTH: - ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len); + ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 0); + ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 0); break; case IEEE80211_STYPE_REASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 1); + ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 1); break; case IEEE80211_STYPE_DEAUTH: - ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len); + ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DISASSOC: - ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt, skb->len); + ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); break; } kfree_skb(skb); } - -static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - int active = 0; - struct sta_info *sta; - - rcu_read_lock(); - - list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (sta->sdata == sdata && - time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, - jiffies)) { - active++; - break; - } - } - - rcu_read_unlock(); - - return active; -} - - -static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); - - ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT); - if (ieee80211_sta_active_ibss(sdata)) - return; - - printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " - "IBSS networks with same SSID (merge)\n", sdata->dev->name); - ieee80211_request_scan(sdata, ifsta->ssid, ifsta->ssid_len); -} - - static void ieee80211_sta_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); } -static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; if (local->ops->reset_tsf) { @@ -1992,298 +1627,109 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata, local->ops->reset_tsf(local_to_hw(local)); } - ifsta->wmm_last_param_set = -1; /* allow any WMM update */ + ifmgd->wmm_last_param_set = -1; /* allow any WMM update */ - if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) - ifsta->auth_alg = WLAN_AUTH_OPEN; - else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) - ifsta->auth_alg = WLAN_AUTH_SHARED_KEY; - else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) - ifsta->auth_alg = WLAN_AUTH_LEAP; + if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN) + ifmgd->auth_alg = WLAN_AUTH_OPEN; + else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) + ifmgd->auth_alg = WLAN_AUTH_SHARED_KEY; + else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP) + ifmgd->auth_alg = WLAN_AUTH_LEAP; else - ifsta->auth_alg = WLAN_AUTH_OPEN; - ifsta->auth_transaction = -1; - ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; - ifsta->assoc_scan_tries = 0; - ifsta->direct_probe_tries = 0; - ifsta->auth_tries = 0; - ifsta->assoc_tries = 0; + ifmgd->auth_alg = WLAN_AUTH_OPEN; + ifmgd->auth_transaction = -1; + ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED; + ifmgd->assoc_scan_tries = 0; + ifmgd->direct_probe_tries = 0; + ifmgd->auth_tries = 0; + ifmgd->assoc_tries = 0; netif_tx_stop_all_queues(sdata->dev); netif_carrier_off(sdata->dev); } - -static int ieee80211_sta_match_ssid(struct ieee80211_if_sta *ifsta, - const char *ssid, int ssid_len) -{ - int tmp, hidden_ssid; - - if (ssid_len == ifsta->ssid_len && - !memcmp(ifsta->ssid, ssid, ssid_len)) - return 1; - - if (ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) - return 0; - - hidden_ssid = 1; - tmp = ssid_len; - while (tmp--) { - if (ssid[tmp] != '\0') { - hidden_ssid = 0; - break; - } - } - - if (hidden_ssid && (ifsta->ssid_len == ssid_len || ssid_len == 0)) - return 1; - - if (ssid_len == 1 && ssid[0] == ' ') - return 1; - - return 0; -} - -static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; - struct ieee80211_supported_band *sband; - u8 bssid[ETH_ALEN], *pos; - int i; - int ret; - -#if 0 - /* Easier testing, use fixed BSSID. */ - memset(bssid, 0xfe, ETH_ALEN); -#else - /* Generate random, not broadcast, locally administered BSSID. Mix in - * own MAC address to make sure that devices that do not have proper - * random number generator get different BSSID. */ - get_random_bytes(bssid, ETH_ALEN); - for (i = 0; i < ETH_ALEN; i++) - bssid[i] ^= sdata->dev->dev_addr[i]; - bssid[0] &= ~0x01; - bssid[0] |= 0x02; -#endif - - printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n", - sdata->dev->name, bssid); - - bss = ieee80211_rx_bss_add(local, bssid, - local->hw.conf.channel->center_freq, - sdata->u.sta.ssid, sdata->u.sta.ssid_len); - if (!bss) - return -ENOMEM; - - bss->band = local->hw.conf.channel->band; - sband = local->hw.wiphy->bands[bss->band]; - - if (local->hw.conf.beacon_int == 0) - local->hw.conf.beacon_int = 100; - bss->beacon_int = local->hw.conf.beacon_int; - bss->last_update = jiffies; - bss->capability = WLAN_CAPABILITY_IBSS; + u8 *bssid = ifmgd->bssid, *ssid = ifmgd->ssid; + u8 ssid_len = ifmgd->ssid_len; + u16 capa_mask = WLAN_CAPABILITY_ESS; + u16 capa_val = WLAN_CAPABILITY_ESS; + struct ieee80211_channel *chan = local->oper_channel; - if (sdata->default_key) - bss->capability |= WLAN_CAPABILITY_PRIVACY; - else - sdata->drop_unencrypted = 0; - - bss->supp_rates_len = sband->n_bitrates; - pos = bss->supp_rates; - for (i = 0; i < sband->n_bitrates; i++) { - int rate = sband->bitrates[i].bitrate; - *pos++ = (u8) (rate / 5); + if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL | + IEEE80211_STA_AUTO_BSSID_SEL | + IEEE80211_STA_AUTO_CHANNEL_SEL)) { + capa_mask |= WLAN_CAPABILITY_PRIVACY; + if (sdata->default_key) + capa_val |= WLAN_CAPABILITY_PRIVACY; } - ret = ieee80211_sta_join_ibss(sdata, ifsta, bss); - ieee80211_rx_bss_put(local, bss); - return ret; -} - + if (ifmgd->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) + chan = NULL; -static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_bss *bss; - int found = 0; - u8 bssid[ETH_ALEN]; - int active_ibss; + if (ifmgd->flags & IEEE80211_STA_AUTO_BSSID_SEL) + bssid = NULL; - if (ifsta->ssid_len == 0) - return -EINVAL; - - active_ibss = ieee80211_sta_active_ibss(sdata); -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n", - sdata->dev->name, active_ibss); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - spin_lock_bh(&local->bss_lock); - list_for_each_entry(bss, &local->bss_list, list) { - if (ifsta->ssid_len != bss->ssid_len || - memcmp(ifsta->ssid, bss->ssid, bss->ssid_len) != 0 - || !(bss->capability & WLAN_CAPABILITY_IBSS)) - continue; -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG " bssid=%pM found\n", bss->bssid); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - memcpy(bssid, bss->bssid, ETH_ALEN); - found = 1; - if (active_ibss || memcmp(bssid, ifsta->bssid, ETH_ALEN) != 0) - break; + if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) { + ssid = NULL; + ssid_len = 0; } - spin_unlock_bh(&local->bss_lock); -#ifdef CONFIG_MAC80211_IBSS_DEBUG - if (found) - printk(KERN_DEBUG " sta_find_ibss: selected %pM current " - "%pM\n", bssid, ifsta->bssid); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + bss = (void *)cfg80211_get_bss(local->hw.wiphy, chan, + bssid, ssid, ssid_len, + capa_mask, capa_val); - if (found && memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0) { - int ret; - int search_freq; - - if (ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) - search_freq = bss->freq; + if (bss) { + ieee80211_set_freq(sdata, bss->cbss.channel->center_freq); + if (!(ifmgd->flags & IEEE80211_STA_SSID_SET)) + ieee80211_sta_set_ssid(sdata, bss->ssid, + bss->ssid_len); + ieee80211_sta_set_bssid(sdata, bss->cbss.bssid); + ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len, + bss->supp_rates); + if (sdata->u.mgd.mfp == IEEE80211_MFP_REQUIRED) + sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED; else - search_freq = local->hw.conf.channel->center_freq; - - bss = ieee80211_rx_bss_get(local, bssid, search_freq, - ifsta->ssid, ifsta->ssid_len); - if (!bss) - goto dont_join; - - printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" - " based on configured SSID\n", - sdata->dev->name, bssid); - ret = ieee80211_sta_join_ibss(sdata, ifsta, bss); - ieee80211_rx_bss_put(local, bss); - return ret; - } - -dont_join: -#ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG " did not try to join ibss\n"); -#endif /* CONFIG_MAC80211_IBSS_DEBUG */ - - /* Selected IBSS not found in current scan results - try to scan */ - if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED && - !ieee80211_sta_active_ibss(sdata)) { - mod_timer(&ifsta->timer, jiffies + - IEEE80211_IBSS_MERGE_INTERVAL); - } else if (time_after(jiffies, local->last_scan_completed + - IEEE80211_SCAN_INTERVAL)) { - printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " - "join\n", sdata->dev->name); - return ieee80211_request_scan(sdata, ifsta->ssid, - ifsta->ssid_len); - } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) { - int interval = IEEE80211_SCAN_INTERVAL; - - if (time_after(jiffies, ifsta->ibss_join_req + - IEEE80211_IBSS_JOIN_TIMEOUT)) { - if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) && - (!(local->oper_channel->flags & - IEEE80211_CHAN_NO_IBSS))) - return ieee80211_sta_create_ibss(sdata, ifsta); - if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) { - printk(KERN_DEBUG "%s: IBSS not allowed on" - " %d MHz\n", sdata->dev->name, - local->hw.conf.channel->center_freq); - } - - /* No IBSS found - decrease scan interval and continue - * scanning. */ - interval = IEEE80211_SCAN_INTERVAL_SLOW; - } - - ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH; - mod_timer(&ifsta->timer, jiffies + interval); - return 0; - } - - return 0; -} - - -static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_bss *bss, *selected = NULL; - int top_rssi = 0, freq; - - spin_lock_bh(&local->bss_lock); - freq = local->oper_channel->center_freq; - list_for_each_entry(bss, &local->bss_list, list) { - if (!(bss->capability & WLAN_CAPABILITY_ESS)) - continue; - - if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL | - IEEE80211_STA_AUTO_BSSID_SEL | - IEEE80211_STA_AUTO_CHANNEL_SEL)) && - (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^ - !!sdata->default_key)) - continue; - - if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) && - bss->freq != freq) - continue; - - if (!(ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) && - memcmp(bss->bssid, ifsta->bssid, ETH_ALEN)) - continue; - - if (!(ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) && - !ieee80211_sta_match_ssid(ifsta, bss->ssid, bss->ssid_len)) - continue; - - if (!selected || top_rssi < bss->signal) { - selected = bss; - top_rssi = bss->signal; - } - } - if (selected) - atomic_inc(&selected->users); - spin_unlock_bh(&local->bss_lock); - - if (selected) { - ieee80211_set_freq(sdata, selected->freq); - if (!(ifsta->flags & IEEE80211_STA_SSID_SET)) - ieee80211_sta_set_ssid(sdata, selected->ssid, - selected->ssid_len); - ieee80211_sta_set_bssid(sdata, selected->bssid); - ieee80211_sta_def_wmm_params(sdata, selected); + sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED; /* Send out direct probe if no probe resp was received or * the one we have is outdated */ - if (!selected->last_probe_resp || - time_after(jiffies, selected->last_probe_resp + if (!bss->last_probe_resp || + time_after(jiffies, bss->last_probe_resp + IEEE80211_SCAN_RESULT_EXPIRE)) - ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; + ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE; else - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; - ieee80211_rx_bss_put(local, selected); - ieee80211_sta_reset_auth(sdata, ifsta); + ieee80211_rx_bss_put(local, bss); + ieee80211_sta_reset_auth(sdata); return 0; } else { - if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { - ifsta->assoc_scan_tries++; - if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) - ieee80211_start_scan(sdata, NULL, 0); + if (ifmgd->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { + ifmgd->assoc_scan_tries++; + /* XXX maybe racy? */ + if (local->scan_req) + return -1; + memcpy(local->int_scan_req.ssids[0].ssid, + ifmgd->ssid, IEEE80211_MAX_SSID_LEN); + if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) + local->int_scan_req.ssids[0].ssid_len = 0; else - ieee80211_start_scan(sdata, ifsta->ssid, - ifsta->ssid_len); - ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; - set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); - } else - ifsta->state = IEEE80211_STA_MLME_DISABLED; + local->int_scan_req.ssids[0].ssid_len = ifmgd->ssid_len; + + if (ieee80211_start_scan(sdata, &local->int_scan_req)) + ieee80211_scan_failed(local); + + ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE; + set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); + } else { + ifmgd->assoc_scan_tries = 0; + ifmgd->state = IEEE80211_STA_MLME_DISABLED; + } } return -1; } @@ -2292,9 +1738,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata, static void ieee80211_sta_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, u.sta.work); + container_of(work, struct ieee80211_sub_if_data, u.mgd.work); struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; struct sk_buff *skb; if (!netif_running(sdata->dev)) @@ -2303,61 +1749,60 @@ static void ieee80211_sta_work(struct work_struct *work) if (local->sw_scanning || local->hw_scanning) return; - if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC)) + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - ifsta = &sdata->u.sta; + ifmgd = &sdata->u.mgd; - while ((skb = skb_dequeue(&ifsta->skb_queue))) + while ((skb = skb_dequeue(&ifmgd->skb_queue))) ieee80211_sta_rx_queued_mgmt(sdata, skb); - if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE && - ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE && - ifsta->state != IEEE80211_STA_MLME_ASSOCIATE && - test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) { - ieee80211_start_scan(sdata, ifsta->scan_ssid, - ifsta->scan_ssid_len); + if (ifmgd->state != IEEE80211_STA_MLME_DIRECT_PROBE && + ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE && + ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE && + test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request)) { + /* + * The call to ieee80211_start_scan can fail but ieee80211_request_scan + * (which queued ieee80211_sta_work) did not return an error. Thus, call + * ieee80211_scan_failed here if ieee80211_start_scan fails in order to + * notify the scan requester. + */ + if (ieee80211_start_scan(sdata, local->scan_req)) + ieee80211_scan_failed(local); return; } - if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) { - if (ieee80211_sta_config_auth(sdata, ifsta)) + if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request)) { + if (ieee80211_sta_config_auth(sdata)) return; - clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); - } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request)) + clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request); + } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request)) return; - switch (ifsta->state) { + switch (ifmgd->state) { case IEEE80211_STA_MLME_DISABLED: break; case IEEE80211_STA_MLME_DIRECT_PROBE: - ieee80211_direct_probe(sdata, ifsta); + ieee80211_direct_probe(sdata); break; case IEEE80211_STA_MLME_AUTHENTICATE: - ieee80211_authenticate(sdata, ifsta); + ieee80211_authenticate(sdata); break; case IEEE80211_STA_MLME_ASSOCIATE: - ieee80211_associate(sdata, ifsta); + ieee80211_associate(sdata); break; case IEEE80211_STA_MLME_ASSOCIATED: - ieee80211_associated(sdata, ifsta); - break; - case IEEE80211_STA_MLME_IBSS_SEARCH: - ieee80211_sta_find_ibss(sdata, ifsta); - break; - case IEEE80211_STA_MLME_IBSS_JOINED: - ieee80211_sta_merge_ibss(sdata, ifsta); + ieee80211_associated(sdata); break; default: WARN_ON(1); break; } - if (ieee80211_privacy_mismatch(sdata, ifsta)) { + if (ieee80211_privacy_mismatch(sdata)) { printk(KERN_DEBUG "%s: privacy configuration mismatch and " "mixed-cell disabled - disassociate\n", sdata->dev->name); - ieee80211_set_disassoc(sdata, ifsta, false, true, + ieee80211_set_disassoc(sdata, false, true, WLAN_REASON_UNSPECIFIED); } } @@ -2366,208 +1811,153 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_STATION) queue_work(sdata->local->hw.workqueue, - &sdata->u.sta.work); + &sdata->u.mgd.work); } /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; - ifsta = &sdata->u.sta; - INIT_WORK(&ifsta->work, ieee80211_sta_work); - setup_timer(&ifsta->timer, ieee80211_sta_timer, + ifmgd = &sdata->u.mgd; + INIT_WORK(&ifmgd->work, ieee80211_sta_work); + INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); + setup_timer(&ifmgd->timer, ieee80211_sta_timer, + (unsigned long) sdata); + setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, (unsigned long) sdata); - skb_queue_head_init(&ifsta->skb_queue); + skb_queue_head_init(&ifmgd->skb_queue); - ifsta->capab = WLAN_CAPABILITY_ESS; - ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN | + ifmgd->capab = WLAN_CAPABILITY_ESS; + ifmgd->auth_algs = IEEE80211_AUTH_ALG_OPEN | IEEE80211_AUTH_ALG_SHARED_KEY; - ifsta->flags |= IEEE80211_STA_CREATE_IBSS | + ifmgd->flags |= IEEE80211_STA_CREATE_IBSS | IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL; if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4) - ifsta->flags |= IEEE80211_STA_WMM_ENABLED; -} - -/* - * Add a new IBSS station, will also be called by the RX code when, - * in IBSS mode, receiving a frame from a yet-unknown station, hence - * must be callable in atomic context. - */ -struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, - u8 *bssid,u8 *addr, u64 supp_rates) -{ - struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - int band = local->hw.conf.channel->band; - - /* TODO: Could consider removing the least recently used entry and - * allow new one to be added. */ - if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { - if (net_ratelimit()) { - printk(KERN_DEBUG "%s: No room for a new IBSS STA " - "entry %pM\n", sdata->dev->name, addr); - } - return NULL; - } - - if (compare_ether_addr(bssid, sdata->u.sta.bssid)) - return NULL; - -#ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", - wiphy_name(local->hw.wiphy), addr, sdata->dev->name); -#endif - - sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); - if (!sta) - return NULL; - - set_sta_flags(sta, WLAN_STA_AUTHORIZED); - - /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(local, band); - - rate_control_rate_init(sta); - - if (sta_info_insert(sta)) - return NULL; - - return sta; + ifmgd->flags |= IEEE80211_STA_WMM_ENABLED; } /* configuration hooks */ -void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, - struct ieee80211_if_sta *ifsta) +void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - if (sdata->vif.type != NL80211_IFTYPE_STATION) + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - if ((ifsta->flags & (IEEE80211_STA_BSSID_SET | + if ((ifmgd->flags & (IEEE80211_STA_BSSID_SET | IEEE80211_STA_AUTO_BSSID_SEL)) && - (ifsta->flags & (IEEE80211_STA_SSID_SET | + (ifmgd->flags & (IEEE80211_STA_SSID_SET | IEEE80211_STA_AUTO_SSID_SEL))) { - if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) - ieee80211_set_disassoc(sdata, ifsta, true, true, + if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) + ieee80211_set_disassoc(sdata, true, true, WLAN_REASON_DEAUTH_LEAVING); - set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); } } +int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET; + + if (ifmgd->ssid_len) + ifmgd->flags |= IEEE80211_STA_SSID_SET; + else + ifmgd->flags &= ~IEEE80211_STA_SSID_SET; + + return 0; +} + int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) { - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; if (len > IEEE80211_MAX_SSID_LEN) return -EINVAL; - ifsta = &sdata->u.sta; - - if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) { - memset(ifsta->ssid, 0, sizeof(ifsta->ssid)); - memcpy(ifsta->ssid, ssid, len); - ifsta->ssid_len = len; - ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; - } - - if (len) - ifsta->flags |= IEEE80211_STA_SSID_SET; - else - ifsta->flags &= ~IEEE80211_STA_SSID_SET; + ifmgd = &sdata->u.mgd; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && - !(ifsta->flags & IEEE80211_STA_BSSID_SET)) { - ifsta->ibss_join_req = jiffies; - ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH; - return ieee80211_sta_find_ibss(sdata, ifsta); + if (ifmgd->ssid_len != len || memcmp(ifmgd->ssid, ssid, len) != 0) { + memset(ifmgd->ssid, 0, sizeof(ifmgd->ssid)); + memcpy(ifmgd->ssid, ssid, len); + ifmgd->ssid_len = len; } - return 0; + return ieee80211_sta_commit(sdata); } int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - memcpy(ssid, ifsta->ssid, ifsta->ssid_len); - *len = ifsta->ssid_len; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + memcpy(ssid, ifmgd->ssid, ifmgd->ssid_len); + *len = ifmgd->ssid_len; return 0; } int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) { - struct ieee80211_if_sta *ifsta; - int res; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - ifsta = &sdata->u.sta; + if (is_valid_ether_addr(bssid)) { + memcpy(ifmgd->bssid, bssid, ETH_ALEN); + ifmgd->flags |= IEEE80211_STA_BSSID_SET; + } else { + memset(ifmgd->bssid, 0, ETH_ALEN); + ifmgd->flags &= ~IEEE80211_STA_BSSID_SET; + } - if (memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0) { - memcpy(ifsta->bssid, bssid, ETH_ALEN); - res = 0; - /* - * Hack! See also ieee80211_sta_set_ssid. - */ - if (netif_running(sdata->dev)) - res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID); - if (res) { + if (netif_running(sdata->dev)) { + if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) { printk(KERN_DEBUG "%s: Failed to config new BSSID to " "the low-level driver\n", sdata->dev->name); - return res; } } - if (is_valid_ether_addr(bssid)) - ifsta->flags |= IEEE80211_STA_BSSID_SET; - else - ifsta->flags &= ~IEEE80211_STA_BSSID_SET; - - return 0; + return ieee80211_sta_commit(sdata); } int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - kfree(ifsta->extra_ie); + kfree(ifmgd->extra_ie); if (len == 0) { - ifsta->extra_ie = NULL; - ifsta->extra_ie_len = 0; + ifmgd->extra_ie = NULL; + ifmgd->extra_ie_len = 0; return 0; } - ifsta->extra_ie = kmalloc(len, GFP_KERNEL); - if (!ifsta->extra_ie) { - ifsta->extra_ie_len = 0; + ifmgd->extra_ie = kmalloc(len, GFP_KERNEL); + if (!ifmgd->extra_ie) { + ifmgd->extra_ie_len = 0; return -ENOMEM; } - memcpy(ifsta->extra_ie, ie, len); - ifsta->extra_ie_len = len; + memcpy(ifmgd->extra_ie, ie, len); + ifmgd->extra_ie_len = len; return 0; } int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n", sdata->dev->name, reason); - if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC) + if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; - ieee80211_set_disassoc(sdata, ifsta, true, true, reason); + ieee80211_set_disassoc(sdata, true, true, reason); return 0; } int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n", sdata->dev->name, reason); @@ -2575,10 +1965,10 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; - if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED)) - return -1; + if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED)) + return -ENOLINK; - ieee80211_set_disassoc(sdata, ifsta, false, true, reason); + ieee80211_set_disassoc(sdata, false, true, reason); return 0; } @@ -2586,15 +1976,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata = local->scan_sdata; - struct ieee80211_if_sta *ifsta; - - if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) { - ifsta = &sdata->u.sta; - if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) || - (!(ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED) && - !ieee80211_sta_active_ibss(sdata))) - ieee80211_sta_find_ibss(sdata, ifsta); - } /* Restart STA timers */ rcu_read_lock(); @@ -2623,12 +2004,15 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_enable_work); + struct ieee80211_sub_if_data *sdata = local->scan_sdata; if (local->hw.conf.flags & IEEE80211_CONF_PS) return; - local->hw.conf.flags |= IEEE80211_CONF_PS; + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + ieee80211_send_nullfunc(local, sdata, 1); + local->hw.conf.flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } @@ -2638,3 +2022,36 @@ void ieee80211_dynamic_ps_timer(unsigned long data) queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work); } + +void ieee80211_send_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int powersave) +{ + struct sk_buff *skb; + struct ieee80211_hdr *nullfunc; + __le16 fc; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) + return; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " + "frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24); + memset(nullfunc, 0, 24); + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_TODS); + if (powersave) + fc |= cpu_to_le16(IEEE80211_FCTL_PM); + nullfunc->frame_control = fc; + memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN); + memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN); + + ieee80211_tx_skb(sdata, skb, 0); +} diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c new file mode 100644 index 000000000000..44525f517077 --- /dev/null +++ b/net/mac80211/pm.c @@ -0,0 +1,117 @@ +#include <net/mac80211.h> +#include <net/rtnetlink.h> + +#include "ieee80211_i.h" +#include "led.h" + +int __ieee80211_suspend(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_init_conf conf; + struct sta_info *sta; + + flush_workqueue(local->hw.workqueue); + + /* disable keys */ + list_for_each_entry(sdata, &local->interfaces, list) + ieee80211_disable_keys(sdata); + + /* remove STAs */ + list_for_each_entry(sta, &local->sta_list, list) { + + if (local->ops->sta_notify) { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + local->ops->sta_notify(hw, &sdata->vif, + STA_NOTIFY_REMOVE, &sta->sta); + } + } + + /* remove all interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_MONITOR && + netif_running(sdata->dev)) { + conf.vif = &sdata->vif; + conf.type = sdata->vif.type; + conf.mac_addr = sdata->dev->dev_addr; + local->ops->remove_interface(hw, &conf); + } + } + + /* flush again, in case driver queued work */ + flush_workqueue(local->hw.workqueue); + + /* stop hardware */ + if (local->open_count) { + ieee80211_led_radio(local, false); + local->ops->stop(hw); + } + return 0; +} + +int __ieee80211_resume(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_init_conf conf; + struct sta_info *sta; + int res; + + /* restart hardware */ + if (local->open_count) { + res = local->ops->start(hw); + + ieee80211_led_radio(local, hw->conf.radio_enabled); + } + + /* add interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_MONITOR && + netif_running(sdata->dev)) { + conf.vif = &sdata->vif; + conf.type = sdata->vif.type; + conf.mac_addr = sdata->dev->dev_addr; + res = local->ops->add_interface(hw, &conf); + } + } + + /* add STAs back */ + list_for_each_entry(sta, &local->sta_list, list) { + + if (local->ops->sta_notify) { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + local->ops->sta_notify(hw, &sdata->vif, + STA_NOTIFY_ADD, &sta->sta); + } + } + + /* add back keys */ + list_for_each_entry(sdata, &local->interfaces, list) + if (netif_running(sdata->dev)) + ieee80211_enable_keys(sdata); + + /* setup RTS threshold */ + if (local->ops->set_rts_threshold) + local->ops->set_rts_threshold(hw, local->rts_threshold); + + /* reconfigure hardware */ + ieee80211_hw_config(local, ~0); + + netif_addr_lock_bh(local->mdev); + ieee80211_configure_filter(local); + netif_addr_unlock_bh(local->mdev); + + return 0; +} diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 928da625e281..b9164c9a9563 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -62,6 +62,18 @@ static inline void rate_control_rate_init(struct sta_info *sta) ref->ops->rate_init(ref->priv, sband, ista, priv_sta); } +static inline void rate_control_rate_update(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, u32 changed) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + + if (ref->ops->rate_update) + ref->ops->rate_update(ref->priv, sband, ista, + priv_sta, changed); +} static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct ieee80211_sta *sta, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7175ae80c36a..66f7ecf51b92 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -86,8 +86,7 @@ ieee80211_rx_radiotap_len(struct ieee80211_local *local, if (status->flag & RX_FLAG_TSFT) len += 8; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DB || - local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) len += 1; if (local->hw.flags & IEEE80211_HW_NOISE_DBM) len += 1; @@ -102,7 +101,7 @@ ieee80211_rx_radiotap_len(struct ieee80211_local *local, return len; } -/** +/* * ieee80211_add_rx_radiotap_header - add radiotap header * * add a radiotap header containing all the fields which the hardware provided. @@ -158,7 +157,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, */ *pos = 0; } else { - rthdr->it_present |= (1 << IEEE80211_RADIOTAP_RATE); + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); *pos = rate->bitrate / 5; } pos++; @@ -199,14 +198,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos = status->antenna; pos++; - /* IEEE80211_RADIOTAP_DB_ANTSIGNAL */ - if (local->hw.flags & IEEE80211_HW_SIGNAL_DB) { - *pos = status->signal; - rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_DB_ANTSIGNAL); - pos++; - } - /* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */ /* IEEE80211_RADIOTAP_RX_FLAGS */ @@ -371,39 +362,50 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) rx->skb->priority = (tid > 7) ? 0 : tid; } -static void ieee80211_verify_ip_alignment(struct ieee80211_rx_data *rx) +/** + * DOC: Packet alignment + * + * Drivers always need to pass packets that are aligned to two-byte boundaries + * to the stack. + * + * Additionally, should, if possible, align the payload data in a way that + * guarantees that the contained IP header is aligned to a four-byte + * boundary. In the case of regular frames, this simply means aligning the + * payload to a four-byte boundary (because either the IP header is directly + * contained, or IV/RFC1042 headers that have a length divisible by four are + * in front of it). + * + * With A-MSDU frames, however, the payload data address must yield two modulo + * four because there are 14-byte 802.3 headers within the A-MSDU frames that + * push the IP header further back to a multiple of four again. Thankfully, the + * specs were sane enough this time around to require padding each A-MSDU + * subframe to a length that is a multiple of four. + * + * Padding like Atheros hardware adds which is inbetween the 802.11 header and + * the payload is not supported, the driver is required to move the 802.11 + * header to be directly in front of the payload in that case. + */ +static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx) { -#ifdef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; int hdrlen; +#ifndef CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT + return; +#endif + + if (WARN_ONCE((unsigned long)rx->skb->data & 1, + "unaligned packet at 0x%p\n", rx->skb->data)) + return; + if (!ieee80211_is_data_present(hdr->frame_control)) return; - /* - * Drivers are required to align the payload data in a way that - * guarantees that the contained IP header is aligned to a four- - * byte boundary. In the case of regular frames, this simply means - * aligning the payload to a four-byte boundary (because either - * the IP header is directly contained, or IV/RFC1042 headers that - * have a length divisible by four are in front of it. - * - * With A-MSDU frames, however, the payload data address must - * yield two modulo four because there are 14-byte 802.3 headers - * within the A-MSDU frames that push the IP header further back - * to a multiple of four again. Thankfully, the specs were sane - * enough this time around to require padding each A-MSDU subframe - * to a length that is a multiple of four. - * - * Padding like atheros hardware adds which is inbetween the 802.11 - * header and the payload is not supported, the driver is required - * to move the 802.11 header further back in that case. - */ hdrlen = ieee80211_hdrlen(hdr->frame_control); if (rx->flags & IEEE80211_RX_AMSDU) hdrlen += ETH_HLEN; - WARN_ON_ONCE(((unsigned long)(rx->skb->data + hdrlen)) & 3); -#endif + WARN_ONCE(((unsigned long)(rx->skb->data + hdrlen)) & 3, + "unaligned IP payload at 0x%p\n", rx->skb->data + hdrlen); } @@ -435,6 +437,52 @@ ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) return RX_CONTINUE; } + +static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || !is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +/* Get the BIP key index from MMIE; return -1 if this is not a BIP frame */ +static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) +{ + struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data; + struct ieee80211_mmie *mmie; + + if (skb->len < 24 + sizeof(*mmie) || + !is_multicast_ether_addr(hdr->da)) + return -1; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) hdr)) + return -1; /* not a robust management frame */ + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return -1; + + return le16_to_cpu(mmie->key_id); +} + + static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) { @@ -550,21 +598,23 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) int hdrlen; ieee80211_rx_result result = RX_DROP_UNUSABLE; struct ieee80211_key *stakey = NULL; + int mmie_keyidx = -1; /* * Key selection 101 * - * There are three types of keys: + * There are four types of keys: * - GTK (group keys) + * - IGTK (group keys for management frames) * - PTK (pairwise keys) * - STK (station-to-station pairwise keys) * * When selecting a key, we have to distinguish between multicast * (including broadcast) and unicast frames, the latter can only - * use PTKs and STKs while the former always use GTKs. Unless, of - * course, actual WEP keys ("pre-RSNA") are used, then unicast - * frames can also use key indizes like GTKs. Hence, if we don't - * have a PTK/STK we check the key index for a WEP key. + * use PTKs and STKs while the former always use GTKs and IGTKs. + * Unless, of course, actual WEP keys ("pre-RSNA") are used, then + * unicast frames can also use key indices like GTKs. Hence, if we + * don't have a PTK/STK we check the key index for a WEP key. * * Note that in a regular BSS, multicast frames are sent by the * AP only, associated stations unicast the frame to the AP first @@ -577,8 +627,14 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * possible. */ - if (!ieee80211_has_protected(hdr->frame_control)) - return RX_CONTINUE; + if (!ieee80211_has_protected(hdr->frame_control)) { + if (!ieee80211_is_mgmt(hdr->frame_control) || + rx->sta == NULL || !test_sta_flags(rx->sta, WLAN_STA_MFP)) + return RX_CONTINUE; + mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); + if (mmie_keyidx < 0) + return RX_CONTINUE; + } /* * No point in finding a key and decrypting if the frame is neither @@ -592,6 +648,16 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (!is_multicast_ether_addr(hdr->addr1) && stakey) { rx->key = stakey; + } else if (mmie_keyidx >= 0) { + /* Broadcast/multicast robust management frame / BIP */ + if ((rx->status->flag & RX_FLAG_DECRYPTED) && + (rx->status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + if (mmie_keyidx < NUM_DEFAULT_KEYS || + mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); } else { /* * The device doesn't give us the IV so we won't be @@ -654,6 +720,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) case ALG_CCMP: result = ieee80211_crypto_ccmp_decrypt(rx); break; + case ALG_AES_CMAC: + result = ieee80211_crypto_aes_cmac_decrypt(rx); + break; } /* either the frame has been decrypted or will be dropped */ @@ -662,6 +731,39 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) return result; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local; + struct ieee80211_hdr *hdr; + struct sk_buff *skb; + + local = rx->local; + skb = rx->skb; + hdr = (struct ieee80211_hdr *) skb->data; + + if (!local->pspolling) + return RX_CONTINUE; + + if (!ieee80211_has_fromds(hdr->frame_control)) + /* this is not from AP */ + return RX_CONTINUE; + + if (!ieee80211_is_data(hdr->frame_control)) + return RX_CONTINUE; + + if (!ieee80211_has_moredata(hdr->frame_control)) { + /* AP has no more frames buffered for us */ + local->pspolling = false; + return RX_CONTINUE; + } + + /* more data bit is set, let's request a new frame from the AP */ + ieee80211_send_pspoll(local, rx->sdata); + + return RX_CONTINUE; +} + static void ap_sta_ps_start(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; @@ -736,7 +838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) { u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, NL80211_IFTYPE_ADHOC); - if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0) + if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0) sta->last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1) || @@ -1101,6 +1203,15 @@ ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && !ieee80211_is_nullfunc(fc) && + (!ieee80211_is_mgmt(fc) || + (ieee80211_is_unicast_robust_mgmt_frame(rx->skb) && + rx->sta && test_sta_flags(rx->sta, WLAN_STA_MFP))) && + (rx->key || rx->sdata->drop_unencrypted))) + return -EACCES; + /* BIP does not use Protected field, so need to check MMIE */ + if (unlikely(rx->sta && test_sta_flags(rx->sta, WLAN_STA_MFP) && + ieee80211_is_multicast_robust_mgmt_frame(rx->skb) && + ieee80211_get_mmie_keyidx(rx->skb) < 0 && (rx->key || rx->sdata->drop_unencrypted))) return -EACCES; @@ -1138,12 +1249,12 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx) switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { - case __constant_cpu_to_le16(IEEE80211_FCTL_TODS): + case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) return -1; break; - case __constant_cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): + case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(sdata->vif.type != NL80211_IFTYPE_WDS && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)) return -1; @@ -1157,13 +1268,13 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx) } } break; - case __constant_cpu_to_le16(IEEE80211_FCTL_FROMDS): + case cpu_to_le16(IEEE80211_FCTL_FROMDS): if (sdata->vif.type != NL80211_IFTYPE_STATION || (is_multicast_ether_addr(dst) && !compare_ether_addr(src, dev->dev_addr))) return -1; break; - case __constant_cpu_to_le16(0): + case cpu_to_le16(0): if (sdata->vif.type != NL80211_IFTYPE_ADHOC) return -1; break; @@ -1267,10 +1378,37 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) } if (skb) { - /* deliver to local stack */ - skb->protocol = eth_type_trans(skb, dev); - memset(skb->cb, 0, sizeof(skb->cb)); - netif_rx(skb); + int align __maybe_unused; + +#if defined(CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT) || !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + /* + * 'align' will only take the values 0 or 2 here + * since all frames are required to be aligned + * to 2-byte boundaries when being passed to + * mac80211. That also explains the __skb_push() + * below. + */ + align = (unsigned long)skb->data & 4; + if (align) { + if (WARN_ON(skb_headroom(skb) < 3)) { + dev_kfree_skb(skb); + skb = NULL; + } else { + u8 *data = skb->data; + size_t len = skb->len; + u8 *new = __skb_push(skb, align); + memmove(new, data, len); + __skb_trim(skb, len); + } + } +#endif + + if (skb) { + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_rx(skb); + } } if (xmit_skb) { @@ -1339,14 +1477,20 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) if (remaining <= subframe_len + padding) frame = skb; else { - frame = dev_alloc_skb(local->hw.extra_tx_headroom + - subframe_len); + /* + * Allocate and reserve two bytes more for payload + * alignment since sizeof(struct ethhdr) is 14. + */ + frame = dev_alloc_skb( + ALIGN(local->hw.extra_tx_headroom, 4) + + subframe_len + 2); if (frame == NULL) return RX_DROP_UNUSABLE; - skb_reserve(frame, local->hw.extra_tx_headroom + - sizeof(struct ethhdr)); + skb_reserve(frame, + ALIGN(local->hw.extra_tx_headroom, 4) + + sizeof(struct ethhdr) + 2); memcpy(skb_put(frame, ntohs(len)), skb->data, ntohs(len)); @@ -1529,11 +1673,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) start_seq_num = le16_to_cpu(bar->start_seq_num) >> 4; /* reset session timer */ - if (tid_agg_rx->timeout) { - unsigned long expires = - jiffies + (tid_agg_rx->timeout / 1000) * HZ; - mod_timer(&tid_agg_rx->session_timer, expires); - } + if (tid_agg_rx->timeout) + mod_timer(&tid_agg_rx->session_timer, + TU_TO_EXP_TIME(tid_agg_rx->timeout)); /* manage reordering buffer according to requested */ /* sequence number */ @@ -1547,12 +1689,64 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + + if (compare_ether_addr(mgmt->da, sdata->dev->dev_addr) != 0) { + /* Not to own unicast address */ + return; + } + + if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 || + compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) { + /* Not from the current AP. */ + return; + } + + if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATE) { + /* Association in progress; ignore SA Query */ + return; + } + + if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) { + /* Too short SA Query request frame */ + return; + } + + skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom); + if (skb == NULL) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + resp = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(resp, 0, 24); + memcpy(resp->da, mgmt->sa, ETH_ALEN); + memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN); + resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); + resp->u.action.category = WLAN_CATEGORY_SA_QUERY; + resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE; + memcpy(resp->u.action.u.sa_query.trans_id, + mgmt->u.action.u.sa_query.trans_id, + WLAN_SA_QUERY_TR_ID_LEN); + + ieee80211_tx_skb(sdata, skb, 1); +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct ieee80211_bss *bss; int len = rx->skb->len; if (!ieee80211_is_action(mgmt->frame_control)) @@ -1564,12 +1758,26 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (!(rx->flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_MONITOR; + if (ieee80211_drop_unencrypted(rx, mgmt->frame_control)) + return RX_DROP_MONITOR; + /* all categories we currently handle have action_code */ if (len < IEEE80211_MIN_ACTION_SIZE + 1) return RX_DROP_MONITOR; switch (mgmt->u.action.category) { case WLAN_CATEGORY_BACK: + /* + * The aggregation code is not prepared to handle + * anything but STA/AP due to the BSSID handling; + * IBSS could work in the code but isn't supported + * by drivers or the standard. + */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP) + return RX_DROP_MONITOR; + switch (mgmt->u.action.u.addba_req.action_code) { case WLAN_ACTION_ADDBA_REQ: if (len < (IEEE80211_MIN_ACTION_SIZE + @@ -1594,6 +1802,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) case WLAN_CATEGORY_SPECTRUM_MGMT: if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ) return RX_DROP_MONITOR; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + switch (mgmt->u.action.u.measurement.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: if (len < (IEEE80211_MIN_ACTION_SIZE + @@ -1601,6 +1813,43 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; ieee80211_process_measurement_req(sdata, mgmt, len); break; + case WLAN_ACTION_SPCT_CHL_SWITCH: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.chan_switch))) + return RX_DROP_MONITOR; + + if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN)) + return RX_DROP_MONITOR; + + bss = ieee80211_rx_bss_get(local, sdata->u.mgd.bssid, + local->hw.conf.channel->center_freq, + sdata->u.mgd.ssid, + sdata->u.mgd.ssid_len); + if (!bss) + return RX_DROP_MONITOR; + + ieee80211_process_chanswitch(sdata, + &mgmt->u.action.u.chan_switch.sw_elem, bss); + ieee80211_rx_bss_put(local, bss); + break; + } + break; + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + return RX_DROP_MONITOR; + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + ieee80211_process_sa_query_req(sdata, mgmt, len); + break; + case WLAN_ACTION_SA_QUERY_RESPONSE: + /* + * SA Query response is currently only used in AP mode + * and it is processed in user space. + */ + return RX_CONTINUE; } break; default: @@ -1616,10 +1865,14 @@ static ieee80211_rx_result debug_noinline ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; if (!(rx->flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_MONITOR; + if (ieee80211_drop_unencrypted(rx, mgmt->frame_control)) + return RX_DROP_MONITOR; + if (ieee80211_vif_is_mesh(&sdata->vif)) return ieee80211_mesh_rx_mgmt(sdata, rx->skb, rx->status); @@ -1627,11 +1880,14 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC) return RX_DROP_MONITOR; - if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) - return RX_DROP_MONITOR; - ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status); - return RX_QUEUED; + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) + return RX_DROP_MONITOR; + return ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status); + } + + return ieee80211_ibss_rx_mgmt(sdata, rx->skb, rx->status); } static void ieee80211_rx_michael_mic_report(struct net_device *dev, @@ -1780,6 +2036,7 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, CALL_RXH(ieee80211_rx_h_passive_scan) CALL_RXH(ieee80211_rx_h_check) CALL_RXH(ieee80211_rx_h_decrypt) + CALL_RXH(ieee80211_rx_h_check_more_data) CALL_RXH(ieee80211_rx_h_sta_process) CALL_RXH(ieee80211_rx_h_defragment) CALL_RXH(ieee80211_rx_h_ps_poll) @@ -1823,16 +2080,17 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, /* main receive path */ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, - u8 *bssid, struct ieee80211_rx_data *rx, + struct ieee80211_rx_data *rx, struct ieee80211_hdr *hdr) { + u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, sdata->vif.type); int multicast = is_multicast_ether_addr(hdr->addr1); switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!bssid) return 0; - if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { + if (!ieee80211_bssid_match(bssid, sdata->u.mgd.bssid)) { if (!(rx->flags & IEEE80211_RX_IN_SCAN)) return 0; rx->flags &= ~IEEE80211_RX_RA_MATCH; @@ -1850,7 +2108,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, if (ieee80211_is_beacon(hdr->frame_control)) { return 1; } - else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { + else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { if (!(rx->flags & IEEE80211_RX_IN_SCAN)) return 0; rx->flags &= ~IEEE80211_RX_RA_MATCH; @@ -1928,7 +2186,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, int prepares; struct ieee80211_sub_if_data *prev = NULL; struct sk_buff *skb_new; - u8 *bssid; hdr = (struct ieee80211_hdr *)skb->data; memset(&rx, 0, sizeof(rx)); @@ -1956,7 +2213,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, rx.flags |= IEEE80211_RX_IN_SCAN; ieee80211_parse_qos(&rx); - ieee80211_verify_ip_alignment(&rx); + ieee80211_verify_alignment(&rx); skb = rx.skb; @@ -1967,9 +2224,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (sdata->vif.type == NL80211_IFTYPE_MONITOR) continue; - bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(sdata, bssid, &rx, hdr); + prepares = prepare_for_handlers(sdata, &rx, hdr); if (!prepares) continue; @@ -2174,11 +2430,9 @@ static u8 ieee80211_rx_reorder_ampdu(struct ieee80211_local *local, /* new un-ordered ampdu frame - process it */ /* reset session timer */ - if (tid_agg_rx->timeout) { - unsigned long expires = - jiffies + (tid_agg_rx->timeout / 1000) * HZ; - mod_timer(&tid_agg_rx->session_timer, expires); - } + if (tid_agg_rx->timeout) + mod_timer(&tid_agg_rx->session_timer, + TU_TO_EXP_TIME(tid_agg_rx->timeout)); /* if this mpdu is fragmented - terminate rx aggregation session */ sc = le16_to_cpu(hdr->seq_ctrl); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f5c7c3371929..5030a3c87509 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -12,14 +12,11 @@ * published by the Free Software Foundation. */ -/* TODO: - * order BSS list by RSSI(?) ("quality of AP") - * scan result table filtering (by capability (privacy, IBSS/BSS, WPA/RSN IE, - * SSID) - */ +/* TODO: figure out how to avoid that the "current BSS" expires */ #include <linux/wireless.h> #include <linux/if_arp.h> +#include <linux/rtnetlink.h> #include <net/mac80211.h> #include <net/iw_handler.h> @@ -30,192 +27,29 @@ #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 5) -void ieee80211_rx_bss_list_init(struct ieee80211_local *local) -{ - spin_lock_init(&local->bss_lock); - INIT_LIST_HEAD(&local->bss_list); -} - -void ieee80211_rx_bss_list_deinit(struct ieee80211_local *local) -{ - struct ieee80211_bss *bss, *tmp; - - list_for_each_entry_safe(bss, tmp, &local->bss_list, list) - ieee80211_rx_bss_put(local, bss); -} - struct ieee80211_bss * ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq, u8 *ssid, u8 ssid_len) { - struct ieee80211_bss *bss; - - spin_lock_bh(&local->bss_lock); - bss = local->bss_hash[STA_HASH(bssid)]; - while (bss) { - if (!bss_mesh_cfg(bss) && - !memcmp(bss->bssid, bssid, ETH_ALEN) && - bss->freq == freq && - bss->ssid_len == ssid_len && - (ssid_len == 0 || !memcmp(bss->ssid, ssid, ssid_len))) { - atomic_inc(&bss->users); - break; - } - bss = bss->hnext; - } - spin_unlock_bh(&local->bss_lock); - return bss; -} - -/* Caller must hold local->bss_lock */ -static void __ieee80211_rx_bss_hash_add(struct ieee80211_local *local, - struct ieee80211_bss *bss) -{ - u8 hash_idx; - - if (bss_mesh_cfg(bss)) - hash_idx = mesh_id_hash(bss_mesh_id(bss), - bss_mesh_id_len(bss)); - else - hash_idx = STA_HASH(bss->bssid); - - bss->hnext = local->bss_hash[hash_idx]; - local->bss_hash[hash_idx] = bss; -} - -/* Caller must hold local->bss_lock */ -static void __ieee80211_rx_bss_hash_del(struct ieee80211_local *local, - struct ieee80211_bss *bss) -{ - struct ieee80211_bss *b, *prev = NULL; - b = local->bss_hash[STA_HASH(bss->bssid)]; - while (b) { - if (b == bss) { - if (!prev) - local->bss_hash[STA_HASH(bss->bssid)] = - bss->hnext; - else - prev->hnext = bss->hnext; - break; - } - prev = b; - b = b->hnext; - } -} - -struct ieee80211_bss * -ieee80211_rx_bss_add(struct ieee80211_local *local, u8 *bssid, int freq, - u8 *ssid, u8 ssid_len) -{ - struct ieee80211_bss *bss; - - bss = kzalloc(sizeof(*bss), GFP_ATOMIC); - if (!bss) - return NULL; - atomic_set(&bss->users, 2); - memcpy(bss->bssid, bssid, ETH_ALEN); - bss->freq = freq; - if (ssid && ssid_len <= IEEE80211_MAX_SSID_LEN) { - memcpy(bss->ssid, ssid, ssid_len); - bss->ssid_len = ssid_len; - } - - spin_lock_bh(&local->bss_lock); - /* TODO: order by RSSI? */ - list_add_tail(&bss->list, &local->bss_list); - __ieee80211_rx_bss_hash_add(local, bss); - spin_unlock_bh(&local->bss_lock); - return bss; + return (void *)cfg80211_get_bss(local->hw.wiphy, + ieee80211_get_channel(local->hw.wiphy, + freq), + bssid, ssid, ssid_len, + 0, 0); } -#ifdef CONFIG_MAC80211_MESH -static struct ieee80211_bss * -ieee80211_rx_mesh_bss_get(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len, - u8 *mesh_cfg, int freq) +static void ieee80211_rx_bss_free(struct cfg80211_bss *cbss) { - struct ieee80211_bss *bss; + struct ieee80211_bss *bss = (void *)cbss; - spin_lock_bh(&local->bss_lock); - bss = local->bss_hash[mesh_id_hash(mesh_id, mesh_id_len)]; - while (bss) { - if (bss_mesh_cfg(bss) && - !memcmp(bss_mesh_cfg(bss), mesh_cfg, MESH_CFG_CMP_LEN) && - bss->freq == freq && - mesh_id_len == bss->mesh_id_len && - (mesh_id_len == 0 || !memcmp(bss->mesh_id, mesh_id, - mesh_id_len))) { - atomic_inc(&bss->users); - break; - } - bss = bss->hnext; - } - spin_unlock_bh(&local->bss_lock); - return bss; -} - -static struct ieee80211_bss * -ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_id_len, - u8 *mesh_cfg, int mesh_config_len, int freq) -{ - struct ieee80211_bss *bss; - - if (mesh_config_len != IEEE80211_MESH_CONFIG_LEN) - return NULL; - - bss = kzalloc(sizeof(*bss), GFP_ATOMIC); - if (!bss) - return NULL; - - bss->mesh_cfg = kmalloc(MESH_CFG_CMP_LEN, GFP_ATOMIC); - if (!bss->mesh_cfg) { - kfree(bss); - return NULL; - } - - if (mesh_id_len && mesh_id_len <= IEEE80211_MAX_MESH_ID_LEN) { - bss->mesh_id = kmalloc(mesh_id_len, GFP_ATOMIC); - if (!bss->mesh_id) { - kfree(bss->mesh_cfg); - kfree(bss); - return NULL; - } - memcpy(bss->mesh_id, mesh_id, mesh_id_len); - } - - atomic_set(&bss->users, 2); - memcpy(bss->mesh_cfg, mesh_cfg, MESH_CFG_CMP_LEN); - bss->mesh_id_len = mesh_id_len; - bss->freq = freq; - spin_lock_bh(&local->bss_lock); - /* TODO: order by RSSI? */ - list_add_tail(&bss->list, &local->bss_list); - __ieee80211_rx_bss_hash_add(local, bss); - spin_unlock_bh(&local->bss_lock); - return bss; -} -#endif - -static void ieee80211_rx_bss_free(struct ieee80211_bss *bss) -{ - kfree(bss->ies); kfree(bss_mesh_id(bss)); kfree(bss_mesh_cfg(bss)); - kfree(bss); } void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { - local_bh_disable(); - if (!atomic_dec_and_lock(&bss->users, &local->bss_lock)) { - local_bh_enable(); - return; - } - - __ieee80211_rx_bss_hash_del(local, bss); - list_del(&bss->list); - spin_unlock_bh(&local->bss_lock); - ieee80211_rx_bss_free(bss); + cfg80211_put_bss((struct cfg80211_bss *)bss); } struct ieee80211_bss * @@ -224,39 +58,25 @@ ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len, struct ieee802_11_elems *elems, - int freq, bool beacon) + struct ieee80211_channel *channel, + bool beacon) { struct ieee80211_bss *bss; int clen; + s32 signal = 0; -#ifdef CONFIG_MAC80211_MESH - if (elems->mesh_config) - bss = ieee80211_rx_mesh_bss_get(local, elems->mesh_id, - elems->mesh_id_len, elems->mesh_config, freq); - else -#endif - bss = ieee80211_rx_bss_get(local, mgmt->bssid, freq, - elems->ssid, elems->ssid_len); - if (!bss) { -#ifdef CONFIG_MAC80211_MESH - if (elems->mesh_config) - bss = ieee80211_rx_mesh_bss_add(local, elems->mesh_id, - elems->mesh_id_len, elems->mesh_config, - elems->mesh_config_len, freq); - else -#endif - bss = ieee80211_rx_bss_add(local, mgmt->bssid, freq, - elems->ssid, elems->ssid_len); - if (!bss) - return NULL; - } else { -#if 0 - /* TODO: order by RSSI? */ - spin_lock_bh(&local->bss_lock); - list_move_tail(&bss->list, &local->bss_list); - spin_unlock_bh(&local->bss_lock); -#endif - } + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + signal = rx_status->signal * 100; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + signal = (rx_status->signal * 100) / local->hw.max_signal; + + bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel, + mgmt, len, signal, GFP_ATOMIC); + + if (!bss) + return NULL; + + bss->cbss.free_priv = ieee80211_rx_bss_free; /* save the ERP value so that it is available at association time */ if (elems->erp_info && elems->erp_info_len >= 1) { @@ -264,9 +84,6 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss->has_erp_value = 1; } - bss->beacon_int = le16_to_cpu(mgmt->u.beacon.beacon_int); - bss->capability = le16_to_cpu(mgmt->u.beacon.capab_info); - if (elems->tim) { struct ieee80211_tim_ie *tim_ie = (struct ieee80211_tim_ie *)elems->tim; @@ -295,37 +112,27 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss->supp_rates_len += clen; } - bss->band = rx_status->band; - - bss->timestamp = le64_to_cpu(mgmt->u.beacon.timestamp); - bss->last_update = jiffies; - bss->signal = rx_status->signal; - bss->noise = rx_status->noise; - bss->qual = rx_status->qual; bss->wmm_used = elems->wmm_param || elems->wmm_info; if (!beacon) bss->last_probe_resp = jiffies; - /* - * For probe responses, or if we don't have any information yet, - * use the IEs from the beacon. - */ - if (!bss->ies || !beacon) { - if (bss->ies == NULL || bss->ies_len < elems->total_len) { - kfree(bss->ies); - bss->ies = kmalloc(elems->total_len, GFP_ATOMIC); - } - if (bss->ies) { - memcpy(bss->ies, elems->ie_start, elems->total_len); - bss->ies_len = elems->total_len; - } else - bss->ies_len = 0; - } - return bss; } +void ieee80211_rx_bss_remove(struct ieee80211_sub_if_data *sdata, u8 *bssid, + int freq, u8 *ssid, u8 ssid_len) +{ + struct ieee80211_bss *bss; + struct ieee80211_local *local = sdata->local; + + bss = ieee80211_rx_bss_get(local, bssid, freq, ssid, ssid_len); + if (bss) { + cfg80211_unlink_bss(local->hw.wiphy, (void *)bss); + ieee80211_rx_bss_put(local, bss); + } +} + ieee80211_rx_result ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_rx_status *rx_status) @@ -387,7 +194,7 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bss = ieee80211_bss_info_update(sdata->local, rx_status, mgmt, skb->len, &elems, - freq, beacon); + channel, beacon); if (bss) ieee80211_rx_bss_put(sdata->local, bss); @@ -395,56 +202,34 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, return RX_QUEUED; } -static void ieee80211_send_nullfunc(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - int powersave) +void ieee80211_scan_failed(struct ieee80211_local *local) { - struct sk_buff *skb; - struct ieee80211_hdr *nullfunc; - __le16 fc; - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24); - if (!skb) { - printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " - "frame\n", sdata->dev->name); + if (WARN_ON(!local->scan_req)) return; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24); - memset(nullfunc, 0, 24); - fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | - IEEE80211_FCTL_TODS); - if (powersave) - fc |= cpu_to_le16(IEEE80211_FCTL_PM); - nullfunc->frame_control = fc; - memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN); - memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN); - memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN); - - ieee80211_tx_skb(sdata, skb, 0); + + /* notify cfg80211 about the failed scan */ + if (local->scan_req != &local->int_scan_req) + cfg80211_scan_done(local->scan_req, true); + + local->scan_req = NULL; } -void ieee80211_scan_completed(struct ieee80211_hw *hw) +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; - union iwreq_data wrqu; if (WARN_ON(!local->hw_scanning && !local->sw_scanning)) return; - local->last_scan_completed = jiffies; - memset(&wrqu, 0, sizeof(wrqu)); + if (WARN_ON(!local->scan_req)) + return; - /* - * local->scan_sdata could have been NULLed by the interface - * down code in case we were scanning on an interface that is - * being taken down. - */ - sdata = local->scan_sdata; - if (sdata) - wireless_send_event(sdata->dev, SIOCGIWSCAN, &wrqu, NULL); + if (local->scan_req != &local->int_scan_req) + cfg80211_scan_done(local->scan_req, aborted); + local->scan_req = NULL; + + local->last_scan_completed = jiffies; if (local->hw_scanning) { local->hw_scanning = false; @@ -472,34 +257,46 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw) netif_addr_unlock(local->mdev); netif_tx_unlock_bh(local->mdev); - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (local->ops->sw_scan_complete) + local->ops->sw_scan_complete(local_to_hw(local)); + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!netif_running(sdata->dev)) + continue; + /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION) { - if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) { ieee80211_send_nullfunc(local, sdata, 0); netif_tx_wake_all_queues(sdata->dev); } } else netif_tx_wake_all_queues(sdata->dev); + + /* re-enable beaconing */ + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_ADHOC || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + ieee80211_if_config(sdata, + IEEE80211_IFCC_BEACON_ENABLED); } - rcu_read_unlock(); + mutex_unlock(&local->iflist_mtx); done: ieee80211_mlme_notify_scan_completed(local); + ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); } EXPORT_SYMBOL(ieee80211_scan_completed); - void ieee80211_scan_work(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata = local->scan_sdata; - struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; - int skip; + int skip, i; unsigned long next_delay = 0; /* @@ -510,33 +307,13 @@ void ieee80211_scan_work(struct work_struct *work) switch (local->scan_state) { case SCAN_SET_CHANNEL: - /* - * Get current scan band. scan_band may be IEEE80211_NUM_BANDS - * after we successfully scanned the last channel of the last - * band (and the last band is supported by the hw) - */ - if (local->scan_band < IEEE80211_NUM_BANDS) - sband = local->hw.wiphy->bands[local->scan_band]; - else - sband = NULL; - - /* - * If we are at an unsupported band and have more bands - * left to scan, advance to the next supported one. - */ - while (!sband && local->scan_band < IEEE80211_NUM_BANDS - 1) { - local->scan_band++; - sband = local->hw.wiphy->bands[local->scan_band]; - local->scan_channel_idx = 0; - } - /* if no more bands/channels left, complete scan */ - if (!sband || local->scan_channel_idx >= sband->n_channels) { - ieee80211_scan_completed(local_to_hw(local)); + if (local->scan_channel_idx >= local->scan_req->n_channels) { + ieee80211_scan_completed(local_to_hw(local), false); return; } skip = 0; - chan = &sband->channels[local->scan_channel_idx]; + chan = local->scan_req->channels[local->scan_channel_idx]; if (chan->flags & IEEE80211_CHAN_DISABLED || (sdata->vif.type == NL80211_IFTYPE_ADHOC && @@ -552,15 +329,6 @@ void ieee80211_scan_work(struct work_struct *work) /* advance state machine to next channel/band */ local->scan_channel_idx++; - if (local->scan_channel_idx >= sband->n_channels) { - /* - * scan_band may end up == IEEE80211_NUM_BANDS, but - * we'll catch that case above and complete the scan - * if that is the case. - */ - local->scan_band++; - local->scan_channel_idx = 0; - } if (skip) break; @@ -573,10 +341,15 @@ void ieee80211_scan_work(struct work_struct *work) next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->scan_state = SCAN_SET_CHANNEL; - if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN) + if (local->scan_channel->flags & IEEE80211_CHAN_PASSIVE_SCAN || + !local->scan_req->n_ssids) break; - ieee80211_send_probe_req(sdata, NULL, local->scan_ssid, - local->scan_ssid_len); + for (i = 0; i < local->scan_req->n_ssids; i++) + ieee80211_send_probe_req( + sdata, NULL, + local->scan_req->ssids[i].ssid, + local->scan_req->ssids[i].ssid_len, + local->scan_req->ie, local->scan_req->ie_len); next_delay = IEEE80211_CHANNEL_TIME; break; } @@ -587,14 +360,19 @@ void ieee80211_scan_work(struct work_struct *work) int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, - u8 *ssid, size_t ssid_len) + struct cfg80211_scan_request *req) { struct ieee80211_local *local = scan_sdata->local; struct ieee80211_sub_if_data *sdata; - if (ssid_len > IEEE80211_MAX_SSID_LEN) + if (!req) return -EINVAL; + if (local->scan_req && local->scan_req != req) + return -EBUSY; + + local->scan_req = req; + /* MLME-SCAN.request (page 118) page 144 (11.1.3.1) * BSSType: INFRASTRUCTURE, INDEPENDENT, ANY_BSS * BSSID: MACAddress @@ -622,7 +400,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, int rc; local->hw_scanning = true; - rc = local->ops->hw_scan(local_to_hw(local), ssid, ssid_len); + rc = local->ops->hw_scan(local_to_hw(local), req); if (rc) { local->hw_scanning = false; return rc; @@ -632,28 +410,35 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, } local->sw_scanning = true; + if (local->ops->sw_scan_start) + local->ops->sw_scan_start(local_to_hw(local)); + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!netif_running(sdata->dev)) + continue; + + /* disable beaconing */ + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_ADHOC || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + ieee80211_if_config(sdata, + IEEE80211_IFCC_BEACON_ENABLED); - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { - if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { + if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) { netif_tx_stop_all_queues(sdata->dev); ieee80211_send_nullfunc(local, sdata, 1); } } else netif_tx_stop_all_queues(sdata->dev); } - rcu_read_unlock(); + mutex_unlock(&local->iflist_mtx); - if (ssid) { - local->scan_ssid_len = ssid_len; - memcpy(local->scan_ssid, ssid, ssid_len); - } else - local->scan_ssid_len = 0; local->scan_state = SCAN_SET_CHANNEL; local->scan_channel_idx = 0; - local->scan_band = IEEE80211_BAND_2GHZ; local->scan_sdata = scan_sdata; + local->scan_req = req; netif_addr_lock_bh(local->mdev); local->filter_flags |= FIF_BCN_PRBRESP_PROMISC; @@ -673,13 +458,21 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, - u8 *ssid, size_t ssid_len) + struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_sta *ifsta; + struct ieee80211_if_managed *ifmgd; + + if (!req) + return -EINVAL; + + if (local->scan_req && local->scan_req != req) + return -EBUSY; + + local->scan_req = req; if (sdata->vif.type != NL80211_IFTYPE_STATION) - return ieee80211_start_scan(sdata, ssid, ssid_len); + return ieee80211_start_scan(sdata, req); /* * STA has a state machine that might need to defer scanning @@ -693,242 +486,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, return -EBUSY; } - ifsta = &sdata->u.sta; - - ifsta->scan_ssid_len = ssid_len; - if (ssid_len) - memcpy(ifsta->scan_ssid, ssid, ssid_len); - set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request); - queue_work(local->hw.workqueue, &ifsta->work); + ifmgd = &sdata->u.mgd; + set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request); + queue_work(local->hw.workqueue, &ifmgd->work); return 0; } - - -static void ieee80211_scan_add_ies(struct iw_request_info *info, - struct ieee80211_bss *bss, - char **current_ev, char *end_buf) -{ - u8 *pos, *end, *next; - struct iw_event iwe; - - if (bss == NULL || bss->ies == NULL) - return; - - /* - * If needed, fragment the IEs buffer (at IE boundaries) into short - * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. - */ - pos = bss->ies; - end = pos + bss->ies_len; - - while (end - pos > IW_GENERIC_IE_MAX) { - next = pos + 2 + pos[1]; - while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) - next = next + 2 + next[1]; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVGENIE; - iwe.u.data.length = next - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, pos); - - pos = next; - } - - if (end > pos) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVGENIE; - iwe.u.data.length = end - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, pos); - } -} - - -static char * -ieee80211_scan_result(struct ieee80211_local *local, - struct iw_request_info *info, - struct ieee80211_bss *bss, - char *current_ev, char *end_buf) -{ - struct iw_event iwe; - char *buf; - - if (time_after(jiffies, - bss->last_update + IEEE80211_SCAN_RESULT_EXPIRE)) - return current_ev; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWAP; - iwe.u.ap_addr.sa_family = ARPHRD_ETHER; - memcpy(iwe.u.ap_addr.sa_data, bss->bssid, ETH_ALEN); - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_ADDR_LEN); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWESSID; - if (bss_mesh_cfg(bss)) { - iwe.u.data.length = bss_mesh_id_len(bss); - iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, bss_mesh_id(bss)); - } else { - iwe.u.data.length = bss->ssid_len; - iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, bss->ssid); - } - - if (bss->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) - || bss_mesh_cfg(bss)) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWMODE; - if (bss_mesh_cfg(bss)) - iwe.u.mode = IW_MODE_MESH; - else if (bss->capability & WLAN_CAPABILITY_ESS) - iwe.u.mode = IW_MODE_MASTER; - else - iwe.u.mode = IW_MODE_ADHOC; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, - &iwe, IW_EV_UINT_LEN); - } - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq); - iwe.u.freq.e = 0; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = bss->freq; - iwe.u.freq.e = 6; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVQUAL; - iwe.u.qual.qual = bss->qual; - iwe.u.qual.level = bss->signal; - iwe.u.qual.noise = bss->noise; - iwe.u.qual.updated = local->wstats_flags; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_QUAL_LEN); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWENCODE; - if (bss->capability & WLAN_CAPABILITY_PRIVACY) - iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; - else - iwe.u.data.flags = IW_ENCODE_DISABLED; - iwe.u.data.length = 0; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, ""); - - ieee80211_scan_add_ies(info, bss, ¤t_ev, end_buf); - - if (bss->supp_rates_len > 0) { - /* display all supported rates in readable format */ - char *p = current_ev + iwe_stream_lcp_len(info); - int i; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = SIOCGIWRATE; - /* Those two flags are ignored... */ - iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; - - for (i = 0; i < bss->supp_rates_len; i++) { - iwe.u.bitrate.value = ((bss->supp_rates[i] & - 0x7f) * 500000); - p = iwe_stream_add_value(info, current_ev, p, - end_buf, &iwe, IW_EV_PARAM_LEN); - } - current_ev = p; - } - - buf = kmalloc(30, GFP_ATOMIC); - if (buf) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->timestamp)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, buf); - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, " Last beacon: %dms ago", - jiffies_to_msecs(jiffies - bss->last_update)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, &iwe, buf); - kfree(buf); - } - - if (bss_mesh_cfg(bss)) { - u8 *cfg = bss_mesh_cfg(bss); - buf = kmalloc(50, GFP_ATOMIC); - if (buf) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "Mesh network (version %d)", cfg[0]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Path Selection Protocol ID: " - "0x%02X%02X%02X%02X", cfg[1], cfg[2], cfg[3], - cfg[4]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Path Selection Metric ID: " - "0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7], - cfg[8]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Congestion Control Mode ID: " - "0x%02X%02X%02X%02X", cfg[9], cfg[10], - cfg[11], cfg[12]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - sprintf(buf, "Channel Precedence: " - "0x%02X%02X%02X%02X", cfg[13], cfg[14], - cfg[15], cfg[16]); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - kfree(buf); - } - } - - return current_ev; -} - - -int ieee80211_scan_results(struct ieee80211_local *local, - struct iw_request_info *info, - char *buf, size_t len) -{ - char *current_ev = buf; - char *end_buf = buf + len; - struct ieee80211_bss *bss; - - spin_lock_bh(&local->bss_lock); - list_for_each_entry(bss, &local->bss_list, list) { - if (buf + len - current_ev <= IW_EV_ADDR_LEN) { - spin_unlock_bh(&local->bss_lock); - return -E2BIG; - } - current_ev = ieee80211_scan_result(local, info, bss, - current_ev, end_buf); - } - spin_unlock_bh(&local->bss_lock); - return current_ev - buf; -} diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index f72bad636d8e..5f7a2624ed74 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -65,7 +65,7 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; msr_report->u.action.u.measurement.msr_elem.type = request_ie->type; - ieee80211_tx_skb(sdata, skb, 0); + ieee80211_tx_skb(sdata, skb, 1); } void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -84,3 +84,104 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, mgmt->sa, mgmt->bssid, mgmt->u.action.u.measurement.dialog_token); } + +void ieee80211_chswitch_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work); + struct ieee80211_bss *bss; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (!netif_running(sdata->dev)) + return; + + bss = ieee80211_rx_bss_get(sdata->local, ifmgd->bssid, + sdata->local->hw.conf.channel->center_freq, + ifmgd->ssid, ifmgd->ssid_len); + if (!bss) + goto exit; + + sdata->local->oper_channel = sdata->local->csa_channel; + /* XXX: shouldn't really modify cfg80211-owned data! */ + if (!ieee80211_hw_config(sdata->local, IEEE80211_CONF_CHANGE_CHANNEL)) + bss->cbss.channel = sdata->local->oper_channel; + + ieee80211_rx_bss_put(sdata->local, bss); +exit: + ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + ieee80211_wake_queues_by_reason(&sdata->local->hw, + IEEE80211_QUEUE_STOP_REASON_CSA); +} + +void ieee80211_chswitch_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work); +} + +void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel_sw_ie *sw_elem, + struct ieee80211_bss *bss) +{ + struct ieee80211_channel *new_ch; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num); + + /* FIXME: Handle ADHOC later */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATED) + return; + + if (sdata->local->sw_scanning || sdata->local->hw_scanning) + return; + + /* Disregard subsequent beacons if we are already running a timer + processing a CSA */ + + if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED) + return; + + new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); + if (!new_ch || new_ch->flags & IEEE80211_CHAN_DISABLED) + return; + + sdata->local->csa_channel = new_ch; + + if (sw_elem->count <= 1) { + queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work); + } else { + ieee80211_stop_queues_by_reason(&sdata->local->hw, + IEEE80211_QUEUE_STOP_REASON_CSA); + ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; + mod_timer(&ifmgd->chswitch_timer, + jiffies + + msecs_to_jiffies(sw_elem->count * + bss->cbss.beacon_interval)); + } +} + +void ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, + u16 capab_info, u8 *pwr_constr_elem, + u8 pwr_constr_elem_len) +{ + struct ieee80211_conf *conf = &sdata->local->hw.conf; + + if (!(capab_info & WLAN_CAPABILITY_SPECTRUM_MGMT)) + return; + + /* Power constraint IE length should be 1 octet */ + if (pwr_constr_elem_len != 1) + return; + + if ((*pwr_constr_elem <= conf->channel->max_power) && + (*pwr_constr_elem != sdata->local->power_constr_level)) { + sdata->local->power_constr_level = *pwr_constr_elem; + ieee80211_hw_config(sdata->local, 0); + } +} + diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 10c5539c20ab..4ba3c540fcf3 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -194,12 +194,53 @@ void sta_info_destroy(struct sta_info *sta) dev_kfree_skb_any(skb); for (i = 0; i < STA_TID_NUM; i++) { + struct tid_ampdu_rx *tid_rx; + struct tid_ampdu_tx *tid_tx; + spin_lock_bh(&sta->lock); - if (sta->ampdu_mlme.tid_rx[i]) - del_timer_sync(&sta->ampdu_mlme.tid_rx[i]->session_timer); - if (sta->ampdu_mlme.tid_tx[i]) - del_timer_sync(&sta->ampdu_mlme.tid_tx[i]->addba_resp_timer); + tid_rx = sta->ampdu_mlme.tid_rx[i]; + /* Make sure timer won't free the tid_rx struct, see below */ + if (tid_rx) + tid_rx->shutdown = true; + + /* + * The stop callback cannot find this station any more, but + * it didn't complete its work -- start the queue if necessary + */ + if (sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_INITIATOR_MSK && + sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_REQ_STOP_BA_MSK && + local->hw.ampdu_queues) + ieee80211_wake_queue_by_reason(&local->hw, + local->hw.queues + sta->tid_to_tx_q[i], + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + spin_unlock_bh(&sta->lock); + + /* + * Outside spinlock - shutdown is true now so that the timer + * won't free tid_rx, we have to do that now. Can't let the + * timer do it because we have to sync the timer outside the + * lock that it takes itself. + */ + if (tid_rx) { + del_timer_sync(&tid_rx->session_timer); + kfree(tid_rx); + } + + /* + * No need to do such complications for TX agg sessions, the + * path leading to freeing the tid_tx struct goes via a call + * from the driver, and thus needs to look up the sta struct + * again, which cannot be found when we get here. Hence, we + * just need to delete the timer and free the aggregation + * info; we won't be telling the peer about it then but that + * doesn't matter if we're not talking to it again anyway. + */ + tid_tx = sta->ampdu_mlme.tid_tx[i]; + if (tid_tx) { + del_timer_sync(&tid_tx->addba_resp_timer); + kfree(tid_tx); + } } __sta_info_free(local, sta); @@ -246,8 +287,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, * enable session_timer's data differentiation. refer to * sta_rx_agg_session_timer_expired for useage */ sta->timer_to_tid[i] = i; - /* tid to tx queue: initialize according to HW (0 is valid) */ - sta->tid_to_tx_q[i] = ieee80211_num_queues(&local->hw); + sta->tid_to_tx_q[i] = -1; /* rx */ sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE; sta->ampdu_mlme.tid_rx[i] = NULL; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index e49a5b99cf10..1f45573c580c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -34,6 +34,7 @@ * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next * frame to this station is transmitted. + * @WLAN_STA_MFP: Management frame protection is used with this STA. */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH = 1<<0, @@ -46,6 +47,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_WDS = 1<<7, WLAN_STA_PSPOLL = 1<<8, WLAN_STA_CLEAR_PS_FILT = 1<<9, + WLAN_STA_MFP = 1<<10, }; #define STA_TID_NUM 16 @@ -63,7 +65,6 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_OPERATIONAL (HT_ADDBA_REQUESTED_MSK | \ HT_ADDBA_DRV_READY_MSK | \ HT_ADDBA_RECEIVED_MSK) -#define HT_AGG_STATE_DEBUGFS_CTL BIT(7) /** * struct tid_ampdu_tx - TID aggregation information (Tx). @@ -87,8 +88,9 @@ struct tid_ampdu_tx { * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. * @buf_size: buffer size for incoming A-MPDUs - * @timeout: reset timer value. + * @timeout: reset timer value (in TUs). * @dialog_token: dialog token for aggregation session + * @shutdown: this session is being shut down due to STA removal */ struct tid_ampdu_rx { struct sk_buff **reorder_buf; @@ -99,6 +101,7 @@ struct tid_ampdu_rx { u16 buf_size; u16 timeout; u8 dialog_token; + bool shutdown; }; /** @@ -198,7 +201,7 @@ struct sta_ampdu_mlme { * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers - * @tid_to_tx_q: map tid to tx queue + * @tid_to_tx_q: map tid to tx queue (invalid == negative values) * @llid: Local link ID * @plid: Peer link ID * @reason: Cancel reason on PLINK_HOLDING state @@ -273,7 +276,7 @@ struct sta_info { */ struct sta_ampdu_mlme ampdu_mlme; u8 timer_to_tid[STA_TID_NUM]; - u8 tid_to_tx_q[STA_TID_NUM]; + s8 tid_to_tx_q[STA_TID_NUM]; #ifdef CONFIG_MAC80211_MESH /* @@ -382,8 +385,6 @@ static inline u32 get_sta_flags(struct sta_info *sta) } -/* Maximum number of concurrently registered stations */ -#define MAX_STA_COUNT 2007 #define STA_HASH_SIZE 256 #define STA_HASH(sta) (sta[5]) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 37e3d5ef7e3f..457238a2f3fc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -35,6 +35,7 @@ #define IEEE80211_TX_OK 0 #define IEEE80211_TX_AGAIN 1 #define IEEE80211_TX_FRAG_AGAIN 2 +#define IEEE80211_TX_PENDING 3 /* misc utils */ @@ -330,6 +331,22 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) return TX_CONTINUE; } +static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, + struct sk_buff *skb) +{ + if (!ieee80211_is_mgmt(fc)) + return 0; + + if (sta == NULL || !test_sta_flags(sta, WLAN_STA_MFP)) + return 0; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) + skb->data)) + return 0; + + return 1; +} + static ieee80211_tx_result ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) { @@ -409,11 +426,17 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key = NULL; else if (tx->sta && (key = rcu_dereference(tx->sta->key))) tx->key = key; + else if (ieee80211_is_mgmt(hdr->frame_control) && + (key = rcu_dereference(tx->sdata->default_mgmt_key))) + tx->key = key; else if ((key = rcu_dereference(tx->sdata->default_key))) tx->key = key; else if (tx->sdata->drop_unencrypted && (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) && - !(info->flags & IEEE80211_TX_CTL_INJECTED)) { + !(info->flags & IEEE80211_TX_CTL_INJECTED) && + (!ieee80211_is_robust_mgmt_frame(hdr) || + (ieee80211_is_action(hdr->frame_control) && + tx->sta && test_sta_flags(tx->sta, WLAN_STA_MFP)))) { I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted); return TX_DROP; } else @@ -428,10 +451,19 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (ieee80211_is_auth(hdr->frame_control)) break; case ALG_TKIP: - case ALG_CCMP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; + case ALG_CCMP: + if (!ieee80211_is_data_present(hdr->frame_control) && + !ieee80211_use_mfp(hdr->frame_control, tx->sta, + tx->skb)) + tx->key = NULL; + break; + case ALG_AES_CMAC: + if (!ieee80211_is_mgmt(hdr->frame_control)) + tx->key = NULL; + break; } } @@ -789,6 +821,8 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) return ieee80211_crypto_tkip_encrypt(tx); case ALG_CCMP: return ieee80211_crypto_ccmp_encrypt(tx); + case ALG_AES_CMAC: + return ieee80211_crypto_aes_cmac_encrypt(tx); } /* not reached */ @@ -844,7 +878,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) return TX_CONTINUE; } - /* actual transmit path */ /* @@ -984,12 +1017,20 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx, tx->sta = sta_info_get(local, hdr->addr1); if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) { + unsigned long flags; qc = ieee80211_get_qos_ctl(hdr); tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + spin_lock_irqsave(&tx->sta->lock, flags); state = &tx->sta->ampdu_mlme.tid_state_tx[tid]; - if (*state == HT_AGG_STATE_OPERATIONAL) + if (*state == HT_AGG_STATE_OPERATIONAL) { info->flags |= IEEE80211_TX_CTL_AMPDU; + if (local->hw.ampdu_queues) + skb_set_queue_mapping( + skb, tx->local->hw.queues + + tx->sta->tid_to_tx_q[tid]); + } + spin_unlock_irqrestore(&tx->sta->lock, flags); } if (is_multicast_ether_addr(hdr->addr1)) { @@ -1053,9 +1094,9 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb, int ret, i; if (skb) { - if (netif_subqueue_stopped(local->mdev, skb)) - return IEEE80211_TX_AGAIN; - info = IEEE80211_SKB_CB(skb); + if (ieee80211_queue_stopped(&local->hw, + skb_get_queue_mapping(skb))) + return IEEE80211_TX_PENDING; ret = local->ops->tx(local_to_hw(local), skb); if (ret) @@ -1070,8 +1111,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb, info = IEEE80211_SKB_CB(tx->extra_frag[i]); info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT); - if (netif_subqueue_stopped(local->mdev, - tx->extra_frag[i])) + if (ieee80211_queue_stopped(&local->hw, + skb_get_queue_mapping(tx->extra_frag[i]))) return IEEE80211_TX_FRAG_AGAIN; ret = local->ops->tx(local_to_hw(local), @@ -1181,8 +1222,9 @@ retry: * queues, there's no reason for a driver to reject * a frame there, warn and drop it. */ - if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) - goto drop; + if (ret != IEEE80211_TX_PENDING) + if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) + goto drop; store = &local->pending_packet[queue]; @@ -1298,6 +1340,19 @@ int ieee80211_master_start_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } + if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + local->hw.conf.dynamic_ps_timeout > 0) { + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_QUEUE_STOP_REASON_PS); + queue_work(local->hw.workqueue, + &local->dynamic_ps_disable_work); + } + + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); + } + memset(info, 0, sizeof(*info)); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; @@ -1392,10 +1447,31 @@ int ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_channel *chan = local->hw.conf.channel; struct ieee80211_radiotap_header *prthdr = (struct ieee80211_radiotap_header *)skb->data; u16 len_rthdr; + /* + * Frame injection is not allowed if beaconing is not allowed + * or if we need radar detection. Beaconing is usually not allowed when + * the mode or operation (Adhoc, AP, Mesh) does not support DFS. + * Passive scan is also used in world regulatory domains where + * your country is not known and as such it should be treated as + * NO TX unless the channel is explicitly allowed in which case + * your current regulatory domain would not have the passive scan + * flag. + * + * Since AP mode uses monitor interfaces to inject/TX management + * frames we can make AP mode the exception to this rule once it + * supports radar detection as its implementation can deal with + * radar detection by itself. We can do that later by adding a + * monitor flag interfaces used for AP support. + */ + if ((chan->flags & (IEEE80211_CHAN_NO_IBSS | IEEE80211_CHAN_RADAR | + IEEE80211_CHAN_PASSIVE_SCAN))) + goto fail; + /* check for not even having the fixed radiotap header part */ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) goto fail; /* too short to be possibly valid */ @@ -1479,19 +1555,6 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, goto fail; } - if (!(local->hw.flags & IEEE80211_HW_NO_STACK_DYNAMIC_PS) && - local->dynamic_ps_timeout > 0) { - if (local->hw.conf.flags & IEEE80211_CONF_PS) { - ieee80211_stop_queues_by_reason(&local->hw, - IEEE80211_QUEUE_STOP_REASON_PS); - queue_work(local->hw.workqueue, - &local->dynamic_ps_disable_work); - } - - mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(local->dynamic_ps_timeout)); - } - nh_pos = skb_network_header(skb) - skb->data; h_pos = skb_transport_header(skb) - skb->data; @@ -1572,7 +1635,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, case NL80211_IFTYPE_STATION: fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ - memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN); + memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; @@ -1581,7 +1644,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, sdata->u.sta.bssid, ETH_ALEN); + memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); hdrlen = 24; break; default: @@ -1867,7 +1930,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_if_ap *ap = NULL; - struct ieee80211_if_sta *ifsta = NULL; struct beacon_data *beacon; struct ieee80211_supported_band *sband; enum ieee80211_band band = local->hw.conf.channel->band; @@ -1919,13 +1981,13 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, } else goto out; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; - ifsta = &sdata->u.sta; - if (!ifsta->probe_resp) + if (!ifibss->probe_resp) goto out; - skb = skb_copy(ifsta->probe_resp, GFP_ATOMIC); + skb = skb_copy(ifibss->probe_resp, GFP_ATOMIC); if (!skb) goto out; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index fb89e1d0aa03..e0431a1d218b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -41,6 +41,15 @@ const unsigned char rfc1042_header[] __aligned(2) = const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; +struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) +{ + struct ieee80211_local *local; + BUG_ON(!wiphy); + + local = wiphy_priv(wiphy); + return &local->hw; +} +EXPORT_SYMBOL(wiphy_to_ieee80211_hw); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) @@ -335,15 +344,36 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - /* we don't need to track ampdu queues */ - if (queue < ieee80211_num_regular_queues(hw)) { - __clear_bit(reason, &local->queue_stop_reasons[queue]); + if (queue >= hw->queues) { + if (local->ampdu_ac_queue[queue - hw->queues] < 0) + return; + + /* + * for virtual aggregation queues, we need to refcount the + * internal mac80211 disable (multiple times!), keep track of + * driver disable _and_ make sure the regular queue is + * actually enabled. + */ + if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION) + local->amdpu_ac_stop_refcnt[queue - hw->queues]--; + else + __clear_bit(reason, &local->queue_stop_reasons[queue]); - if (local->queue_stop_reasons[queue] != 0) - /* someone still has this queue stopped */ + if (local->queue_stop_reasons[queue] || + local->amdpu_ac_stop_refcnt[queue - hw->queues]) return; + + /* now go on to treat the corresponding regular queue */ + queue = local->ampdu_ac_queue[queue - hw->queues]; + reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION; } + __clear_bit(reason, &local->queue_stop_reasons[queue]); + + if (local->queue_stop_reasons[queue] != 0) + /* someone still has this queue stopped */ + return; + if (test_bit(queue, local->queues_pending)) { set_bit(queue, local->queues_pending_run); tasklet_schedule(&local->tx_pending_tasklet); @@ -375,9 +405,27 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - /* we don't need to track ampdu queues */ - if (queue < ieee80211_num_regular_queues(hw)) - __set_bit(reason, &local->queue_stop_reasons[queue]); + if (queue >= hw->queues) { + if (local->ampdu_ac_queue[queue - hw->queues] < 0) + return; + + /* + * for virtual aggregation queues, we need to refcount the + * internal mac80211 disable (multiple times!), keep track of + * driver disable _and_ make sure the regular queue is + * actually enabled. + */ + if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION) + local->amdpu_ac_stop_refcnt[queue - hw->queues]++; + else + __set_bit(reason, &local->queue_stop_reasons[queue]); + + /* now go on to treat the corresponding regular queue */ + queue = local->ampdu_ac_queue[queue - hw->queues]; + reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION; + } + + __set_bit(reason, &local->queue_stop_reasons[queue]); netif_stop_subqueue(local->mdev, queue); } @@ -409,7 +457,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - for (i = 0; i < ieee80211_num_queues(hw); i++) + for (i = 0; i < hw->queues; i++) __ieee80211_stop_queue(hw, i, reason); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -425,6 +473,16 @@ EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + + if (queue >= hw->queues) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + queue = local->ampdu_ac_queue[queue - hw->queues]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + if (queue < 0) + return true; + } + return __netif_subqueue_stopped(local->mdev, queue); } EXPORT_SYMBOL(ieee80211_queue_stopped); @@ -459,7 +517,7 @@ void ieee80211_iterate_active_interfaces( struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; - rtnl_lock(); + mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { switch (sdata->vif.type) { @@ -480,7 +538,7 @@ void ieee80211_iterate_active_interfaces( &sdata->vif); } - rtnl_unlock(); + mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); @@ -653,6 +711,10 @@ void ieee802_11_parse_elems(u8 *start, size_t len, elems->pwr_constr_elem = pos; elems->pwr_constr_elem_len = elen; break; + case WLAN_EID_TIMEOUT_INTERVAL: + elems->timeout_int = pos; + elems->timeout_int_len = elen; + break; default: break; } @@ -688,6 +750,27 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata) local->ops->conf_tx(local_to_hw(local), i, &qparam); } +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates) +{ + struct ieee80211_local *local = sdata->local; + int i, have_higher_than_11mbit = 0; + + /* cf. IEEE 802.11 9.2.12 */ + for (i = 0; i < supp_rates_len; i++) + if ((supp_rates[i] & 0x7f) * 5 > 110) + have_higher_than_11mbit = 1; + + if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && + have_higher_than_11mbit) + sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; + else + sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; + + ieee80211_set_wmm_default(sdata); +} + void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int encrypt) { @@ -727,12 +810,12 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz) return ret; } -u64 ieee80211_mandatory_rates(struct ieee80211_local *local, +u32 ieee80211_mandatory_rates(struct ieee80211_local *local, enum ieee80211_band band) { struct ieee80211_supported_band *sband; struct ieee80211_rate *bitrates; - u64 mandatory_rates; + u32 mandatory_rates; enum ieee80211_rate_flags mandatory_flag; int i; @@ -754,3 +837,161 @@ u64 ieee80211_mandatory_rates(struct ieee80211_local *local, mandatory_rates |= BIT(i); return mandatory_rates; } + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, + const u8 *bssid, int encrypt) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + const u8 *ie_auth = NULL; + int ie_auth_len = 0; + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + ie_auth_len = sdata->u.mgd.ie_auth_len; + ie_auth = sdata->u.mgd.ie_auth; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*mgmt) + 6 + extra_len + ie_auth_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for auth " + "frame\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); + memset(mgmt, 0, 24 + 6); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_AUTH); + if (encrypt) + mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + memcpy(mgmt->da, bssid, ETH_ALEN); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); + mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); + mgmt->u.auth.status_code = cpu_to_le16(0); + if (extra) + memcpy(skb_put(skb, extra_len), extra, extra_len); + if (ie_auth) + memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len); + + ieee80211_tx_skb(sdata, skb, encrypt); +} + +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + u8 *ssid, size_t ssid_len, + u8 *ie, size_t ie_len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL; + int i, extra_preq_ie_len = 0; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + extra_preq_ie_len = sdata->u.mgd.ie_probereq_len; + extra_preq_ie = sdata->u.mgd.ie_probereq; + break; + default: + break; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 + + ie_len + extra_preq_ie_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for probe " + "request\n", sdata->dev->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ); + memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); + if (dst) { + memcpy(mgmt->da, dst, ETH_ALEN); + memcpy(mgmt->bssid, dst, ETH_ALEN); + } else { + memset(mgmt->da, 0xff, ETH_ALEN); + memset(mgmt->bssid, 0xff, ETH_ALEN); + } + pos = skb_put(skb, 2 + ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ssid_len; + memcpy(pos, ssid, ssid_len); + + supp_rates = skb_put(skb, 2); + supp_rates[0] = WLAN_EID_SUPP_RATES; + supp_rates[1] = 0; + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + for (i = 0; i < sband->n_bitrates; i++) { + struct ieee80211_rate *rate = &sband->bitrates[i]; + if (esupp_rates) { + pos = skb_put(skb, 1); + esupp_rates[1]++; + } else if (supp_rates[1] == 8) { + esupp_rates = skb_put(skb, 3); + esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES; + esupp_rates[1] = 1; + pos = &esupp_rates[2]; + } else { + pos = skb_put(skb, 1); + supp_rates[1]++; + } + *pos = rate->bitrate / 5; + } + + if (ie) + memcpy(skb_put(skb, ie_len), ie, ie_len); + if (extra_preq_ie) + memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie, + extra_preq_ie_len); + + ieee80211_tx_skb(sdata, skb, 0); +} + +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_rate *bitrates; + size_t num_rates; + u32 supp_rates; + int i, j; + sband = local->hw.wiphy->bands[band]; + + if (!sband) { + WARN_ON(1); + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + } + + bitrates = sband->bitrates; + num_rates = sband->n_bitrates; + supp_rates = 0; + for (i = 0; i < elems->supp_rates_len + + elems->ext_supp_rates_len; i++) { + u8 rate = 0; + int own_rate; + if (i < elems->supp_rates_len) + rate = elems->supp_rates[i]; + else if (elems->ext_supp_rates) + rate = elems->ext_supp_rates + [i - elems->supp_rates_len]; + own_rate = 5 * (rate & 0x7f); + for (j = 0; j < num_rates; j++) + if (bitrates[j].bitrate == own_rate) + supp_rates |= BIT(j); + } + return supp_rates; +} diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 7162d5816f39..935c63ed3dfa 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -37,7 +37,14 @@ static int ieee80211_set_encryption(struct ieee80211_sub_if_data *sdata, u8 *sta struct ieee80211_key *key; int err; - if (idx < 0 || idx >= NUM_DEFAULT_KEYS) { + if (alg == ALG_AES_CMAC) { + if (idx < NUM_DEFAULT_KEYS || + idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) { + printk(KERN_DEBUG "%s: set_encrypt - invalid idx=%d " + "(BIP)\n", sdata->dev->name, idx); + return -EINVAL; + } + } else if (idx < 0 || idx >= NUM_DEFAULT_KEYS) { printk(KERN_DEBUG "%s: set_encrypt - invalid idx=%d\n", sdata->dev->name, idx); return -EINVAL; @@ -103,6 +110,9 @@ static int ieee80211_set_encryption(struct ieee80211_sub_if_data *sdata, u8 *sta if (set_tx_key || (!sta && !sdata->default_key && key)) ieee80211_set_default_key(sdata, idx); + if (alg == ALG_AES_CMAC && + (set_tx_key || (!sta && !sdata->default_mgmt_key && key))) + ieee80211_set_default_mgmt_key(sdata, idx); } out_unlock: @@ -122,122 +132,37 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev, if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) return -EOPNOTSUPP; - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length); if (ret) return ret; - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + ieee80211_sta_req_auth(sdata); return 0; } return -EOPNOTSUPP; } -static int ieee80211_ioctl_giwrange(struct net_device *dev, - struct iw_request_info *info, - struct iw_point *data, char *extra) -{ - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct iw_range *range = (struct iw_range *) extra; - enum ieee80211_band band; - int c = 0; - - data->length = sizeof(struct iw_range); - memset(range, 0, sizeof(struct iw_range)); - - range->we_version_compiled = WIRELESS_EXT; - range->we_version_source = 21; - range->retry_capa = IW_RETRY_LIMIT; - range->retry_flags = IW_RETRY_LIMIT; - range->min_retry = 0; - range->max_retry = 255; - range->min_rts = 0; - range->max_rts = 2347; - range->min_frag = 256; - range->max_frag = 2346; - - range->encoding_size[0] = 5; - range->encoding_size[1] = 13; - range->num_encoding_sizes = 2; - range->max_encoding_tokens = NUM_DEFAULT_KEYS; - - if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC || - local->hw.flags & IEEE80211_HW_SIGNAL_DB) - range->max_qual.level = local->hw.max_signal; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - range->max_qual.level = -110; - else - range->max_qual.level = 0; - - if (local->hw.flags & IEEE80211_HW_NOISE_DBM) - range->max_qual.noise = -110; - else - range->max_qual.noise = 0; - - range->max_qual.qual = 100; - range->max_qual.updated = local->wstats_flags; - - range->avg_qual.qual = 50; - /* not always true but better than nothing */ - range->avg_qual.level = range->max_qual.level / 2; - range->avg_qual.noise = range->max_qual.noise / 2; - range->avg_qual.updated = local->wstats_flags; - - range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 | - IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP; - - - for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { - int i; - struct ieee80211_supported_band *sband; - - sband = local->hw.wiphy->bands[band]; - - if (!sband) - continue; - - for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { - struct ieee80211_channel *chan = &sband->channels[i]; - - if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { - range->freq[c].i = - ieee80211_frequency_to_channel( - chan->center_freq); - range->freq[c].m = chan->center_freq; - range->freq[c].e = 6; - c++; - } - } - } - range->num_channels = c; - range->num_frequency = c; - - IW_EVENT_CAPA_SET_KERNEL(range->event_capa); - IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); - IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); - - range->scan_capa |= IW_SCAN_CAPA_ESSID; - - return 0; -} - - static int ieee80211_ioctl_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION) - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; /* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */ if (freq->e == 0) { if (freq->m < 0) { - if (sdata->vif.type == NL80211_IFTYPE_STATION) - sdata->u.sta.flags |= + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->u.ibss.flags |= + IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_CHANNEL_SEL; return 0; } else @@ -274,32 +199,35 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, { struct ieee80211_sub_if_data *sdata; size_t len = data->length; + int ret; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - int ret; + if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { if (len > IEEE80211_MAX_SSID_LEN) return -EINVAL; - memcpy(sdata->u.sta.ssid, ssid, len); - sdata->u.sta.ssid_len = len; + memcpy(sdata->u.mgd.ssid, ssid, len); + sdata->u.mgd.ssid_len = len; return 0; } + if (data->flags) - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; else - sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL; + ret = ieee80211_sta_set_ssid(sdata, ssid, len); if (ret) return ret; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + + ieee80211_sta_req_auth(sdata); return 0; - } + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + return ieee80211_ibss_set_ssid(sdata, ssid, len); return -EOPNOTSUPP; } @@ -313,8 +241,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int res = ieee80211_sta_get_ssid(sdata, ssid, &len); if (res == 0) { data->length = len; @@ -322,6 +249,14 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, } else data->flags = 0; return res; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + int res = ieee80211_ibss_get_ssid(sdata, ssid, &len); + if (res == 0) { + data->length = len; + data->flags = 1; + } else + data->flags = 0; + return res; } return -EOPNOTSUPP; @@ -335,26 +270,35 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { int ret; if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { - memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data, + memcpy(sdata->u.mgd.bssid, (u8 *) &ap_addr->sa_data, ETH_ALEN); return 0; } if (is_zero_ether_addr((u8 *) &ap_addr->sa_data)) - sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL | + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL; else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data)) - sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL; else - sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; + sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data); if (ret) return ret; - ieee80211_sta_req_auth(sdata, &sdata->u.sta); + ieee80211_sta_req_auth(sdata); return 0; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (is_zero_ether_addr((u8 *) &ap_addr->sa_data)) + sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL | + IEEE80211_IBSS_AUTO_CHANNEL_SEL; + else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data)) + sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL; + else + sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_BSSID_SEL; + + return ieee80211_ibss_set_bssid(sdata, (u8 *) &ap_addr->sa_data); } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { /* * If it is necessary to update the WDS peer address @@ -383,17 +327,20 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) { - if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED || - sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATED) { ap_addr->sa_family = ARPHRD_ETHER; - memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); - return 0; - } else { + memcpy(&ap_addr->sa_data, sdata->u.mgd.bssid, ETH_ALEN); + } else memset(&ap_addr->sa_data, 0, ETH_ALEN); - return 0; - } + return 0; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (sdata->u.ibss.state == IEEE80211_IBSS_MLME_JOINED) { + ap_addr->sa_family = ARPHRD_ETHER; + memcpy(&ap_addr->sa_data, sdata->u.ibss.bssid, ETH_ALEN); + } else + memset(&ap_addr->sa_data, 0, ETH_ALEN); + return 0; } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { ap_addr->sa_family = ARPHRD_ETHER; memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN); @@ -404,58 +351,6 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, } -static int ieee80211_ioctl_siwscan(struct net_device *dev, - struct iw_request_info *info, - union iwreq_data *wrqu, char *extra) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct iw_scan_req *req = NULL; - u8 *ssid = NULL; - size_t ssid_len = 0; - - if (!netif_running(dev)) - return -ENETDOWN; - - if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC && - sdata->vif.type != NL80211_IFTYPE_MESH_POINT) - return -EOPNOTSUPP; - - /* if SSID was specified explicitly then use that */ - if (wrqu->data.length == sizeof(struct iw_scan_req) && - wrqu->data.flags & IW_SCAN_THIS_ESSID) { - req = (struct iw_scan_req *)extra; - ssid = req->essid; - ssid_len = req->essid_len; - } - - return ieee80211_request_scan(sdata, ssid, ssid_len); -} - - -static int ieee80211_ioctl_giwscan(struct net_device *dev, - struct iw_request_info *info, - struct iw_point *data, char *extra) -{ - int res; - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct ieee80211_sub_if_data *sdata; - - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - if (local->sw_scanning || local->hw_scanning) - return -EAGAIN; - - res = ieee80211_scan_results(local, info, extra, data->length); - if (res >= 0) { - data->length = res; - return 0; - } - data->length = 0; - return res; -} - - static int ieee80211_ioctl_siwrate(struct net_device *dev, struct iw_request_info *info, struct iw_param *rate, char *extra) @@ -511,7 +406,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev, rcu_read_lock(); - sta = sta_info_get(local, sdata->u.sta.bssid); + sta = sta_info_get(local, sdata->u.mgd.bssid); if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)) rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate; @@ -549,10 +444,9 @@ static int ieee80211_ioctl_siwtxpower(struct net_device *dev, else /* Automatic power level setting */ new_power_level = chan->max_power; - if (local->hw.conf.power_level != new_power_level) { - local->hw.conf.power_level = new_power_level; + local->user_power_level = new_power_level; + if (local->hw.conf.power_level != new_power_level) reconf_flags |= IEEE80211_CONF_CHANGE_POWER; - } if (local->hw.conf.radio_enabled != !(data->txpower.disabled)) { local->hw.conf.radio_enabled = !(data->txpower.disabled); @@ -713,8 +607,7 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev, struct iw_mlme *mlme = (struct iw_mlme *) extra; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC) + if (!(sdata->vif.type == NL80211_IFTYPE_STATION)) return -EINVAL; switch (mlme->cmd) { @@ -810,8 +703,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev, erq->flags |= IW_ENCODE_ENABLED; if (sdata->vif.type == NL80211_IFTYPE_STATION) { - struct ieee80211_if_sta *ifsta = &sdata->u.sta; - switch (ifsta->auth_alg) { + switch (sdata->u.mgd.auth_alg) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: erq->flags |= IW_ENCODE_OPEN; @@ -836,6 +728,9 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev, int ret = 0, timeout = 0; bool ps; + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + return -EOPNOTSUPP; + if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; @@ -852,31 +747,49 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev, ps = true; break; default: /* Otherwise we ignore */ - break; + return -EINVAL; } + if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT)) + return -EINVAL; + if (wrq->flags & IW_POWER_TIMEOUT) timeout = wrq->value / 1000; -set: - if (ps == local->powersave && timeout == local->dynamic_ps_timeout) + set: + if (ps == local->powersave && timeout == conf->dynamic_ps_timeout) return ret; local->powersave = ps; - local->dynamic_ps_timeout = timeout; + conf->dynamic_ps_timeout = timeout; - if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { - if (!(local->hw.flags & IEEE80211_HW_NO_STACK_DYNAMIC_PS) && - local->dynamic_ps_timeout > 0) - mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(local->dynamic_ps_timeout)); - else { - if (local->powersave) - conf->flags |= IEEE80211_CONF_PS; - else - conf->flags &= ~IEEE80211_CONF_PS; + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + ret = ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT); + + if (!(sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED)) + return ret; + + if (conf->dynamic_ps_timeout > 0 && + !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) { + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(conf->dynamic_ps_timeout)); + } else { + if (local->powersave) { + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + ieee80211_send_nullfunc(local, sdata, 1); + conf->flags |= IEEE80211_CONF_PS; + ret = ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + } else { + conf->flags &= ~IEEE80211_CONF_PS; + ret = ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + ieee80211_send_nullfunc(local, sdata, 0); + del_timer_sync(&local->dynamic_ps_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); } - ret = ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } return ret; @@ -903,11 +816,22 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_WPA_VERSION: - case IW_AUTH_CIPHER_PAIRWISE: case IW_AUTH_CIPHER_GROUP: case IW_AUTH_WPA_ENABLED: case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_KEY_MGMT: + case IW_AUTH_CIPHER_GROUP_MGMT: + break; + case IW_AUTH_CIPHER_PAIRWISE: + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (data->value & (IW_AUTH_CIPHER_WEP40 | + IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP)) + sdata->u.mgd.flags |= + IEEE80211_STA_TKIP_WEP_USED; + else + sdata->u.mgd.flags &= + ~IEEE80211_STA_TKIP_WEP_USED; + } break; case IW_AUTH_DROP_UNENCRYPTED: sdata->drop_unencrypted = !!data->value; @@ -916,24 +840,45 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) ret = -EINVAL; else { - sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; + sdata->u.mgd.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; /* * Privacy invoked by wpa_supplicant, store the * value and allow associating to a protected * network without having a key up front. */ if (data->value) - sdata->u.sta.flags |= + sdata->u.mgd.flags |= IEEE80211_STA_PRIVACY_INVOKED; } break; case IW_AUTH_80211_AUTH_ALG: - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - sdata->u.sta.auth_algs = data->value; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sdata->u.mgd.auth_algs = data->value; else ret = -EOPNOTSUPP; break; + case IW_AUTH_MFP: + if (!(sdata->local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) { + ret = -EOPNOTSUPP; + break; + } + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + switch (data->value) { + case IW_AUTH_MFP_DISABLED: + sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED; + break; + case IW_AUTH_MFP_OPTIONAL: + sdata->u.mgd.mfp = IEEE80211_MFP_OPTIONAL; + break; + case IW_AUTH_MFP_REQUIRED: + sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED; + break; + default: + ret = -EINVAL; + } + } else + ret = -EOPNOTSUPP; + break; default: ret = -EOPNOTSUPP; break; @@ -951,9 +896,9 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev rcu_read_lock(); - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - sta = sta_info_get(local, sdata->u.sta.bssid); + if (sdata->vif.type == NL80211_IFTYPE_STATION) + sta = sta_info_get(local, sdata->u.mgd.bssid); + if (!sta) { wstats->discard.fragment = 0; wstats->discard.misc = 0; @@ -962,10 +907,45 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev wstats->qual.noise = 0; wstats->qual.updated = IW_QUAL_ALL_INVALID; } else { - wstats->qual.level = sta->last_signal; - wstats->qual.qual = sta->last_qual; - wstats->qual.noise = sta->last_noise; - wstats->qual.updated = local->wstats_flags; + wstats->qual.updated = 0; + /* + * mirror what cfg80211 does for iwrange/scan results, + * otherwise userspace gets confused. + */ + if (local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC | + IEEE80211_HW_SIGNAL_DBM)) { + wstats->qual.updated |= IW_QUAL_LEVEL_UPDATED; + wstats->qual.updated |= IW_QUAL_QUAL_UPDATED; + } else { + wstats->qual.updated |= IW_QUAL_LEVEL_INVALID; + wstats->qual.updated |= IW_QUAL_QUAL_INVALID; + } + + if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + wstats->qual.level = sta->last_signal; + wstats->qual.qual = sta->last_signal; + } else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + int sig = sta->last_signal; + + wstats->qual.updated |= IW_QUAL_DBM; + wstats->qual.level = sig; + if (sig < -110) + sig = -110; + else if (sig > -40) + sig = -40; + wstats->qual.qual = sig + 110; + } + + if (local->hw.flags & IEEE80211_HW_NOISE_DBM) { + /* + * This assumes that if driver reports noise, it also + * reports signal in dBm. + */ + wstats->qual.noise = sta->last_noise; + wstats->qual.updated |= IW_QUAL_NOISE_UPDATED; + } else { + wstats->qual.updated |= IW_QUAL_NOISE_INVALID; + } } rcu_read_unlock(); @@ -982,9 +962,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev, switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_80211_AUTH_ALG: - if (sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) - data->value = sdata->u.sta.auth_algs; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + data->value = sdata->u.mgd.auth_algs; else ret = -EOPNOTSUPP; break; @@ -1017,6 +996,9 @@ static int ieee80211_ioctl_siwencodeext(struct net_device *dev, case IW_ENCODE_ALG_CCMP: alg = ALG_CCMP; break; + case IW_ENCODE_ALG_AES_CMAC: + alg = ALG_AES_CMAC; + break; default: return -EOPNOTSUPP; } @@ -1025,20 +1007,41 @@ static int ieee80211_ioctl_siwencodeext(struct net_device *dev, remove = 1; idx = erq->flags & IW_ENCODE_INDEX; - if (idx < 1 || idx > 4) { - idx = -1; - if (!sdata->default_key) - idx = 0; - else for (i = 0; i < NUM_DEFAULT_KEYS; i++) { - if (sdata->default_key == sdata->keys[i]) { - idx = i; - break; + if (alg == ALG_AES_CMAC) { + if (idx < NUM_DEFAULT_KEYS + 1 || + idx > NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) { + idx = -1; + if (!sdata->default_mgmt_key) + idx = 0; + else for (i = NUM_DEFAULT_KEYS; + i < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS; + i++) { + if (sdata->default_mgmt_key == sdata->keys[i]) + { + idx = i; + break; + } } - } - if (idx < 0) - return -EINVAL; - } else - idx--; + if (idx < 0) + return -EINVAL; + } else + idx--; + } else { + if (idx < 1 || idx > 4) { + idx = -1; + if (!sdata->default_key) + idx = 0; + else for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + if (sdata->default_key == sdata->keys[i]) { + idx = i; + break; + } + } + if (idx < 0) + return -EINVAL; + } else + idx--; + } return ieee80211_set_encryption(sdata, ext->addr.sa_data, idx, alg, remove, @@ -1063,7 +1066,7 @@ static const iw_handler ieee80211_handler[] = (iw_handler) NULL, /* SIOCSIWSENS */ (iw_handler) NULL, /* SIOCGIWSENS */ (iw_handler) NULL /* not used */, /* SIOCSIWRANGE */ - (iw_handler) ieee80211_ioctl_giwrange, /* SIOCGIWRANGE */ + (iw_handler) cfg80211_wext_giwrange, /* SIOCGIWRANGE */ (iw_handler) NULL /* not used */, /* SIOCSIWPRIV */ (iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */ (iw_handler) NULL /* not used */, /* SIOCSIWSTATS */ @@ -1076,8 +1079,8 @@ static const iw_handler ieee80211_handler[] = (iw_handler) ieee80211_ioctl_giwap, /* SIOCGIWAP */ (iw_handler) ieee80211_ioctl_siwmlme, /* SIOCSIWMLME */ (iw_handler) NULL, /* SIOCGIWAPLIST */ - (iw_handler) ieee80211_ioctl_siwscan, /* SIOCSIWSCAN */ - (iw_handler) ieee80211_ioctl_giwscan, /* SIOCGIWSCAN */ + (iw_handler) cfg80211_wext_siwscan, /* SIOCSIWSCAN */ + (iw_handler) cfg80211_wext_giwscan, /* SIOCGIWSCAN */ (iw_handler) ieee80211_ioctl_siwessid, /* SIOCSIWESSID */ (iw_handler) ieee80211_ioctl_giwessid, /* SIOCGIWESSID */ (iw_handler) NULL, /* SIOCSIWNICKN */ diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index ac71b38f7cb5..0b8ad1f4ecdd 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -99,10 +99,13 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb) /* in case we are a client verify acm is not set for this ac */ while (unlikely(local->wmm_acm & BIT(skb->priority))) { if (wme_downgrade_ac(skb)) { - /* The old code would drop the packet in this - * case. + /* + * This should not really happen. The AP has marked all + * lower ACs to require admission control which is not + * a reasonable configuration. Allow the frame to be + * transmitted using AC_BK as a workaround. */ - return 0; + break; } } @@ -114,9 +117,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) { struct ieee80211_master_priv *mpriv = netdev_priv(dev); struct ieee80211_local *local = mpriv->local; - struct ieee80211_hw *hw = &local->hw; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct sta_info *sta; u16 queue; u8 tid; @@ -124,29 +125,11 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) if (unlikely(queue >= local->hw.queues)) queue = local->hw.queues - 1; - if (skb->requeue) { - if (!hw->ampdu_queues) - return queue; - - rcu_read_lock(); - sta = sta_info_get(local, hdr->addr1); - tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; - if (sta) { - int ampdu_queue = sta->tid_to_tx_q[tid]; - - if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) - queue = ampdu_queue; - } - rcu_read_unlock(); - - return queue; - } - - /* Now we know the 1d priority, fill in the QoS header if - * there is one. + /* + * Now we know the 1d priority, fill in the QoS header if + * there is one (and we haven't done this before). */ - if (ieee80211_is_data_qos(hdr->frame_control)) { + if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); u8 ack_policy = 0; tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; @@ -156,140 +139,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) /* qos header is 2 bytes, second reserved */ *p++ = ack_policy | tid; *p = 0; - - if (!hw->ampdu_queues) - return queue; - - rcu_read_lock(); - - sta = sta_info_get(local, hdr->addr1); - if (sta) { - int ampdu_queue = sta->tid_to_tx_q[tid]; - - if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) - queue = ampdu_queue; - } - - rcu_read_unlock(); } return queue; } - -int ieee80211_ht_agg_queue_add(struct ieee80211_local *local, - struct sta_info *sta, u16 tid) -{ - int i; - - /* XXX: currently broken due to cb/requeue use */ - return -EPERM; - - /* prepare the filter and save it for the SW queue - * matching the received HW queue */ - - if (!local->hw.ampdu_queues) - return -EPERM; - - /* try to get a Qdisc from the pool */ - for (i = local->hw.queues; i < ieee80211_num_queues(&local->hw); i++) - if (!test_and_set_bit(i, local->queue_pool)) { - ieee80211_stop_queue(local_to_hw(local), i); - sta->tid_to_tx_q[tid] = i; - - /* IF there are already pending packets - * on this tid first we need to drain them - * on the previous queue - * since HT is strict in order */ -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "allocated aggregation queue" - " %d tid %d addr %pM pool=0x%lX\n", - i, tid, sta->sta.addr, - local->queue_pool[0]); -#endif /* CONFIG_MAC80211_HT_DEBUG */ - return 0; - } - - return -EAGAIN; -} - -/** - * the caller needs to hold netdev_get_tx_queue(local->mdev, X)->lock - */ -void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local, - struct sta_info *sta, u16 tid, - u8 requeue) -{ - int agg_queue = sta->tid_to_tx_q[tid]; - struct ieee80211_hw *hw = &local->hw; - - /* return the qdisc to the pool */ - clear_bit(agg_queue, local->queue_pool); - sta->tid_to_tx_q[tid] = ieee80211_num_queues(hw); - - if (requeue) { - ieee80211_requeue(local, agg_queue); - } else { - struct netdev_queue *txq; - spinlock_t *root_lock; - struct Qdisc *q; - - txq = netdev_get_tx_queue(local->mdev, agg_queue); - q = rcu_dereference(txq->qdisc); - root_lock = qdisc_lock(q); - - spin_lock_bh(root_lock); - qdisc_reset(q); - spin_unlock_bh(root_lock); - } -} - -void ieee80211_requeue(struct ieee80211_local *local, int queue) -{ - struct netdev_queue *txq = netdev_get_tx_queue(local->mdev, queue); - struct sk_buff_head list; - spinlock_t *root_lock; - struct Qdisc *qdisc; - u32 len; - - rcu_read_lock_bh(); - - qdisc = rcu_dereference(txq->qdisc); - if (!qdisc || !qdisc->dequeue) - goto out_unlock; - - skb_queue_head_init(&list); - - root_lock = qdisc_root_lock(qdisc); - spin_lock(root_lock); - for (len = qdisc->q.qlen; len > 0; len--) { - struct sk_buff *skb = qdisc->dequeue(qdisc); - - if (skb) - __skb_queue_tail(&list, skb); - } - spin_unlock(root_lock); - - for (len = list.qlen; len > 0; len--) { - struct sk_buff *skb = __skb_dequeue(&list); - u16 new_queue; - - BUG_ON(!skb); - new_queue = ieee80211_select_queue(local->mdev, skb); - skb_set_queue_mapping(skb, new_queue); - - txq = netdev_get_tx_queue(local->mdev, new_queue); - - - qdisc = rcu_dereference(txq->qdisc); - root_lock = qdisc_root_lock(qdisc); - - spin_lock(root_lock); - qdisc_enqueue_root(skb, qdisc); - spin_unlock(root_lock); - } - -out_unlock: - rcu_read_unlock_bh(); -} diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h index bc62f28a4d3d..7520d2e014dc 100644 --- a/net/mac80211/wme.h +++ b/net/mac80211/wme.h @@ -21,11 +21,5 @@ extern const int ieee802_1d_to_ac[8]; u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb); -int ieee80211_ht_agg_queue_add(struct ieee80211_local *local, - struct sta_info *sta, u16 tid); -void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local, - struct sta_info *sta, u16 tid, - u8 requeue); -void ieee80211_requeue(struct ieee80211_local *local, int queue); #endif /* _WME_H */ diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 7aa63caf8d50..9101b48ec2ae 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -1,5 +1,6 @@ /* * Copyright 2002-2004, Instant802 Networks, Inc. + * Copyright 2008, Jouni Malinen <j@w1.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,6 +20,7 @@ #include "michael.h" #include "tkip.h" #include "aes_ccm.h" +#include "aes_cmac.h" #include "wpa.h" ieee80211_tx_result @@ -266,7 +268,7 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, int encrypted) { __le16 mask_fc; - int a4_included; + int a4_included, mgmt; u8 qos_tid; u8 *b_0, *aad; u16 data_len, len_a; @@ -277,12 +279,15 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, aad = scratch + 4 * AES_BLOCK_LEN; /* - * Mask FC: zero subtype b4 b5 b6 + * Mask FC: zero subtype b4 b5 b6 (if not mgmt) * Retry, PwrMgt, MoreData; set Protected */ + mgmt = ieee80211_is_mgmt(hdr->frame_control); mask_fc = hdr->frame_control; - mask_fc &= ~cpu_to_le16(0x0070 | IEEE80211_FCTL_RETRY | + mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); + if (!mgmt) + mask_fc &= ~cpu_to_le16(0x0070); mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -300,8 +305,10 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, /* First block, b_0 */ b_0[0] = 0x59; /* flags: Adata: 1, M: 011, L: 001 */ - /* Nonce: QoS Priority | A2 | PN */ - b_0[1] = qos_tid; + /* Nonce: Nonce Flags | A2 | PN + * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) + */ + b_0[1] = qos_tid | (mgmt << 4); memcpy(&b_0[2], hdr->addr2, ETH_ALEN); memcpy(&b_0[8], pn, CCMP_PN_LEN); /* l(m) */ @@ -360,9 +367,14 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) int hdrlen, len, tail; u8 *pos, *pn; int i; + bool skip_hw; + + skip_hw = (tx->key->conf.flags & IEEE80211_KEY_FLAG_SW_MGMT) && + ieee80211_is_mgmt(hdr->frame_control); if ((tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) && - !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) && + !skip_hw) { /* hwaccel - with no need for preallocated room for CCMP * header or MIC fields */ info->control.hw_key = &tx->key->conf; @@ -397,7 +409,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) ccmp_pn2hdr(pos, pn, key->conf.keyidx); - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + if ((key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) && !skip_hw) { /* hwaccel - with preallocated room for CCMP header */ info->control.hw_key = &tx->key->conf; return 0; @@ -446,7 +458,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); - if (!ieee80211_is_data(hdr->frame_control)) + if (!ieee80211_is_data(hdr->frame_control) && + !ieee80211_is_robust_mgmt_frame(hdr)) return RX_CONTINUE; data_len = skb->len - hdrlen - CCMP_HDR_LEN - CCMP_MIC_LEN; @@ -485,3 +498,126 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } + + +static void bip_aad(struct sk_buff *skb, u8 *aad) +{ + /* BIP AAD: FC(masked) || A1 || A2 || A3 */ + + /* FC type/subtype */ + aad[0] = skb->data[0]; + /* Mask FC Retry, PwrMgt, MoreData flags to zero */ + aad[1] = skb->data[1] & ~(BIT(4) | BIT(5) | BIT(6)); + /* A1 || A2 || A3 */ + memcpy(aad + 2, skb->data + 4, 3 * ETH_ALEN); +} + + +static inline void bip_ipn_swap(u8 *d, const u8 *s) +{ + *d++ = s[5]; + *d++ = s[4]; + *d++ = s[3]; + *d++ = s[2]; + *d++ = s[1]; + *d = s[0]; +} + + +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_key *key = tx->key; + struct ieee80211_mmie *mmie; + u8 *pn, aad[20]; + int i; + + if (tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + /* hwaccel */ + info->control.hw_key = &tx->key->conf; + return 0; + } + + if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + return TX_DROP; + + mmie = (struct ieee80211_mmie *) skb_put(skb, sizeof(*mmie)); + mmie->element_id = WLAN_EID_MMIE; + mmie->length = sizeof(*mmie) - 2; + mmie->key_id = cpu_to_le16(key->conf.keyidx); + + /* PN = PN + 1 */ + pn = key->u.aes_cmac.tx_pn; + + for (i = sizeof(key->u.aes_cmac.tx_pn) - 1; i >= 0; i--) { + pn[i]++; + if (pn[i]) + break; + } + bip_ipn_swap(mmie->sequence_number, pn); + + bip_aad(skb, aad); + + /* + * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) + */ + ieee80211_aes_cmac(key->u.aes_cmac.tfm, key->u.aes_cmac.tx_crypto_buf, + aad, skb->data + 24, skb->len - 24, mmie->mic); + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_key *key = rx->key; + struct ieee80211_mmie *mmie; + u8 aad[20], mic[8], ipn[6]; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_CONTINUE; + + if ((rx->status->flag & RX_FLAG_DECRYPTED) && + (rx->status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + if (skb->len < 24 + sizeof(*mmie)) + return RX_DROP_UNUSABLE; + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return RX_DROP_UNUSABLE; /* Invalid MMIE */ + + bip_ipn_swap(ipn, mmie->sequence_number); + + if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { + key->u.aes_cmac.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(rx->status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + bip_aad(skb, aad); + ieee80211_aes_cmac(key->u.aes_cmac.tfm, + key->u.aes_cmac.rx_crypto_buf, aad, + skb->data + 24, skb->len - 24, mic); + if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) { + key->u.aes_cmac.icverrors++; + return RX_DROP_UNUSABLE; + } + } + + memcpy(key->u.aes_cmac.rx_pn, ipn, 6); + + /* Remove MMIE */ + skb_trim(skb, skb->len - sizeof(*mmie)); + + return RX_CONTINUE; +} diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index d42d221d8a1d..baba0608313e 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -28,4 +28,9 @@ ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx); +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); + #endif /* WPA_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c2bac9cd0caf..2562d05dbaf5 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -357,6 +357,45 @@ config NETFILTER_XT_TARGET_DSCP To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_HL + tristate '"HL" hoplimit target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HL" (for IPv6) and "TTL" (for IPv4) + targets, which enable the user to change the + hoplimit/time-to-live value of the IP header. + + While it is safe to decrement the hoplimit/TTL value, the + modules also allow to increment and set the hoplimit value of + the header to arbitrary values. This is EXTREMELY DANGEROUS + since you can easily create immortal packets that loop + forever on the network. + +config NETFILTER_XT_TARGET_LED + tristate '"LED" target support' + depends on LEDS_CLASS + depends on NETFILTER_ADVANCED + help + This option adds a `LED' target, which allows you to blink LEDs in + response to particular packets passing through your machine. + + This can be used to turn a spare LED into a network activity LED, + which only flashes in response to FTP transfers, for example. Or + you could have an LED which lights up for a minute or two every time + somebody connects to your machine via SSH. + + You will need support for the "led" class to make this work. + + To create an LED trigger for incoming SSH traffic: + iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000 + + Then attach the new trigger to an LED on your system: + echo netfilter-ssh > /sys/class/leds/<ledname>/trigger + + For more information on the LEDs available on your system, see + Documentation/leds-class.txt + config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' default m if NETFILTER_ADVANCED=n @@ -488,6 +527,22 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP This option adds a "TCPOPTSTRIP" target, which allows you to strip TCP options from TCP packets. +config NETFILTER_XT_MATCH_CLUSTER + tristate '"cluster" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to build work-load-sharing clusters of + network servers/stateful firewalls without having a dedicated + load-balancing router/server/switch. Basically, this match returns + true when the packet must be handled by this cluster node. Thus, + all nodes see all packets and this match decides which node handles + what packets. The work-load sharing algorithm is based on source + address hashing. + + If you say Y or M here, try `iptables -m cluster --help` for + more information. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_ADVANCED @@ -605,6 +660,14 @@ config NETFILTER_XT_MATCH_HELPER To compile it as a module, choose M here. If unsure, say Y. +config NETFILTER_XT_MATCH_HL + tristate '"hl" hoplimit/TTL match support' + depends on NETFILTER_ADVANCED + ---help--- + HL matching allows you to match packets based on the hoplimit + in the IPv6 header, or the time-to-live field in the IPv4 + header of the packet. + config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index da3d909e053f..6282060fbda9 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -45,6 +45,8 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o @@ -57,6 +59,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o # matches +obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o @@ -67,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a90ac83c5918..5bb34737501f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -174,7 +174,6 @@ next_hook: outdev, &elem, okfn, hook_thresh); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; - goto unlock; } else if (verdict == NF_DROP) { kfree_skb(skb); ret = -EPERM; @@ -183,7 +182,6 @@ next_hook: verdict >> NF_VERDICT_BITS)) goto next_hook; } -unlock: rcu_read_unlock(); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6be5d4efa51b..5c48378a852f 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -149,8 +149,8 @@ static struct task_struct *sync_backup_thread; /* multicast addr */ static struct sockaddr_in mcast_addr = { .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), }; diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 4f8fcf498545..07d9d8857e5d 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -177,7 +177,7 @@ static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { .me = THIS_MODULE, .help = amanda_help, .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(10080), + .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, .expect_policy = &amanda_exp_policy, }, @@ -186,7 +186,7 @@ static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { .me = THIS_MODULE, .help = amanda_help, .tuple.src.l3num = AF_INET6, - .tuple.src.u.udp.port = __constant_htons(10080), + .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, .expect_policy = &amanda_exp_policy, }, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f4935e344b61..dfb447b584da 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_lock); unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); -int nf_conntrack_max __read_mostly; +unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); struct nf_conn nf_conntrack_untracked __read_mostly; @@ -472,7 +472,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, struct nf_conn *ct; if (unlikely(!nf_conntrack_hash_rnd_initted)) { - get_random_bytes(&nf_conntrack_hash_rnd, 4); + get_random_bytes(&nf_conntrack_hash_rnd, + sizeof(nf_conntrack_hash_rnd)); nf_conntrack_hash_rnd_initted = 1; } @@ -516,16 +517,17 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); static void nf_conntrack_free_rcu(struct rcu_head *head) { struct nf_conn *ct = container_of(head, struct nf_conn, rcu); - struct net *net = nf_ct_net(ct); nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); - atomic_dec(&net->ct.count); } void nf_conntrack_free(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); + nf_ct_ext_destroy(ct); + atomic_dec(&net->ct.count); call_rcu(&ct->rcu, nf_conntrack_free_rcu); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -733,6 +735,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, nf_conntrack_put(skb->nfct); skb->nfct = NULL; NF_CT_STAT_INC_ATOMIC(net, invalid); + if (ret == -NF_DROP) + NF_CT_STAT_INC_ATOMIC(net, drop); return -ret; } @@ -1103,7 +1107,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) /* We have to rehahs for the new table anyway, so we also can * use a newrandom seed */ - get_random_bytes(&rnd, 4); + get_random_bytes(&rnd, sizeof(rnd)); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 3a8a34a6d37c..357ba39d4c8d 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -72,7 +72,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple unsigned int hash; if (unlikely(!nf_ct_expect_hash_rnd_initted)) { - get_random_bytes(&nf_ct_expect_hash_rnd, 4); + get_random_bytes(&nf_ct_expect_hash_rnd, + sizeof(nf_ct_expect_hash_rnd)); nf_ct_expect_hash_rnd_initted = 1; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 687bd633c3d7..66369490230e 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1167,7 +1167,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { .name = "Q.931", .me = THIS_MODULE, .tuple.src.l3num = AF_INET, - .tuple.src.u.tcp.port = __constant_htons(Q931_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, .help = q931_help, .expect_policy = &q931_exp_policy, @@ -1176,7 +1176,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { .name = "Q.931", .me = THIS_MODULE, .tuple.src.l3num = AF_INET6, - .tuple.src.u.tcp.port = __constant_htons(Q931_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, .help = q931_help, .expect_policy = &q931_exp_policy, @@ -1741,7 +1741,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { .name = "RAS", .me = THIS_MODULE, .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(RAS_PORT), + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, .help = ras_help, .expect_policy = &ras_exp_policy, @@ -1750,7 +1750,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { .name = "RAS", .me = THIS_MODULE, .tuple.src.l3num = AF_INET6, - .tuple.src.u.udp.port = __constant_htons(RAS_PORT), + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, .help = ras_help, .expect_policy = &ras_exp_policy, diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 5af4273b4668..8a3875e36ec2 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -105,7 +105,7 @@ static struct nf_conntrack_expect_policy exp_policy = { static struct nf_conntrack_helper helper __read_mostly = { .name = "netbios-ns", .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = __constant_htons(NMBD_PORT), + .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, .me = THIS_MODULE, .help = help, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ed6d873ad384..7a16bd462f82 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -518,6 +518,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, group, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } @@ -599,7 +600,8 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); if (likely(l3proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_IP_MAX, @@ -608,7 +610,7 @@ ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) ret = l3proto->nlattr_to_tuple(tb, tuple); } - nf_ct_l3proto_put(l3proto); + rcu_read_unlock(); return ret; } @@ -633,7 +635,8 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return -EINVAL; tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_PROTO_MAX, @@ -642,7 +645,7 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, ret = l4proto->nlattr_to_tuple(tb, tuple); } - nf_ct_l4proto_put(l4proto); + rcu_read_unlock(); return ret; } @@ -989,10 +992,11 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, struct nlattr *cda[]) nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, NULL); - l4proto = nf_ct_l4proto_find_get(nf_ct_l3num(ct), nf_ct_protonum(ct)); + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); - nf_ct_l4proto_put(l4proto); + rcu_read_unlock(); return err; } @@ -1062,6 +1066,10 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[]) { int err; + /* only allow NAT changes and master assignation for new conntracks */ + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) + return -EOPNOTSUPP; + if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) @@ -1124,13 +1132,11 @@ ctnetlink_event_report(struct nf_conn *ct, u32 pid, int report) report); } -static int +static struct nf_conn * ctnetlink_create_conntrack(struct nlattr *cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, - struct nf_conn *master_ct, - u32 pid, - int report) + u8 u3) { struct nf_conn *ct; int err = -EINVAL; @@ -1138,10 +1144,10 @@ ctnetlink_create_conntrack(struct nlattr *cda[], ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (!cda[CTA_TIMEOUT]) - goto err; + goto err1; ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); ct->timeout.expires = jiffies + ct->timeout.expires * HZ; @@ -1152,10 +1158,8 @@ ctnetlink_create_conntrack(struct nlattr *cda[], char *helpname; err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; helper = __nf_conntrack_helper_find_byname(helpname); if (helper == NULL) { @@ -1163,28 +1167,26 @@ ctnetlink_create_conntrack(struct nlattr *cda[], #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { err = -EOPNOTSUPP; - goto err; + goto err1; } rcu_read_lock(); helper = __nf_conntrack_helper_find_byname(helpname); if (helper) { - rcu_read_unlock(); err = -EAGAIN; - goto err; + goto err2; } rcu_read_unlock(); #endif err = -EOPNOTSUPP; - goto err; + goto err1; } else { struct nf_conn_help *help; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { - rcu_read_unlock(); err = -ENOMEM; - goto err; + goto err2; } /* not in hash table yet so not strictly necessary */ @@ -1193,44 +1195,34 @@ ctnetlink_create_conntrack(struct nlattr *cda[], } else { /* try an implicit helper assignation */ err = __nf_ct_try_assign_helper(ct, GFP_ATOMIC); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; } if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; } if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { err = ctnetlink_change_nat(ct, cda); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; } #ifdef CONFIG_NF_NAT_NEEDED if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { err = ctnetlink_change_nat_seq_adj(ct, cda); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; } #endif if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); - if (err < 0) { - rcu_read_unlock(); - goto err; - } + if (err < 0) + goto err2; } nf_ct_acct_ext_add(ct, GFP_ATOMIC); @@ -1241,23 +1233,37 @@ ctnetlink_create_conntrack(struct nlattr *cda[], #endif /* setup master conntrack: this is a confirmed expectation */ - if (master_ct) { + if (cda[CTA_TUPLE_MASTER]) { + struct nf_conntrack_tuple master; + struct nf_conntrack_tuple_hash *master_h; + struct nf_conn *master_ct; + + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + if (err < 0) + goto err2; + + master_h = __nf_conntrack_find(&init_net, &master); + if (master_h == NULL) { + err = -ENOENT; + goto err2; + } + master_ct = nf_ct_tuplehash_to_ctrack(master_h); + nf_conntrack_get(&master_ct->ct_general); __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } - nf_conntrack_get(&ct->ct_general); add_timer(&ct->timeout); nf_conntrack_hash_insert(ct); rcu_read_unlock(); - ctnetlink_event_report(ct, pid, report); - nf_ct_put(ct); - return 0; + return ct; -err: +err2: + rcu_read_unlock(); +err1: nf_conntrack_free(ct); - return err; + return ERR_PTR(err); } static int @@ -1289,38 +1295,25 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, h = __nf_conntrack_find(&init_net, &rtuple); if (h == NULL) { - struct nf_conntrack_tuple master; - struct nf_conntrack_tuple_hash *master_h = NULL; - struct nf_conn *master_ct = NULL; - - if (cda[CTA_TUPLE_MASTER]) { - err = ctnetlink_parse_tuple(cda, - &master, - CTA_TUPLE_MASTER, - u3); - if (err < 0) - goto out_unlock; + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) { + struct nf_conn *ct; - master_h = __nf_conntrack_find(&init_net, &master); - if (master_h == NULL) { - err = -ENOENT; + ct = ctnetlink_create_conntrack(cda, &otuple, + &rtuple, u3); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); goto out_unlock; } - master_ct = nf_ct_tuplehash_to_ctrack(master_h); - nf_conntrack_get(&master_ct->ct_general); - } - - err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) - err = ctnetlink_create_conntrack(cda, - &otuple, - &rtuple, - master_ct, - NETLINK_CB(skb).pid, - nlmsg_report(nlh)); - spin_unlock_bh(&nf_conntrack_lock); - if (err < 0 && master_ct) - nf_ct_put(master_ct); + err = 0; + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); + ctnetlink_event_report(ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_put(ct); + } else + spin_unlock_bh(&nf_conntrack_lock); return err; } @@ -1332,17 +1325,6 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { - err = -EOPNOTSUPP; - goto out_unlock; - } - /* can't link an existing conntrack to a master */ - if (cda[CTA_TUPLE_MASTER]) { - err = -EOPNOTSUPP; - goto out_unlock; - } - err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_get(&ct->ct_general); @@ -1533,6 +1515,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, nla_put_failure: rcu_read_unlock(); nlmsg_failure: + nfnetlink_set_err(0, 0, -ENOBUFS); kfree_skb(skb); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 9e169ef2e854..72cca638a82d 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -591,7 +591,7 @@ static struct nf_conntrack_helper pptp __read_mostly = { .name = "pptp", .me = THIS_MODULE, .tuple.src.l3num = AF_INET, - .tuple.src.u.tcp.port = __constant_htons(PPTP_CONTROL_PORT), + .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), .tuple.dst.protonum = IPPROTO_TCP, .help = conntrack_pptp_help, .destroy = pptp_destroy_siblings, diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 592d73344d46..9a62b4efa0e1 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -74,27 +74,6 @@ EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); /* this is guaranteed to always return a valid protocol helper, since * it falls back to generic_protocol */ -struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto) -{ - struct nf_conntrack_l4proto *p; - - rcu_read_lock(); - p = __nf_ct_l4proto_find(l3proto, l4proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); - -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); - struct nf_conntrack_l3proto * nf_ct_l3proto_find_get(u_int16_t l3proto) { diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 8fcf1762fabf..d3d5a7fd73ce 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -16,6 +16,9 @@ #include <linux/skbuff.h> #include <linux/dccp.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + #include <linux/netfilter/nfnetlink_conntrack.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> @@ -23,8 +26,6 @@ static DEFINE_RWLOCK(dccp_lock); -static int nf_ct_dccp_loose __read_mostly = 1; - /* Timeouts are based on values from RFC4340: * * - REQUEST: @@ -72,16 +73,6 @@ static int nf_ct_dccp_loose __read_mostly = 1; #define DCCP_MSL (2 * 60 * HZ) -static unsigned int dccp_timeout[CT_DCCP_MAX + 1] __read_mostly = { - [CT_DCCP_REQUEST] = 2 * DCCP_MSL, - [CT_DCCP_RESPOND] = 4 * DCCP_MSL, - [CT_DCCP_PARTOPEN] = 4 * DCCP_MSL, - [CT_DCCP_OPEN] = 12 * 3600 * HZ, - [CT_DCCP_CLOSEREQ] = 64 * HZ, - [CT_DCCP_CLOSING] = 64 * HZ, - [CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL, -}; - static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", @@ -393,6 +384,22 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; +/* this module per-net specifics */ +static int dccp_net_id; +struct dccp_net { + int dccp_loose; + unsigned int dccp_timeout[CT_DCCP_MAX + 1]; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; + struct ctl_table *sysctl_table; +#endif +}; + +static inline struct dccp_net *dccp_pernet(struct net *net) +{ + return net_generic(net, dccp_net_id); +} + static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { @@ -419,6 +426,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { struct net *net = nf_ct_net(ct); + struct dccp_net *dn; struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; @@ -429,7 +437,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: - if (nf_ct_dccp_loose == 0) { + dn = dccp_pernet(net); + if (dn->dccp_loose == 0) { msg = "nf_ct_dccp: not picking up existing connection "; goto out_invalid; } @@ -465,6 +474,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, u_int8_t pf, unsigned int hooknum) { struct net *net = nf_ct_net(ct); + struct dccp_net *dn; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -542,7 +552,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; ct->proto.dccp.state = new_state; write_unlock_bh(&dccp_lock); - nf_ct_refresh_acct(ct, ctinfo, skb, dccp_timeout[new_state]); + + dn = dccp_pernet(net); + nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]); return NF_ACCEPT; } @@ -660,13 +672,11 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) #endif #ifdef CONFIG_SYSCTL -static unsigned int dccp_sysctl_table_users; -static struct ctl_table_header *dccp_sysctl_header; -static ctl_table dccp_sysctl_table[] = { +/* template, data assigned later */ +static struct ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_request", - .data = &dccp_timeout[CT_DCCP_REQUEST], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -674,7 +684,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_respond", - .data = &dccp_timeout[CT_DCCP_RESPOND], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -682,7 +691,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_partopen", - .data = &dccp_timeout[CT_DCCP_PARTOPEN], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -690,7 +698,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_open", - .data = &dccp_timeout[CT_DCCP_OPEN], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -698,7 +705,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_closereq", - .data = &dccp_timeout[CT_DCCP_CLOSEREQ], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -706,7 +712,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_closing", - .data = &dccp_timeout[CT_DCCP_CLOSING], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -714,7 +719,6 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_timeout_timewait", - .data = &dccp_timeout[CT_DCCP_TIMEWAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -722,8 +726,7 @@ static ctl_table dccp_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_dccp_loose", - .data = &nf_ct_dccp_loose, - .maxlen = sizeof(nf_ct_dccp_loose), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, @@ -751,11 +754,6 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_users = &dccp_sysctl_table_users, - .ctl_table_header = &dccp_sysctl_header, - .ctl_table = dccp_sysctl_table, -#endif }; static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { @@ -776,34 +774,107 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif +}; + +static __net_init int dccp_net_init(struct net *net) +{ + struct dccp_net *dn; + int err; + + dn = kmalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return -ENOMEM; + + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + + err = net_assign_generic(net, dccp_net_id, dn); + if (err) + goto out; + #ifdef CONFIG_SYSCTL - .ctl_table_users = &dccp_sysctl_table_users, - .ctl_table_header = &dccp_sysctl_header, - .ctl_table = dccp_sysctl_table, + err = -ENOMEM; + dn->sysctl_table = kmemdup(dccp_sysctl_table, + sizeof(dccp_sysctl_table), GFP_KERNEL); + if (!dn->sysctl_table) + goto out; + + dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; + dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; + dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; + dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; + dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; + dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; + dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; + dn->sysctl_table[7].data = &dn->dccp_loose; + + dn->sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, dn->sysctl_table); + if (!dn->sysctl_header) { + kfree(dn->sysctl_table); + goto out; + } #endif + + return 0; + +out: + kfree(dn); + return err; +} + +static __net_exit void dccp_net_exit(struct net *net) +{ + struct dccp_net *dn = dccp_pernet(net); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(dn->sysctl_header); + kfree(dn->sysctl_table); +#endif + kfree(dn); + + net_assign_generic(net, dccp_net_id, NULL); +} + +static struct pernet_operations dccp_net_ops = { + .init = dccp_net_init, + .exit = dccp_net_exit, }; static int __init nf_conntrack_proto_dccp_init(void) { int err; - err = nf_conntrack_l4proto_register(&dccp_proto4); + err = register_pernet_gen_subsys(&dccp_net_id, &dccp_net_ops); if (err < 0) goto err1; - err = nf_conntrack_l4proto_register(&dccp_proto6); + err = nf_conntrack_l4proto_register(&dccp_proto4); if (err < 0) goto err2; + + err = nf_conntrack_l4proto_register(&dccp_proto6); + if (err < 0) + goto err3; return 0; -err2: +err3: nf_conntrack_l4proto_unregister(&dccp_proto4); +err2: + unregister_pernet_gen_subsys(dccp_net_id, &dccp_net_ops); err1: return err; } static void __exit nf_conntrack_proto_dccp_fini(void) { + unregister_pernet_gen_subsys(dccp_net_id, &dccp_net_ops); nf_conntrack_l4proto_unregister(&dccp_proto6); nf_conntrack_l4proto_unregister(&dccp_proto4); } diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 4be80d7b8795..829374f426c4 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -92,7 +92,7 @@ static struct ctl_table generic_compat_sysctl_table[] = { struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .l4proto = 0, + .l4proto = 255, .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 56ac4ee77a1d..0aeb8b09a1f7 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -26,6 +26,8 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* Protects ct->proto.tcp */ static DEFINE_RWLOCK(tcp_lock); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 2b8b1f579f93..d4021179e24e 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -22,6 +22,8 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index fa8ae5d2659c..8bb998fe098b 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -14,58 +14,63 @@ LOG target modules */ #define NF_LOG_PREFIXLEN 128 +#define NFLOGGER_NAME_LEN 64 static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; +static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); -/* return EBUSY if somebody else is registered, EEXIST if the same logger - * is registred, 0 on success. */ -int nf_log_register(u_int8_t pf, const struct nf_logger *logger) +static struct nf_logger *__find_logger(int pf, const char *str_logger) { - int ret; + struct nf_logger *t; - if (pf >= ARRAY_SIZE(nf_loggers)) - return -EINVAL; - - /* Any setup of logging members must be done before - * substituting pointer. */ - ret = mutex_lock_interruptible(&nf_log_mutex); - if (ret < 0) - return ret; - - if (!nf_loggers[pf]) - rcu_assign_pointer(nf_loggers[pf], logger); - else if (nf_loggers[pf] == logger) - ret = -EEXIST; - else - ret = -EBUSY; + list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { + if (!strnicmp(str_logger, t->name, strlen(t->name))) + return t; + } - mutex_unlock(&nf_log_mutex); - return ret; + return NULL; } -EXPORT_SYMBOL(nf_log_register); -void nf_log_unregister_pf(u_int8_t pf) +/* return EEXIST if the same logger is registred, 0 on success. */ +int nf_log_register(u_int8_t pf, struct nf_logger *logger) { + const struct nf_logger *llog; + if (pf >= ARRAY_SIZE(nf_loggers)) - return; + return -EINVAL; + mutex_lock(&nf_log_mutex); - rcu_assign_pointer(nf_loggers[pf], NULL); + + if (pf == NFPROTO_UNSPEC) { + int i; + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + } else { + /* register at end of list to honor first register win */ + list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); + llog = rcu_dereference(nf_loggers[pf]); + if (llog == NULL) + rcu_assign_pointer(nf_loggers[pf], logger); + } + mutex_unlock(&nf_log_mutex); - /* Give time to concurrent readers. */ - synchronize_rcu(); + return 0; } -EXPORT_SYMBOL(nf_log_unregister_pf); +EXPORT_SYMBOL(nf_log_register); -void nf_log_unregister(const struct nf_logger *logger) +void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *c_logger; int i; mutex_lock(&nf_log_mutex); for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { - if (nf_loggers[i] == logger) + c_logger = rcu_dereference(nf_loggers[i]); + if (c_logger == logger) rcu_assign_pointer(nf_loggers[i], NULL); + list_del(&logger->list[i]); } mutex_unlock(&nf_log_mutex); @@ -73,6 +78,27 @@ void nf_log_unregister(const struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); +int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +{ + mutex_lock(&nf_log_mutex); + if (__find_logger(pf, logger->name) == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[pf], logger); + mutex_unlock(&nf_log_mutex); + return 0; +} +EXPORT_SYMBOL(nf_log_bind_pf); + +void nf_log_unbind_pf(u_int8_t pf) +{ + mutex_lock(&nf_log_mutex); + rcu_assign_pointer(nf_loggers[pf], NULL); + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_unbind_pf); + void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, @@ -129,13 +155,37 @@ static int seq_show(struct seq_file *s, void *v) { loff_t *pos = v; const struct nf_logger *logger; + struct nf_logger *t; + int ret; logger = rcu_dereference(nf_loggers[*pos]); if (!logger) - return seq_printf(s, "%2lld NONE\n", *pos); + ret = seq_printf(s, "%2lld NONE (", *pos); + else + ret = seq_printf(s, "%2lld %s (", *pos, logger->name); + + if (ret < 0) + return ret; + + mutex_lock(&nf_log_mutex); + list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { + ret = seq_printf(s, "%s", t->name); + if (ret < 0) { + mutex_unlock(&nf_log_mutex); + return ret; + } + if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + ret = seq_printf(s, ","); + if (ret < 0) { + mutex_unlock(&nf_log_mutex); + return ret; + } + } + } + mutex_unlock(&nf_log_mutex); - return seq_printf(s, "%2lld %s\n", *pos, logger->name); + return seq_printf(s, ")\n"); } static const struct seq_operations nflog_seq_ops = { @@ -158,15 +208,102 @@ static const struct file_operations nflog_file_ops = { .release = seq_release, }; + #endif /* PROC_FS */ +#ifdef CONFIG_SYSCTL +struct ctl_path nf_log_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "netfilter", .ctl_name = NET_NETFILTER, }, + { .procname = "nf_log", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + +static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table_header *nf_log_dir_header; + +static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp, loff_t *ppos) +{ + const struct nf_logger *logger; + int r = 0; + int tindex = (unsigned long)table->extra1; + + if (write) { + if (!strcmp(buffer, "NONE")) { + nf_log_unbind_pf(tindex); + return 0; + } + mutex_lock(&nf_log_mutex); + logger = __find_logger(tindex, buffer); + if (logger == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[tindex], logger); + mutex_unlock(&nf_log_mutex); + } else { + rcu_read_lock(); + logger = rcu_dereference(nf_loggers[tindex]); + if (!logger) + table->data = "NONE"; + else + table->data = logger->name; + r = proc_dostring(table, write, filp, buffer, lenp, ppos); + rcu_read_unlock(); + } + + return r; +} + +static __init int netfilter_log_sysctl_init(void) +{ + int i; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); + nf_log_sysctl_table[i].ctl_name = CTL_UNNUMBERED; + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + } + + nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path, + nf_log_sysctl_table); + if (!nf_log_dir_header) + return -ENOMEM; + + return 0; +} +#else +static __init int netfilter_log_sysctl_init(void) +{ + return 0; +} +#endif /* CONFIG_SYSCTL */ int __init netfilter_log_init(void) { + int i, r; #ifdef CONFIG_PROC_FS if (!proc_create("nf_log", S_IRUGO, proc_net_netfilter, &nflog_file_ops)) return -1; #endif + + /* Errors will trigger panic, unroll on error is unnecessary. */ + r = netfilter_log_sysctl_init(); + if (r < 0) + return r; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&(nf_loggers_l[i])); + return 0; } diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index cdc97f3105a3..5490fc37c92d 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -71,6 +71,7 @@ int nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) { if (inet_sk(sk)->transparent) { + skb_orphan(skb); skb->sk = sk; skb->destructor = nf_tproxy_destructor; return 1; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 9c0ba17a1ddb..2785d66a7e38 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -113,6 +113,12 @@ int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) } EXPORT_SYMBOL_GPL(nfnetlink_send); +void nfnetlink_set_err(u32 pid, u32 group, int error) +{ + netlink_set_err(nfnl, pid, group, error); +} +EXPORT_SYMBOL_GPL(nfnetlink_set_err); + int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) { return netlink_unicast(nfnl, skb, pid, flags); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index c712e9fc6bba..fd326ac27ec8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -693,7 +693,7 @@ nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, return -ENOTSUPP; } -static const struct nf_logger nfulnl_logger = { +static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .logfn = &nfulnl_log_packet, .me = THIS_MODULE, @@ -725,9 +725,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_register(pf, &nfulnl_logger); + return nf_log_bind_pf(pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unregister_pf(pf); + nf_log_unbind_pf(pf); return 0; } } @@ -952,17 +952,25 @@ static int __init nfnetlink_log_init(void) goto cleanup_netlink_notifier; } + status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); + if (status < 0) { + printk(KERN_ERR "log: failed to register logger\n"); + goto cleanup_subsys; + } + #ifdef CONFIG_PROC_FS if (!proc_create("nfnetlink_log", 0440, proc_net_netfilter, &nful_file_ops)) - goto cleanup_subsys; + goto cleanup_logger; #endif return status; #ifdef CONFIG_PROC_FS +cleanup_logger: + nf_log_unregister(&nfulnl_logger); +#endif cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); -#endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); return status; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 5baccfa5a0de..509a95621f9f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); +void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, + struct xt_table_info *newinfo) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *p = oldinfo->entries[cpu]; + rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); + newinfo->entries[cpu] = p; + } + +} +EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table, struct xt_table_info *oldinfo, *private; /* Do the substitution. */ - write_lock_bh(&table->lock); + mutex_lock(&table->lock); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); *error = -EAGAIN; return NULL; } oldinfo = private; - table->private = newinfo; + rcu_assign_pointer(table->private, newinfo); newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); + synchronize_net(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. */ table->private = bootstrap; - rwlock_init(&table->lock); + mutex_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c new file mode 100644 index 000000000000..10e789e2d12a --- /dev/null +++ b/net/netfilter/xt_HL.c @@ -0,0 +1,171 @@ +/* + * TTL modification target for IP tables + * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> + * + * Hop Limit modification target for ip6tables + * Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/checksum.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_TTL.h> +#include <linux/netfilter_ipv6/ip6t_HL.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); +MODULE_LICENSE("GPL"); + +static unsigned int +ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = par->targinfo; + int new_ttl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + iph = ip_hdr(skb); + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + csum_replace2(&iph->check, htons(iph->ttl << 8), + htons(new_ttl << 8)); + iph->ttl = new_ttl; + } + + return XT_CONTINUE; +} + +static unsigned int +hl_tg6(struct sk_buff *skb, const struct xt_target_param *par) +{ + struct ipv6hdr *ip6h; + const struct ip6t_HL_info *info = par->targinfo; + int new_hl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_SET: + new_hl = info->hop_limit; + break; + case IP6T_HL_INC: + new_hl = ip6h->hop_limit + info->hop_limit; + if (new_hl > 255) + new_hl = 255; + break; + case IP6T_HL_DEC: + new_hl = ip6h->hop_limit - info->hop_limit; + if (new_hl < 0) + new_hl = 0; + break; + default: + new_hl = ip6h->hop_limit; + break; + } + + ip6h->hop_limit = new_hl; + + return XT_CONTINUE; +} + +static bool ttl_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_TTL_info *info = par->targinfo; + + if (info->mode > IPT_TTL_MAXMODE) { + printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", + info->mode); + return false; + } + if (info->mode != IPT_TTL_SET && info->ttl == 0) + return false; + return true; +} + +static bool hl_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_HL_info *info = par->targinfo; + + if (info->mode > IP6T_HL_MAXMODE) { + printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", + info->mode); + return false; + } + if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { + printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " + "make sense with value 0\n"); + return false; + } + return true; +} + +static struct xt_target hl_tg_reg[] __read_mostly = { + { + .name = "TTL", + .revision = 0, + .family = NFPROTO_IPV4, + .target = ttl_tg, + .targetsize = sizeof(struct ipt_TTL_info), + .table = "mangle", + .checkentry = ttl_tg_check, + .me = THIS_MODULE, + }, + { + .name = "HL", + .revision = 0, + .family = NFPROTO_IPV6, + .target = hl_tg6, + .targetsize = sizeof(struct ip6t_HL_info), + .table = "mangle", + .checkentry = hl_tg6_check, + .me = THIS_MODULE, + }, +}; + +static int __init hl_tg_init(void) +{ + return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); +} + +static void __exit hl_tg_exit(void) +{ + xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); +} + +module_init(hl_tg_init); +module_exit(hl_tg_exit); +MODULE_ALIAS("ipt_TTL"); +MODULE_ALIAS("ip6t_HL"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c new file mode 100644 index 000000000000..8ff7843bb921 --- /dev/null +++ b/net/netfilter/xt_LED.c @@ -0,0 +1,161 @@ +/* + * xt_LED.c - netfilter target to make LEDs blink upon packet matches + * + * Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA. + * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/leds.h> +#include <linux/mutex.h> + +#include <linux/netfilter/xt_LED.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>"); +MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); + +/* + * This is declared in here (the kernel module) only, to avoid having these + * dependencies in userspace code. This is what xt_led_info.internal_data + * points to. + */ +struct xt_led_info_internal { + struct led_trigger netfilter_led_trigger; + struct timer_list timer; +}; + +static unsigned int +led_tg(struct sk_buff *skb, const struct xt_target_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + /* + * If "always blink" is enabled, and there's still some time until the + * LED will switch off, briefly switch it off now. + */ + if ((ledinfo->delay > 0) && ledinfo->always_blink && + timer_pending(&ledinternal->timer)) + led_trigger_event(&ledinternal->netfilter_led_trigger,LED_OFF); + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + + /* If there's a positive delay, start/update the timer */ + if (ledinfo->delay > 0) { + mod_timer(&ledinternal->timer, + jiffies + msecs_to_jiffies(ledinfo->delay)); + + /* Otherwise if there was no delay given, blink as fast as possible */ + } else if (ledinfo->delay == 0) { + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + } + + /* else the delay is negative, which means switch on and stay on */ + + return XT_CONTINUE; +} + +static void led_timeout_callback(unsigned long data) +{ + struct xt_led_info *ledinfo = (struct xt_led_info *)data; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); +} + +static bool led_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal; + int err; + + if (ledinfo->id[0] == '\0') { + printk(KERN_ERR KBUILD_MODNAME ": No 'id' parameter given.\n"); + return false; + } + + ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + if (!ledinternal) { + printk(KERN_CRIT KBUILD_MODNAME ": out of memory\n"); + return false; + } + + ledinternal->netfilter_led_trigger.name = ledinfo->id; + + err = led_trigger_register(&ledinternal->netfilter_led_trigger); + if (err) { + printk(KERN_CRIT KBUILD_MODNAME + ": led_trigger_register() failed\n"); + if (err == -EEXIST) + printk(KERN_ERR KBUILD_MODNAME + ": Trigger name is already in use.\n"); + goto exit_alloc; + } + + /* See if we need to set up a timer */ + if (ledinfo->delay > 0) + setup_timer(&ledinternal->timer, led_timeout_callback, + (unsigned long)ledinfo); + + ledinfo->internal_data = ledinternal; + + return true; + +exit_alloc: + kfree(ledinternal); + + return false; +} + +static void led_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + if (ledinfo->delay > 0) + del_timer_sync(&ledinternal->timer); + + led_trigger_unregister(&ledinternal->netfilter_led_trigger); + kfree(ledinternal); +} + +static struct xt_target led_tg_reg __read_mostly = { + .name = "LED", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = led_tg, + .targetsize = XT_ALIGN(sizeof(struct xt_led_info)), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, +}; + +static int __init led_tg_init(void) +{ + return xt_register_target(&led_tg_reg); +} + +static void __exit led_tg_exit(void) +{ + xt_unregister_target(&led_tg_reg); +} + +module_init(led_tg_init); +module_exit(led_tg_exit); diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c new file mode 100644 index 000000000000..ad5bd890e4e8 --- /dev/null +++ b/net/netfilter/xt_cluster.c @@ -0,0 +1,164 @@ +/* + * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/xt_cluster.h> + +static inline u_int32_t nf_ct_orig_ipv4_src(const struct nf_conn *ct) +{ + return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; +} + +static inline const void *nf_ct_orig_ipv6_src(const struct nf_conn *ct) +{ + return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; +} + +static inline u_int32_t +xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) +{ + return jhash_1word(ip, info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) +{ + return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash(const struct nf_conn *ct, + const struct xt_cluster_match_info *info) +{ + u_int32_t hash = 0; + + switch(nf_ct_l3num(ct)) { + case AF_INET: + hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); + break; + case AF_INET6: + hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); + break; + default: + WARN_ON(1); + break; + } + return (((u64)hash * info->total_nodes) >> 32); +} + +static inline bool +xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) +{ + bool is_multicast = false; + + switch(family) { + case NFPROTO_IPV4: + is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); + break; + case NFPROTO_IPV6: + is_multicast = ipv6_addr_type(&ipv6_hdr(skb)->daddr) & + IPV6_ADDR_MULTICAST; + break; + default: + WARN_ON(1); + break; + } + return is_multicast; +} + +static bool +xt_cluster_mt(const struct sk_buff *skb, const struct xt_match_param *par) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + const struct xt_cluster_match_info *info = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned long hash; + + /* This match assumes that all nodes see the same packets. This can be + * achieved if the switch that connects the cluster nodes support some + * sort of 'port mirroring'. However, if your switch does not support + * this, your cluster nodes can reply ARP request using a multicast MAC + * address. Thus, your switch will flood the same packets to the + * cluster nodes with the same multicast MAC address. Using a multicast + * link address is a RFC 1812 (section 3.3.2) violation, but this works + * fine in practise. + * + * Unfortunately, if you use the multicast MAC address, the link layer + * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted + * by TCP and others for packets coming to this node. For that reason, + * this match mangles skbuff's pkt_type if it detects a packet + * addressed to a unicast address but using PACKET_MULTICAST. Yes, I + * know, matches should not alter packets, but we are doing this here + * because we would need to add a PKTTYPE target for this sole purpose. + */ + if (!xt_cluster_is_multicast_addr(skb, par->family) && + skb->pkt_type == PACKET_MULTICAST) { + pskb->pkt_type = PACKET_HOST; + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + if (ct == &nf_conntrack_untracked) + return false; + + if (ct->master) + hash = xt_cluster_hash(ct->master, info); + else + hash = xt_cluster_hash(ct, info); + + return !!((1 << hash) & info->node_mask) ^ + !!(info->flags & XT_CLUSTER_F_INV); +} + +static bool xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_cluster_match_info *info = par->matchinfo; + + if (info->node_mask >= (1 << info->total_nodes)) { + printk(KERN_ERR "xt_cluster: this node mask cannot be " + "higher than the total number of nodes\n"); + return false; + } + return true; +} + +static struct xt_match xt_cluster_match __read_mostly = { + .name = "cluster", + .family = NFPROTO_UNSPEC, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_cluster_mt_init(void) +{ + return xt_register_match(&xt_cluster_match); +} + +static void __exit xt_cluster_mt_fini(void) +{ + xt_unregister_match(&xt_cluster_match); +} + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: hash-based cluster match"); +MODULE_ALIAS("ipt_cluster"); +MODULE_ALIAS("ip6t_cluster"); +module_init(xt_cluster_mt_init); +module_exit(xt_cluster_mt_fini); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index f97fded024c4..a5b5369c30f9 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -149,7 +149,7 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht, /* initialize hash with random val at the time we allocate * the first hashtable entry */ if (!ht->rnd_initialized) { - get_random_bytes(&ht->rnd, 4); + get_random_bytes(&ht->rnd, sizeof(ht->rnd)); ht->rnd_initialized = 1; } @@ -565,8 +565,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, static bool hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_hashlimit_info *r = - ((const struct xt_hashlimit_info *)par->matchinfo)->u.master; + const struct xt_hashlimit_info *r = par->matchinfo; struct xt_hashlimit_htable *hinfo = r->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; @@ -702,8 +701,6 @@ static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par) } mutex_unlock(&hlimit_mutex); - /* Ugly hack: For SMP, we only want to use one set */ - r->u.master = r; return true; } diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c new file mode 100644 index 000000000000..7726154c87b2 --- /dev/null +++ b/net/netfilter/xt_hl.c @@ -0,0 +1,108 @@ +/* + * IP tables module for matching the value of the TTL + * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> + * + * Hop Limit matching module + * (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_ttl.h> +#include <linux/netfilter_ipv6/ip6t_hl.h> + +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ttl"); +MODULE_ALIAS("ip6t_hl"); + +static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ipt_ttl_info *info = par->matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; + + switch (info->mode) { + case IPT_TTL_EQ: + return ttl == info->ttl; + case IPT_TTL_NE: + return ttl != info->ttl; + case IPT_TTL_LT: + return ttl < info->ttl; + case IPT_TTL_GT: + return ttl > info->ttl; + default: + printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", + info->mode); + return false; + } + + return false; +} + +static bool hl_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +{ + const struct ip6t_hl_info *info = par->matchinfo; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_EQ: + return ip6h->hop_limit == info->hop_limit; + break; + case IP6T_HL_NE: + return ip6h->hop_limit != info->hop_limit; + break; + case IP6T_HL_LT: + return ip6h->hop_limit < info->hop_limit; + break; + case IP6T_HL_GT: + return ip6h->hop_limit > info->hop_limit; + break; + default: + printk(KERN_WARNING "ip6t_hl: unknown mode %d\n", + info->mode); + return false; + } + + return false; +} + +static struct xt_match hl_mt_reg[] __read_mostly = { + { + .name = "ttl", + .revision = 0, + .family = NFPROTO_IPV4, + .match = ttl_mt, + .matchsize = sizeof(struct ipt_ttl_info), + .me = THIS_MODULE, + }, + { + .name = "hl", + .revision = 0, + .family = NFPROTO_IPV6, + .match = hl_mt6, + .matchsize = sizeof(struct ip6t_hl_info), + .me = THIS_MODULE, + }, +}; + +static int __init hl_mt_init(void) +{ + return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +static void __exit hl_mt_exit(void) +{ + xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +module_init(hl_mt_init); +module_exit(hl_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index c908d69a5595..2e8089ecd0af 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -14,6 +14,11 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> +struct xt_limit_priv { + unsigned long prev; + uint32_t credit; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); MODULE_DESCRIPTION("Xtables: rate-limit match"); @@ -60,18 +65,18 @@ static DEFINE_SPINLOCK(limit_lock); static bool limit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_rateinfo *r = - ((const struct xt_rateinfo *)par->matchinfo)->master; + const struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv = r->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); - r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; - if (r->credit > r->credit_cap) - r->credit = r->credit_cap; + priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + if (priv->credit > r->credit_cap) + priv->credit = r->credit_cap; - if (r->credit >= r->cost) { + if (priv->credit >= r->cost) { /* We're not limited. */ - r->credit -= r->cost; + priv->credit -= r->cost; spin_unlock_bh(&limit_lock); return true; } @@ -95,6 +100,7 @@ user2credits(u_int32_t user) static bool limit_mt_check(const struct xt_mtchk_param *par) { struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv; /* Check for overflow. */ if (r->burst == 0 @@ -104,19 +110,30 @@ static bool limit_mt_check(const struct xt_mtchk_param *par) return false; } - /* For SMP, we only want to use one set of counters. */ - r->master = r; + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + /* For SMP, we only want to use one set of state. */ + r->master = priv; if (r->cost == 0) { /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */ - r->prev = jiffies; - r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ r->cost = user2credits(r->avg); } return true; } +static void limit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_rateinfo *info = par->matchinfo; + + kfree(info->master); +} + #ifdef CONFIG_COMPAT struct compat_xt_rateinfo { u_int32_t avg; @@ -167,6 +184,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .family = NFPROTO_UNSPEC, .match = limit_mt, .checkentry = limit_mt_check, + .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 1bcdfc12cf59..44a234ef4439 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -20,13 +20,30 @@ MODULE_DESCRIPTION("Xtables: Bridge physical device match"); MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); +static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) +{ + const unsigned long *a = (const unsigned long *)_a; + const unsigned long *b = (const unsigned long *)_b; + const unsigned long *mask = (const unsigned long *)_mask; + unsigned long ret; + + ret = (a[0] ^ b[0]) & mask[0]; + if (IFNAMSIZ > sizeof(unsigned long)) + ret |= (a[1] ^ b[1]) & mask[1]; + if (IFNAMSIZ > 2 * sizeof(unsigned long)) + ret |= (a[2] ^ b[2]) & mask[2]; + if (IFNAMSIZ > 3 * sizeof(unsigned long)) + ret |= (a[3] ^ b[3]) & mask[3]; + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + return ret; +} + static bool physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - int i; - static const char nulldevname[IFNAMSIZ]; + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct xt_physdev_info *info = par->matchinfo; - bool ret; + unsigned long ret; const char *indev, *outdev; const struct nf_bridge_info *nf_bridge; @@ -68,11 +85,7 @@ physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (!(info->bitmask & XT_PHYSDEV_OP_IN)) goto match_outdev; indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; - for (i = 0, ret = false; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)indev)[i] - ^ ((const unsigned int *)info->physindev)[i]) - & ((const unsigned int *)info->in_mask)[i]; - } + ret = ifname_compare(indev, info->physindev, info->in_mask); if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) return false; @@ -82,13 +95,9 @@ match_outdev: return true; outdev = nf_bridge->physoutdev ? nf_bridge->physoutdev->name : nulldevname; - for (i = 0, ret = false; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)outdev)[i] - ^ ((const unsigned int *)info->physoutdev)[i]) - & ((const unsigned int *)info->out_mask)[i]; - } + ret = ifname_compare(outdev, info->physoutdev, info->out_mask); - return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT); + return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } static bool physdev_mt_check(const struct xt_mtchk_param *par) diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index c84fce5e0f3e..01dd07b764ec 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -9,6 +9,10 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_quota.h> +struct xt_quota_priv { + uint64_t quota; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); MODULE_DESCRIPTION("Xtables: countdown quota match"); @@ -20,18 +24,20 @@ static DEFINE_SPINLOCK(quota_lock); static bool quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_quota_info *q = - ((const struct xt_quota_info *)par->matchinfo)->master; + struct xt_quota_info *q = (void *)par->matchinfo; + struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh("a_lock); - if (q->quota >= skb->len) { - q->quota -= skb->len; + if (priv->quota >= skb->len) { + priv->quota -= skb->len; ret = !ret; } else { /* we do not allow even small packets from now on */ - q->quota = 0; + priv->quota = 0; } + /* Copy quota back to matchinfo so that iptables can display it */ + q->quota = priv->quota; spin_unlock_bh("a_lock); return ret; @@ -43,17 +49,28 @@ static bool quota_mt_check(const struct xt_mtchk_param *par) if (q->flags & ~XT_QUOTA_MASK) return false; - /* For SMP, we only want to use one set of counters. */ - q->master = q; + + q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + if (q->master == NULL) + return -ENOMEM; + return true; } +static void quota_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_quota_info *q = par->matchinfo; + + kfree(q->master); +} + static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, + .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 0d75141139d5..d8c0f8f1a78e 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -16,6 +16,10 @@ #include <linux/netfilter/xt_statistic.h> #include <linux/netfilter/x_tables.h> +struct xt_statistic_priv { + uint32_t count; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); @@ -27,7 +31,7 @@ static DEFINE_SPINLOCK(nth_lock); static bool statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_statistic_info *info = (void *)par->matchinfo; + const struct xt_statistic_info *info = par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; switch (info->mode) { @@ -36,10 +40,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: - info = info->master; spin_lock_bh(&nth_lock); - if (info->u.nth.count++ == info->u.nth.every) { - info->u.nth.count = 0; + if (info->master->count++ == info->u.nth.every) { + info->master->count = 0; ret = !ret; } spin_unlock_bh(&nth_lock); @@ -56,16 +59,31 @@ static bool statistic_mt_check(const struct xt_mtchk_param *par) if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) return false; - info->master = info; + + info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + if (info->master == NULL) { + printk(KERN_ERR KBUILD_MODNAME ": Out of memory\n"); + return false; + } + info->master->count = info->u.nth.count; + return true; } +static void statistic_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + + kfree(info->master); +} + static struct xt_match xt_statistic_mt_reg __read_mostly = { .name = "statistic", .revision = 0, .family = NFPROTO_UNSPEC, .match = statistic_mt, .checkentry = statistic_mt_check, + .destroy = statistic_mt_destroy, .matchsize = sizeof(struct xt_statistic_info), .me = THIS_MODULE, }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3ae3cb816563..8b6bbb3032b0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -85,6 +85,8 @@ struct netlink_sock { #define NETLINK_KERNEL_SOCKET 0x1 #define NETLINK_RECV_PKTINFO 0x2 +#define NETLINK_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_RECV_NO_ENOBUFS 0x8 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -716,10 +718,15 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, static void netlink_overrun(struct sock *sk) { - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } } + atomic_inc(&sk->sk_drops); } static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) @@ -950,6 +957,7 @@ struct netlink_broadcast_data { u32 pid; u32 group; int failure; + int delivery_failure; int congested; int delivered; gfp_t allocation; @@ -994,11 +1002,15 @@ static inline int do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { netlink_overrun(sk); + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; @@ -1025,6 +1037,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, info.pid = pid; info.group = group; info.failure = 0; + info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; @@ -1042,16 +1055,16 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, netlink_unlock_table(); - if (info.skb2) - kfree_skb(info.skb2); + kfree_skb(info.skb2); + + if (info.delivery_failure) + return -ENOBUFS; if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT)) yield(); return 0; } - if (info.failure) - return -ENOBUFS; return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast); @@ -1110,6 +1123,7 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) read_unlock(&nl_table_lock); } +EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, @@ -1167,6 +1181,22 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, err = 0; break; } + case NETLINK_BROADCAST_ERROR: + if (val) + nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + else + nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + err = 0; + break; + case NETLINK_NO_ENOBUFS: + if (val) { + nlk->flags |= NETLINK_RECV_NO_ENOBUFS; + clear_bit(0, &nlk->state); + wake_up_interruptible(&nlk->wait); + } else + nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1199,6 +1229,26 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_BROADCAST_ERROR: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + case NETLINK_NO_ENOBUFS: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1525,8 +1575,7 @@ EXPORT_SYMBOL(netlink_set_nonroot); static void netlink_destroy_callback(struct netlink_callback *cb) { - if (cb->skb) - kfree_skb(cb->skb); + kfree_skb(cb->skb); kfree(cb); } @@ -1743,12 +1792,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, exclude_pid = pid; } - /* errors reported via destination sk->sk_err */ - nlmsg_multicast(sk, skb, exclude_pid, group, flags); + /* errors reported via destination sk->sk_err, but propagate + * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ + err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); } - if (report) - err = nlmsg_unicast(sk, skb, pid); + if (report) { + int err2; + + err2 = nlmsg_unicast(sk, skb, pid); + if (!err || err == -ESRCH) + err = err2; + } return err; } @@ -1849,12 +1904,12 @@ static int netlink_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "sk Eth Pid Groups " - "Rmem Wmem Dump Locks\n"); + "Rmem Wmem Dump Locks Drops\n"); else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); - seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", + seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d\n", s, s->sk_protocol, nlk->pid, @@ -1862,7 +1917,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, - atomic_read(&s->sk_refcnt) + atomic_read(&s->sk_refcnt), + atomic_read(&s->sk_drops) ); } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e9c05b8f4f45..6d9c58ec56ac 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1037,6 +1037,10 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, unsigned char *asmptr; int size; + /* Netrom empty data frame has no meaning : don't send */ + if (len == 0) + return 0; + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; @@ -1167,6 +1171,11 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); copied = skb->len; + /* NetRom empty data frame has no meaning : ignore it */ + if (copied == 0) { + goto out; + } + if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; @@ -1182,7 +1191,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = sizeof(*sax); - skb_free_datagram(sk, skb); +out: skb_free_datagram(sk, skb); release_sock(sk); return copied; @@ -1432,7 +1441,7 @@ static int __init nr_proto_init(void) struct net_device *dev; sprintf(name, "nr%d", i); - dev = alloc_netdev(sizeof(struct nr_private), name, nr_setup); + dev = alloc_netdev(0, name, nr_setup); if (!dev) { printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); goto fail; diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 6caf459665f2..351372463fed 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -42,7 +42,7 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) { - struct net_device_stats *stats = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; if (!netif_running(dev)) { stats->rx_dropped++; @@ -171,8 +171,7 @@ static int nr_close(struct net_device *dev) static int nr_xmit(struct sk_buff *skb, struct net_device *dev) { - struct nr_private *nr = netdev_priv(dev); - struct net_device_stats *stats = &nr->stats; + struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!nr_route_frame(skb, NULL)) { @@ -187,34 +186,27 @@ static int nr_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } -static struct net_device_stats *nr_get_stats(struct net_device *dev) -{ - struct nr_private *nr = netdev_priv(dev); - - return &nr->stats; -} - static const struct header_ops nr_header_ops = { .create = nr_header, .rebuild= nr_rebuild_header, }; +static const struct net_device_ops nr_netdev_ops = { + .ndo_open = nr_open, + .ndo_stop = nr_close, + .ndo_start_xmit = nr_xmit, + .ndo_set_mac_address = nr_set_mac_address, +}; void nr_setup(struct net_device *dev) { dev->mtu = NR_MAX_PACKET_SIZE; - dev->hard_start_xmit = nr_xmit; - dev->open = nr_open; - dev->stop = nr_close; - + dev->netdev_ops = &nr_netdev_ops; dev->header_ops = &nr_header_ops; dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_NETROM; - dev->set_mac_address = nr_set_mac_address; /* New-style flags. */ dev->flags = IFF_NOARP; - - dev->get_stats = nr_get_stats; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1fc4a7885c41..74776de523ec 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -584,7 +584,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + consume_skb(skb); return 0; } @@ -756,8 +756,7 @@ ring_is_full: spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk, 0); - if (copy_skb) - kfree_skb(copy_skb); + kfree_skb(copy_skb); goto drop_n_restore; } diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 13cb323f8c38..a662e62a99cf 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -275,8 +275,6 @@ static inline int can_respond(struct sk_buff *skb) return 0; ph = pn_hdr(skb); - if (phonet_address_get(skb->dev, ph->pn_rdev) != ph->pn_rdev) - return 0; /* we are not the destination */ if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5)) return 0; if (ph->pn_res == PN_COMMGR) /* indications */ @@ -344,8 +342,8 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { + struct net *net = dev_net(dev); struct phonethdr *ph; - struct sock *sk; struct sockaddr_pn sa; u16 len; @@ -364,29 +362,28 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, skb_reset_transport_header(skb); pn_skb_get_dst_sockaddr(skb, &sa); - if (pn_sockaddr_get_addr(&sa) == 0) - goto out; /* currently, we cannot be device 0 */ - sk = pn_find_sock_by_sa(dev_net(dev), &sa); - if (sk == NULL) { + /* check if we are the destination */ + if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { + /* Phonet packet input */ + struct sock *sk = pn_find_sock_by_sa(net, &sa); + + if (sk) + return sk_receive_skb(sk, skb, 0); + if (can_respond(skb)) { send_obj_unreachable(skb); send_reset_indications(skb); } - goto out; } - /* Push data to the socket (or other sockets connected to it). */ - return sk_receive_skb(sk, skb, 0); - out: kfree_skb(skb); return NET_RX_DROP; } -static struct packet_type phonet_packet_type = { - .type = __constant_htons(ETH_P_PHONET), - .dev = NULL, +static struct packet_type phonet_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_PHONET), .func = phonet_rcv, }; @@ -428,16 +425,18 @@ static int __init phonet_init(void) { int err; + err = phonet_device_init(); + if (err) + return err; + err = sock_register(&phonet_proto_family); if (err) { printk(KERN_ALERT "phonet protocol family initialization failed\n"); - return err; + goto err_sock; } - phonet_device_init(); dev_add_pack(&phonet_packet_type); - phonet_netlink_register(); phonet_sysctl_init(); err = isi_register(); @@ -449,6 +448,7 @@ err: phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); +err_sock: phonet_device_exit(); return err; } diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 5491bf5e354b..80a322d77909 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -28,32 +28,41 @@ #include <linux/netdevice.h> #include <linux/phonet.h> #include <net/sock.h> +#include <net/netns/generic.h> #include <net/phonet/pn_dev.h> -/* when accessing, remember to lock with spin_lock(&pndevs.lock); */ -struct phonet_device_list pndevs = { - .list = LIST_HEAD_INIT(pndevs.list), - .lock = __SPIN_LOCK_UNLOCKED(pndevs.lock), +struct phonet_net { + struct phonet_device_list pndevs; }; +int phonet_net_id; + +struct phonet_device_list *phonet_device_list(struct net *net) +{ + struct phonet_net *pnn = net_generic(net, phonet_net_id); + return &pnn->pndevs; +} + /* Allocate new Phonet device. */ static struct phonet_device *__phonet_device_alloc(struct net_device *dev) { + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC); if (pnd == NULL) return NULL; pnd->netdev = dev; bitmap_zero(pnd->addrs, 64); - list_add(&pnd->list, &pndevs.list); + list_add(&pnd->list, &pndevs->list); return pnd; } static struct phonet_device *__phonet_get(struct net_device *dev) { + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; - list_for_each_entry(pnd, &pndevs.list, list) { + list_for_each_entry(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; } @@ -68,32 +77,33 @@ static void __phonet_device_free(struct phonet_device *pnd) struct net_device *phonet_device_get(struct net *net) { + struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; struct net_device *dev; - spin_lock_bh(&pndevs.lock); - list_for_each_entry(pnd, &pndevs.list, list) { + spin_lock_bh(&pndevs->lock); + list_for_each_entry(pnd, &pndevs->list, list) { dev = pnd->netdev; BUG_ON(!dev); - if (net_eq(dev_net(dev), net) && - (dev->reg_state == NETREG_REGISTERED) && + if ((dev->reg_state == NETREG_REGISTERED) && ((pnd->netdev->flags & IFF_UP)) == IFF_UP) break; dev = NULL; } if (dev) dev_hold(dev); - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); return dev; } int phonet_address_add(struct net_device *dev, u8 addr) { + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; - spin_lock_bh(&pndevs.lock); + spin_lock_bh(&pndevs->lock); /* Find or create Phonet-specific device data */ pnd = __phonet_get(dev); if (pnd == NULL) @@ -102,31 +112,33 @@ int phonet_address_add(struct net_device *dev, u8 addr) err = -ENOMEM; else if (test_and_set_bit(addr >> 2, pnd->addrs)) err = -EEXIST; - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); return err; } int phonet_address_del(struct net_device *dev, u8 addr) { + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; - spin_lock_bh(&pndevs.lock); + spin_lock_bh(&pndevs->lock); pnd = __phonet_get(dev); if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) err = -EADDRNOTAVAIL; else if (bitmap_empty(pnd->addrs, 64)) __phonet_device_free(pnd); - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); return err; } /* Gets a source address toward a destination, through a interface. */ u8 phonet_address_get(struct net_device *dev, u8 addr) { + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; - spin_lock_bh(&pndevs.lock); + spin_lock_bh(&pndevs->lock); pnd = __phonet_get(dev); if (pnd) { BUG_ON(bitmap_empty(pnd->addrs, 64)); @@ -136,30 +148,31 @@ u8 phonet_address_get(struct net_device *dev, u8 addr) addr = find_first_bit(pnd->addrs, 64) << 2; } else addr = PN_NO_ADDR; - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); return addr; } int phonet_address_lookup(struct net *net, u8 addr) { + struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; + int err = -EADDRNOTAVAIL; - spin_lock_bh(&pndevs.lock); - list_for_each_entry(pnd, &pndevs.list, list) { - if (!net_eq(dev_net(pnd->netdev), net)) - continue; + spin_lock_bh(&pndevs->lock); + list_for_each_entry(pnd, &pndevs->list, list) { /* Don't allow unregistering devices! */ if ((pnd->netdev->reg_state != NETREG_REGISTERED) || ((pnd->netdev->flags & IFF_UP)) != IFF_UP) continue; if (test_bit(addr >> 2, pnd->addrs)) { - spin_unlock_bh(&pndevs.lock); - return 0; + err = 0; + goto found; } } - spin_unlock_bh(&pndevs.lock); - return -EADDRNOTAVAIL; +found: + spin_unlock_bh(&pndevs->lock); + return err; } /* notify Phonet of device events */ @@ -169,14 +182,16 @@ static int phonet_device_notify(struct notifier_block *me, unsigned long what, struct net_device *dev = arg; if (what == NETDEV_UNREGISTER) { + struct phonet_device_list *pndevs; struct phonet_device *pnd; /* Destroy phonet-specific device data */ - spin_lock_bh(&pndevs.lock); + pndevs = phonet_device_list(dev_net(dev)); + spin_lock_bh(&pndevs->lock); pnd = __phonet_get(dev); if (pnd) __phonet_device_free(pnd); - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); } return 0; @@ -187,24 +202,52 @@ static struct notifier_block phonet_device_notifier = { .priority = 0, }; -/* Initialize Phonet devices list */ -void phonet_device_init(void) +/* Per-namespace Phonet devices handling */ +static int phonet_init_net(struct net *net) { - register_netdevice_notifier(&phonet_device_notifier); + struct phonet_net *pnn = kmalloc(sizeof(*pnn), GFP_KERNEL); + if (!pnn) + return -ENOMEM; + + INIT_LIST_HEAD(&pnn->pndevs.list); + spin_lock_init(&pnn->pndevs.lock); + net_assign_generic(net, phonet_net_id, pnn); + return 0; } -void phonet_device_exit(void) +static void phonet_exit_net(struct net *net) { + struct phonet_net *pnn = net_generic(net, phonet_net_id); struct phonet_device *pnd, *n; - rtnl_unregister_all(PF_PHONET); - rtnl_lock(); - spin_lock_bh(&pndevs.lock); - - list_for_each_entry_safe(pnd, n, &pndevs.list, list) + list_for_each_entry_safe(pnd, n, &pnn->pndevs.list, list) __phonet_device_free(pnd); - spin_unlock_bh(&pndevs.lock); - rtnl_unlock(); + kfree(pnn); +} + +static struct pernet_operations phonet_net_ops = { + .init = phonet_init_net, + .exit = phonet_exit_net, +}; + +/* Initialize Phonet devices list */ +int __init phonet_device_init(void) +{ + int err = register_pernet_gen_device(&phonet_net_id, &phonet_net_ops); + if (err) + return err; + + register_netdevice_notifier(&phonet_device_notifier); + err = phonet_netlink_register(); + if (err) + phonet_device_exit(); + return err; +} + +void phonet_device_exit(void) +{ + rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); + unregister_pernet_gen_device(phonet_net_id, &phonet_net_ops); } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 242fe8f8c322..cec4e5951681 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + return; errout: if (err < 0) rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); @@ -123,17 +124,16 @@ nla_put_failure: static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); + struct phonet_device_list *pndevs; struct phonet_device *pnd; int dev_idx = 0, dev_start_idx = cb->args[0]; int addr_idx = 0, addr_start_idx = cb->args[1]; - spin_lock_bh(&pndevs.lock); - list_for_each_entry(pnd, &pndevs.list, list) { + pndevs = phonet_device_list(sock_net(skb->sk)); + spin_lock_bh(&pndevs->lock); + list_for_each_entry(pnd, &pndevs->list, list) { u8 addr; - if (!net_eq(dev_net(pnd->netdev), net)) - continue; if (dev_idx > dev_start_idx) addr_start_idx = 0; if (dev_idx++ < dev_start_idx) @@ -153,16 +153,21 @@ static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) } out: - spin_unlock_bh(&pndevs.lock); + spin_unlock_bh(&pndevs->lock); cb->args[0] = dev_idx; cb->args[1] = addr_idx; return skb->len; } -void __init phonet_netlink_register(void) +int __init phonet_netlink_register(void) { - rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL); - rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL); - rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit); + int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL); + if (err) + return err; + + /* Further __rtnl_register() cannot fail */ + __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL); + __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit); + return 0; } diff --git a/net/rds/Kconfig b/net/rds/Kconfig new file mode 100644 index 000000000000..796773b5df9b --- /dev/null +++ b/net/rds/Kconfig @@ -0,0 +1,14 @@ + +config RDS + tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)" + depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL + depends on INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + RDS provides reliable, sequenced delivery of datagrams + over Infiniband. + +config RDS_DEBUG + bool "Debugging messages" + depends on RDS + default n + diff --git a/net/rds/Makefile b/net/rds/Makefile new file mode 100644 index 000000000000..51f27585fa08 --- /dev/null +++ b/net/rds/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_RDS) += rds.o +rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ + recv.o send.o stats.o sysctl.o threads.o transport.o \ + loop.o page.o rdma.o \ + rdma_transport.o \ + ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ + ib_sysctl.o ib_rdma.o \ + iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ + iw_sysctl.o iw_rdma.o + +ifeq ($(CONFIG_RDS_DEBUG), y) +EXTRA_CFLAGS += -DDEBUG +endif + diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c new file mode 100644 index 000000000000..20cf16fc572f --- /dev/null +++ b/net/rds/af_rds.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/poll.h> +#include <linux/version.h> +#include <net/sock.h> + +#include "rds.h" +#include "rdma.h" +#include "rdma_transport.h" + +/* this is just used for stats gathering :/ */ +static DEFINE_SPINLOCK(rds_sock_lock); +static unsigned long rds_sock_count; +static LIST_HEAD(rds_sock_list); +DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); + +/* + * This is called as the final descriptor referencing this socket is closed. + * We have to unbind the socket so that another socket can be bound to the + * address it was using. + * + * We have to be careful about racing with the incoming path. sock_orphan() + * sets SOCK_DEAD and we use that as an indicator to the rx path that new + * messages shouldn't be queued. + */ +static int rds_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs; + unsigned long flags; + + if (sk == NULL) + goto out; + + rs = rds_sk_to_rs(sk); + + sock_orphan(sk); + /* Note - rds_clear_recv_queue grabs rs_recv_lock, so + * that ensures the recv path has completed messing + * with the socket. */ + rds_clear_recv_queue(rs); + rds_cong_remove_socket(rs); + rds_remove_bound(rs); + rds_send_drop_to(rs, NULL); + rds_rdma_drop_keys(rs); + rds_notify_queue_get(rs, NULL); + + spin_lock_irqsave(&rds_sock_lock, flags); + list_del_init(&rs->rs_item); + rds_sock_count--; + spin_unlock_irqrestore(&rds_sock_lock, flags); + + sock->sk = NULL; + sock_put(sk); +out: + return 0; +} + +/* + * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. + * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK + * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but + * this seems more conservative. + * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. + */ +void rds_wake_sk_sleep(struct rds_sock *rs) +{ + unsigned long flags; + + read_lock_irqsave(&rs->rs_recv_lock, flags); + __rds_wake_sk_sleep(rds_rs_to_sk(rs)); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +static int rds_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* racey, don't care */ + if (peer) { + if (!rs->rs_conn_addr) + return -ENOTCONN; + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + } else { + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + } + + sin->sin_family = AF_INET; + + *uaddr_len = sizeof(*sin); + return 0; +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as POLLIN and POLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * POLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion). + * + * POLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). + */ +static unsigned int rds_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, sk->sk_sleep, wait); + + poll_wait(file, &rds_poll_waitq, wait); + + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!rs->rs_cong_monitor) { + /* When a congestion map was updated, we signal POLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. */ + if (rds_cong_updated_since(&rs->rs_cong_track)) + mask |= (POLLIN | POLLRDNORM | POLLWRBAND); + } else { + spin_lock(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (POLLIN | POLLRDNORM); + spin_unlock(&rs->rs_lock); + } + if (!list_empty(&rs->rs_recv_queue) + || !list_empty(&rs->rs_notify_queue)) + mask |= (POLLIN | POLLRDNORM); + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) + mask |= (POLLOUT | POLLWRNORM); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + + return mask; +} + +static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, + int len) +{ + struct sockaddr_in sin; + int ret = 0; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&sin, optval, sizeof(sin))) { + ret = -EFAULT; + goto out; + } + + rds_send_drop_to(rs, &sin); +out: + return ret; +} + +static int rds_set_bool_option(unsigned char *optvar, char __user *optval, + int optlen) +{ + int value; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(value, (int __user *) optval)) + return -EFAULT; + *optvar = !!value; + return 0; +} + +static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int ret; + + ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rds_cong_add_socket(rs); + } else { + rds_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return ret; +} + +static int rds_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret; + + if (level != SOL_RDS) { + ret = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case RDS_CANCEL_SENT_TO: + ret = rds_cancel_sent_to(rs, optval, optlen); + break; + case RDS_GET_MR: + ret = rds_get_mr(rs, optval, optlen); + break; + case RDS_FREE_MR: + ret = rds_free_mr(rs, optval, optlen); + break; + case RDS_RECVERR: + ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); + break; + case RDS_CONG_MONITOR: + ret = rds_cong_monitor(rs, optval, optlen); + break; + default: + ret = -ENOPROTOOPT; + } +out: + return ret; +} + +static int rds_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret = -ENOPROTOOPT, len; + + if (level != SOL_RDS) + goto out; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + switch (optname) { + case RDS_INFO_FIRST ... RDS_INFO_LAST: + ret = rds_info_getsockopt(sock, optname, optval, + optlen); + break; + + case RDS_RECVERR: + if (len < sizeof(int)) + ret = -EINVAL; + else + if (put_user(rs->rs_recverr, (int __user *) optval) + || put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + default: + break; + } + +out: + return ret; + +} + +static int rds_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + +out: + release_sock(sk); + return ret; +} + +static struct proto rds_proto = { + .name = "RDS", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rds_sock), +}; + +static struct proto_ops rds_proto_ops = { + .family = AF_RDS, + .owner = THIS_MODULE, + .release = rds_release, + .bind = rds_bind, + .connect = rds_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = rds_getname, + .poll = rds_poll, + .ioctl = rds_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rds_setsockopt, + .getsockopt = rds_getsockopt, + .sendmsg = rds_sendmsg, + .recvmsg = rds_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int __rds_create(struct socket *sock, struct sock *sk, int protocol) +{ + unsigned long flags; + struct rds_sock *rs; + + sock_init_data(sock, sk); + sock->ops = &rds_proto_ops; + sk->sk_protocol = protocol; + + rs = rds_sk_to_rs(sk); + spin_lock_init(&rs->rs_lock); + rwlock_init(&rs->rs_recv_lock); + INIT_LIST_HEAD(&rs->rs_send_queue); + INIT_LIST_HEAD(&rs->rs_recv_queue); + INIT_LIST_HEAD(&rs->rs_notify_queue); + INIT_LIST_HEAD(&rs->rs_cong_list); + spin_lock_init(&rs->rs_rdma_lock); + rs->rs_rdma_keys = RB_ROOT; + + spin_lock_irqsave(&rds_sock_lock, flags); + list_add_tail(&rs->rs_item, &rds_sock_list); + rds_sock_count++; + spin_unlock_irqrestore(&rds_sock_lock, flags); + + return 0; +} + +static int rds_create(struct net *net, struct socket *sock, int protocol) +{ + struct sock *sk; + + if (sock->type != SOCK_SEQPACKET || protocol) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + if (!sk) + return -ENOMEM; + + return __rds_create(sock, sk, protocol); +} + +void rds_sock_addref(struct rds_sock *rs) +{ + sock_hold(rds_rs_to_sk(rs)); +} + +void rds_sock_put(struct rds_sock *rs) +{ + sock_put(rds_rs_to_sk(rs)); +} + +static struct net_proto_family rds_family_ops = { + .family = AF_RDS, + .create = rds_create, + .owner = THIS_MODULE, +}; + +static void rds_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct sock *sk; + struct rds_incoming *inc; + unsigned long flags; + unsigned int total = 0; + + len /= sizeof(struct rds_info_message); + + spin_lock_irqsave(&rds_sock_lock, flags); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sk = rds_rs_to_sk(rs); + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_irqrestore(&rds_sock_lock, flags); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_socket sinfo; + struct rds_sock *rs; + unsigned long flags; + + len /= sizeof(struct rds_info_socket); + + spin_lock_irqsave(&rds_sock_lock, flags); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo.sndbuf = rds_sk_sndbuf(rs); + sinfo.rcvbuf = rds_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo, sizeof(sinfo)); + } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds_info_socket); + + spin_unlock_irqrestore(&rds_sock_lock, flags); +} + +static void __exit rds_exit(void) +{ + rds_rdma_exit(); + sock_unregister(rds_family_ops.family); + proto_unregister(&rds_proto); + rds_conn_exit(); + rds_cong_exit(); + rds_sysctl_exit(); + rds_threads_exit(); + rds_stats_exit(); + rds_page_exit(); + rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +} +module_exit(rds_exit); + +static int __init rds_init(void) +{ + int ret; + + ret = rds_conn_init(); + if (ret) + goto out; + ret = rds_threads_init(); + if (ret) + goto out_conn; + ret = rds_sysctl_init(); + if (ret) + goto out_threads; + ret = rds_stats_init(); + if (ret) + goto out_sysctl; + ret = proto_register(&rds_proto, 1); + if (ret) + goto out_stats; + ret = sock_register(&rds_family_ops); + if (ret) + goto out_proto; + + rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + + /* ib/iwarp transports currently compiled-in */ + ret = rds_rdma_init(); + if (ret) + goto out_sock; + goto out; + +out_sock: + sock_unregister(rds_family_ops.family); +out_proto: + proto_unregister(&rds_proto); +out_stats: + rds_stats_exit(); +out_sysctl: + rds_sysctl_exit(); +out_threads: + rds_threads_exit(); +out_conn: + rds_conn_exit(); + rds_cong_exit(); + rds_page_exit(); +out: + return ret; +} +module_init(rds_init); + +#define DRV_VERSION "4.0" +#define DRV_RELDATE "Feb 12, 2009" + +MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); +MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" + " v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NETPROTO(PF_RDS); diff --git a/net/rds/bind.c b/net/rds/bind.c new file mode 100644 index 000000000000..c17cc39160ce --- /dev/null +++ b/net/rds/bind.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <net/sock.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include "rds.h" + +/* + * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't + * particularly zippy. + * + * This is now called for every incoming frame so we arguably care much more + * about it than we used to. + */ +static DEFINE_SPINLOCK(rds_bind_lock); +static struct rb_root rds_bind_tree = RB_ROOT; + +static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, + struct rds_sock *insert) +{ + struct rb_node **p = &rds_bind_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_sock *rs; + u64 cmp; + u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); + + while (*p) { + parent = *p; + rs = rb_entry(parent, struct rds_sock, rs_bound_node); + + cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | + be16_to_cpu(rs->rs_bound_port); + + if (needle < cmp) + p = &(*p)->rb_left; + else if (needle > cmp) + p = &(*p)->rb_right; + else + return rs; + } + + if (insert) { + rb_link_node(&insert->rs_bound_node, parent, p); + rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + } + return NULL; +} + +/* + * Return the rds_sock bound at the given local address. + * + * The rx path can race with rds_release. We notice if rds_release() has + * marked this socket and don't return a rs ref to the rx path. + */ +struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +{ + struct rds_sock *rs; + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + rs = rds_bind_tree_walk(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) + rds_sock_addref(rs); + else + rs = NULL; + spin_unlock_irqrestore(&rds_bind_lock, flags); + + rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, + ntohs(port)); + return rs; +} + +/* returns -ve errno or +ve port */ +static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +{ + unsigned long flags; + int ret = -EADDRINUSE; + u16 rover, last; + + if (*port != 0) { + rover = be16_to_cpu(*port); + last = rover; + } else { + rover = max_t(u16, net_random(), 2); + last = rover - 1; + } + + spin_lock_irqsave(&rds_bind_lock, flags); + + do { + if (rover == 0) + rover++; + if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { + *port = cpu_to_be16(rover); + ret = 0; + break; + } + } while (rover++ != last); + + if (ret == 0) { + rs->rs_bound_addr = addr; + rs->rs_bound_port = *port; + rds_sock_addref(rs); + + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); + + return ret; +} + +void rds_remove_bound(struct rds_sock *rs) +{ + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + + if (rs->rs_bound_addr) { + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); + + rb_erase(&rs->rs_bound_node, &rds_bind_tree); + rds_sock_put(rs); + rs->rs_bound_addr = 0; + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); +} + +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct rds_transport *trans; + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in) || + sin->sin_family != AF_INET || + rs->rs_bound_addr || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EINVAL; + goto out; + } + + ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + if (ret) + goto out; + + trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + if (trans == NULL) { + ret = -EADDRNOTAVAIL; + rds_remove_bound(rs); + goto out; + } + + rs->rs_transport = trans; + ret = 0; + +out: + release_sock(sk); + return ret; +} diff --git a/net/rds/cong.c b/net/rds/cong.c new file mode 100644 index 000000000000..710e4599d76c --- /dev/null +++ b/net/rds/cong.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/types.h> +#include <linux/rbtree.h> + +#include <asm-generic/bitops/le.h> + +#include "rds.h" + +/* + * This file implements the receive side of the unconventional congestion + * management in RDS. + * + * Messages waiting in the receive queue on the receiving socket are accounted + * against the sockets SO_RCVBUF option value. Only the payload bytes in the + * message are accounted for. If the number of bytes queued equals or exceeds + * rcvbuf then the socket is congested. All sends attempted to this socket's + * address should return block or return -EWOULDBLOCK. + * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rds_connections are involved. + * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rds_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. + */ +static atomic_t rds_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static LIST_HEAD(rds_cong_monitor); +static DEFINE_RWLOCK(rds_cong_monitor_lock); + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. + */ +static DEFINE_SPINLOCK(rds_cong_lock); +static struct rb_root rds_cong_tree = RB_ROOT; + +static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, + struct rds_cong_map *insert) +{ + struct rb_node **p = &rds_cong_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_cong_map *map; + + while (*p) { + parent = *p; + map = rb_entry(parent, struct rds_cong_map, m_rb_node); + + if (addr < map->m_addr) + p = &(*p)->rb_left; + else if (addr > map->m_addr) + p = &(*p)->rb_right; + else + return map; + } + + if (insert) { + rb_link_node(&insert->m_rb_node, parent, p); + rb_insert_color(&insert->m_rb_node, &rds_cong_tree); + } + return NULL; +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. + */ +static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +{ + struct rds_cong_map *map; + struct rds_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + unsigned long flags; + + map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + if (map == NULL) + return NULL; + + map->m_addr = addr; + init_waitqueue_head(&map->m_waitq); + INIT_LIST_HEAD(&map->m_conn_list); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + zp = get_zeroed_page(GFP_KERNEL); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + spin_lock_irqsave(&rds_cong_lock, flags); + ret = rds_cong_tree_walk(addr, map); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (ret == NULL) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } + + rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + + return ret; +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rds_conn_lock, sadly. + */ +void rds_cong_add_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_remove_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_del_init(&conn->c_map_item); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +int rds_cong_get_maps(struct rds_connection *conn) +{ + conn->c_lcong = rds_cong_from_addr(conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + + if (conn->c_lcong == NULL || conn->c_fcong == NULL) + return -ENOMEM; + + return 0; +} + +void rds_cong_queue_updates(struct rds_cong_map *map) +{ + struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); + + list_for_each_entry(conn, &map->m_conn_list, c_map_item) { + if (!test_and_set_bit(0, &conn->c_map_queued)) { + rds_stats_inc(s_cong_update_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + } + } + + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) +{ + rdsdebug("waking map %p for %pI4\n", + map, &map->m_addr); + rds_stats_inc(s_cong_update_received); + atomic_inc(&rds_cong_generation); + if (waitqueue_active(&map->m_waitq)) + wake_up(&map->m_waitq); + if (waitqueue_active(&rds_poll_waitq)) + wake_up_all(&rds_poll_waitq); + + if (portmask && !list_empty(&rds_cong_monitor)) { + unsigned long flags; + struct rds_sock *rs; + + read_lock_irqsave(&rds_cong_monitor_lock, flags); + list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { + spin_lock(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + spin_unlock(&rs->rs_lock); + if (rs->rs_cong_notify) + rds_wake_sk_sleep(rs); + } + read_unlock_irqrestore(&rds_cong_monitor_lock, flags); + } +} + +int rds_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_read(&rds_cong_generation); + + if (likely(*recent == gen)) + return 0; + *recent = gen; + return 1; +} + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. + */ +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("setting congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + generic___set_le_bit(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("clearing congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); +} + +static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_add_socket(struct rds_sock *rs) +{ + unsigned long flags; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + if (list_empty(&rs->rs_cong_list)) + list_add(&rs->rs_cong_list, &rds_cong_monitor); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); +} + +void rds_cong_remove_socket(struct rds_sock *rs) +{ + unsigned long flags; + struct rds_cong_map *map; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + list_del_init(&rs->rs_cong_list); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); + + /* update congestion map for now-closed port */ + spin_lock_irqsave(&rds_cong_lock, flags); + map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { + rds_cong_clear_bit(map, rs->rs_bound_port); + rds_cong_queue_updates(map); + } +} + +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, + struct rds_sock *rs) +{ + if (!rds_cong_test_bit(map, port)) + return 0; + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + unsigned long flags; + + /* It would have been nice to have an atomic set_bit on + * a uint64_t. */ + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); + spin_unlock_irqrestore(&rs->rs_lock, flags); + + /* Test again - a congestion update may have arrived in + * the meantime. */ + if (!rds_cong_test_bit(map, port)) + return 0; + } + rds_stats_inc(s_cong_send_error); + return -ENOBUFS; + } + + rds_stats_inc(s_cong_send_blocked); + rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); + + return wait_event_interruptible(map->m_waitq, + !rds_cong_test_bit(map, port)); +} + +void rds_cong_exit(void) +{ + struct rb_node *node; + struct rds_cong_map *map; + unsigned long i; + + while ((node = rb_first(&rds_cong_tree))) { + map = rb_entry(node, struct rds_cong_map, m_rb_node); + rdsdebug("freeing map %p\n", map); + rb_erase(&map->m_rb_node, &rds_cong_tree); + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } +} + +/* + * Allocate a RDS message containing a congestion update. + */ +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) +{ + struct rds_cong_map *map = conn->c_lcong; + struct rds_message *rm; + + rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; + + return rm; +} diff --git a/net/rds/connection.c b/net/rds/connection.c new file mode 100644 index 000000000000..273f064930a8 --- /dev/null +++ b/net/rds/connection.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/list.h> +#include <net/inet_hashtables.h> + +#include "rds.h" +#include "loop.h" +#include "rdma.h" + +#define RDS_CONNECTION_HASH_BITS 12 +#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) +#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) + +/* converting this to RCU is a chore for another day.. */ +static DEFINE_SPINLOCK(rds_conn_lock); +static unsigned long rds_conn_count; +static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; +static struct kmem_cache *rds_conn_slab; + +static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +{ + /* Pass NULL, don't need struct net for hash */ + unsigned long hash = inet_ehashfn(NULL, + be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0); + return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; +} + +#define rds_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +static inline int rds_conn_is_sending(struct rds_connection *conn) +{ + int ret = 0; + + if (!mutex_trylock(&conn->c_send_lock)) + ret = 1; + else + mutex_unlock(&conn->c_send_lock); + + return ret; +} + +static struct rds_connection *rds_conn_lookup(struct hlist_head *head, + __be32 laddr, __be32 faddr, + struct rds_transport *trans) +{ + struct rds_connection *conn, *ret = NULL; + struct hlist_node *pos; + + hlist_for_each_entry(conn, pos, head, c_hash_node) { + if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_trans == trans) { + ret = conn; + break; + } + } + rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, + &laddr, &faddr); + return ret; +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. It is up to + * the transport to have serialized this call with its send and recv. + */ +void rds_conn_reset(struct rds_connection *conn) +{ + rdsdebug("connection %pI4 to %pI4 reset\n", + &conn->c_laddr, &conn->c_faddr); + + rds_stats_inc(s_conn_reset); + rds_send_reset(conn); + conn->c_flags = 0; + + /* Do not clear next_rx_seq here, else we cannot distinguish + * retransmitted packets from new packets, and will hand all + * of them to the application. That is not consistent with the + * reliability guarantees of RDS. */ +} + +/* + * There is only every one 'conn' for a given pair of addresses in the + * system at a time. They contain messages to be retransmitted and so + * span the lifetime of the actual underlying transport connections. + * + * For now they are not garbage collected once they're created. They + * are torn down as the module is removed, if ever. + */ +static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp, + int is_outgoing) +{ + struct rds_connection *conn, *tmp, *parent = NULL; + struct hlist_head *head = rds_conn_bucket(laddr, faddr); + unsigned long flags; + int ret; + + spin_lock_irqsave(&rds_conn_lock, flags); + conn = rds_conn_lookup(head, laddr, faddr, trans); + if (conn + && conn->c_loopback + && conn->c_trans != &rds_loop_transport + && !is_outgoing) { + /* This is a looped back IB connection, and we're + * called by the code handling the incoming connect. + * We need a second connection object into which we + * can stick the other QP. */ + parent = conn; + conn = parent->c_passive; + } + spin_unlock_irqrestore(&rds_conn_lock, flags); + if (conn) + goto out; + + conn = kmem_cache_alloc(rds_conn_slab, gfp); + if (conn == NULL) { + conn = ERR_PTR(-ENOMEM); + goto out; + } + + memset(conn, 0, sizeof(*conn)); + + INIT_HLIST_NODE(&conn->c_hash_node); + conn->c_version = RDS_PROTOCOL_3_0; + conn->c_laddr = laddr; + conn->c_faddr = faddr; + spin_lock_init(&conn->c_lock); + conn->c_next_tx_seq = 1; + + mutex_init(&conn->c_send_lock); + INIT_LIST_HEAD(&conn->c_send_queue); + INIT_LIST_HEAD(&conn->c_retrans); + + ret = rds_cong_get_maps(conn); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + /* + * This is where a connection becomes loopback. If *any* RDS sockets + * can bind to the destination address then we'd rather the messages + * flow through loopback rather than either transport. + */ + if (rds_trans_get_preferred(faddr)) { + conn->c_loopback = 1; + if (is_outgoing && trans->t_prefer_loopback) { + /* "outgoing" connection - and the transport + * says it wants the connection handled by the + * loopback transport. This is what TCP does. + */ + trans = &rds_loop_transport; + } + } + + conn->c_trans = trans; + + ret = trans->conn_alloc(conn, gfp); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); + INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); + INIT_WORK(&conn->c_down_w, rds_shutdown_worker); + mutex_init(&conn->c_cm_lock); + conn->c_flags = 0; + + rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", + conn, &laddr, &faddr, + trans->t_name ? trans->t_name : "[unknown]", + is_outgoing ? "(outgoing)" : ""); + + spin_lock_irqsave(&rds_conn_lock, flags); + if (parent == NULL) { + tmp = rds_conn_lookup(head, laddr, faddr, trans); + if (tmp == NULL) + hlist_add_head(&conn->c_hash_node, head); + } else { + tmp = parent->c_passive; + if (!tmp) + parent->c_passive = conn; + } + + if (tmp) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = tmp; + } else { + rds_cong_add_conn(conn); + rds_conn_count++; + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); + +out: + return conn; +} + +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 0); +} + +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 1); +} + +void rds_conn_destroy(struct rds_connection *conn) +{ + struct rds_message *rm, *rtmp; + + rdsdebug("freeing conn %p for %pI4 -> " + "%pI4\n", conn, &conn->c_laddr, + &conn->c_faddr); + + hlist_del_init(&conn->c_hash_node); + + /* wait for the rds thread to shut it down */ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + cancel_delayed_work(&conn->c_conn_w); + queue_work(rds_wq, &conn->c_down_w); + flush_workqueue(rds_wq); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &conn->c_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (conn->c_xmit_rm) + rds_message_put(conn->c_xmit_rm); + + conn->c_trans->conn_free(conn->c_transport_data); + + /* + * The congestion maps aren't freed up here. They're + * freed by rds_cong_exit() after all the connections + * have been freed. + */ + rds_cong_remove_conn(conn); + + BUG_ON(!list_empty(&conn->c_retrans)); + kmem_cache_free(rds_conn_slab, conn); + + rds_conn_count--; +} + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct list_head *list; + struct rds_connection *conn; + struct rds_message *rm; + unsigned long flags; + unsigned int total = 0; + size_t i; + + len /= sizeof(struct rds_info_message); + + spin_lock_irqsave(&rds_conn_lock, flags); + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry(conn, pos, head, c_hash_node) { + if (want_send) + list = &conn->c_send_queue; + else + list = &conn->c_retrans; + + spin_lock(&conn->c_lock); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, iter, + conn->c_laddr, + conn->c_faddr, 0); + } + + spin_unlock(&conn->c_lock); + } + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 1); +} + +static void rds_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 0); +} + +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len) +{ + uint64_t buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct hlist_node *pos; + struct hlist_node *tmp; + struct rds_connection *conn; + unsigned long flags; + size_t i; + + spin_lock_irqsave(&rds_conn_lock, flags); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + + /* XXX no c_lock usage.. */ + if (!visitor(conn, buffer)) + continue; + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + + spin_unlock_irqrestore(&rds_conn_lock, flags); +} + +static int rds_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = conn->c_next_tx_seq; + cinfo->next_rx_seq = conn->c_next_rx_seq; + cinfo->laddr = conn->c_laddr; + cinfo->faddr = conn->c_faddr; + strncpy(cinfo->transport, conn->c_trans->t_name, + sizeof(cinfo->transport)); + cinfo->flags = 0; + + rds_conn_info_set(cinfo->flags, + rds_conn_is_sending(conn), SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_UP, + CONNECTED); + return 1; +} + +static void rds_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_conn_info_visitor, + sizeof(struct rds_info_connection)); +} + +int __init rds_conn_init(void) +{ + rds_conn_slab = kmem_cache_create("rds_connection", + sizeof(struct rds_connection), + 0, 0, NULL); + if (rds_conn_slab == NULL) + return -ENOMEM; + + rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_register_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); + + return 0; +} + +void rds_conn_exit(void) +{ + rds_loop_exit(); + + WARN_ON(!hlist_empty(rds_conn_hash)); + + kmem_cache_destroy(rds_conn_slab); + + rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); +} + +/* + * Force a disconnect + */ +void rds_conn_drop(struct rds_connection *conn) +{ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); +} + +/* + * An error occurred on the connection + */ +void +__rds_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_drop(conn); +} diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 000000000000..06a7b798d9a7 --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/if_arp.h> +#include <linux/delay.h> + +#include "rds.h" +#include "ib.h" + +unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); +module_param(fmr_message_size, int, 0444); +MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); + +struct list_head rds_ib_devices; + +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); + rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; + rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + rds_ibdev->max_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + fmr_pool_size; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) + goto free_dev; + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) + goto err_pd; + + rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); + if (IS_ERR(rds_ibdev->mr_pool)) { + rds_ibdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + + goto free_attr; + +err_mr: + ib_dereg_mr(rds_ibdev->mr); +err_pd: + ib_dealloc_pd(rds_ibdev->pd); +free_dev: + kfree(rds_ibdev); +free_attr: + kfree(dev_attr); +} + +void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr, *i_next; + + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + rds_ib_remove_conns(rds_ibdev); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + + ib_dereg_mr(rds_ibdev->mr); + + while (ib_dealloc_pd(rds_ibdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); + msleep(1); + } + + list_del(&rds_ibdev->list); + kfree(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + if (!cm_id) + return -EADDRNOTAVAIL; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_ib_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_remove_nodev_conns(); + ib_unregister_client(&rds_ib_client); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rds_ib_xmit_rdma, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_purge = rds_ib_inc_purge, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", +}; + +int __init rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_ib_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_recv: + rds_ib_recv_exit(); +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_ib_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 000000000000..8be563a1363a --- /dev/null +++ b/net/rds/ib.h @@ -0,0 +1,367 @@ +#ifndef _RDS_IB_H +#define _RDS_IB_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FMR_SIZE 256 +#define RDS_FMR_POOL_SIZE 4096 + +#define RDS_IB_MAX_SGE 8 +#define RDS_IB_RECV_SGE 2 + +#define RDS_IB_DEFAULT_RECV_WR 1024 +#define RDS_IB_DEFAULT_SEND_WR 256 + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_ib_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_ib_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_ib_send_work { + struct rds_message *s_rm; + struct rds_rdma_op *s_op; + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IB_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_ib_recv_work { + struct rds_ib_incoming *r_ibinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_ib_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_ib_device; + +struct rds_ib_connection { + + struct list_head ib_node; + struct rds_ib_device *rds_ibdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_ib_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_ib_send_work *i_sends; + + /* rx */ + struct mutex i_recv_mutex; + struct rds_ib_work_ring i_recv_ring; + struct rds_ib_incoming *i_ibinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_ib_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK received */ + + /* sending acks */ + unsigned long i_ack_flags; + u64 i_ack_next; /* next ACK to send */ + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. + * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_ib_ipaddr { + struct list_head list; + __be32 ipaddr; +}; + +struct rds_ib_device { + struct list_head list; + struct list_head ipaddr_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_ib_mr_pool *mr_pool; + int fmr_page_shift; + int fmr_page_size; + u64 fmr_page_mask; + unsigned int fmr_max_remaps; + unsigned int max_fmrs; + int max_sge; + unsigned int max_wrs; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IB_ACK_WR_ID (~(u64) 0) + +struct rds_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_tx_cq_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_call; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_alloc; + uint64_t s_ib_rdma_mr_free; + uint64_t s_ib_rdma_mr_used; + uint64_t s_ib_rdma_mr_pool_flush; + uint64_t s_ib_rdma_mr_pool_wait; + uint64_t s_ib_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_ib_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu + +static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device + + +/* ib.c */ +extern struct rds_transport rds_ib_transport; +extern void rds_ib_add_one(struct ib_device *device); +extern void rds_ib_remove_one(struct ib_device *device); +extern struct ib_client rds_ib_client; + +extern unsigned int fmr_pool_size; +extern unsigned int fmr_message_size; + +extern spinlock_t ib_nodev_conns_lock; +extern struct list_head ib_nodev_conns; + +/* ib_cm.c */ +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_ib_conn_free(void *arg); +int rds_ib_conn_connect(struct rds_connection *conn); +void rds_ib_conn_shutdown(struct rds_connection *conn); +void rds_ib_state_change(struct sock *sk); +int __init rds_ib_listen_init(void); +void rds_ib_listen_stop(void); +void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_ib_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_ib_conn_error(conn, fmt...) \ + __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) + +/* ib_rdma.c */ +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_nodev_conns(void); +void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); + +/* ib_recv.c */ +int __init rds_ib_recv_init(void); +void rds_ib_recv_exit(void); +int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_ib_inc_purge(struct rds_incoming *inc); +void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_init_ring(struct rds_ib_connection *ic); +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); +void rds_ib_recv_init_ack(struct rds_ib_connection *ic); +void rds_ib_attempt_ack(struct rds_ib_connection *ic); +void rds_ib_ack_send_complete(struct rds_ib_connection *ic); +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); + +/* ib_ring.c */ +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); +int rds_ib_ring_empty(struct rds_ib_work_ring *ring); +int rds_ib_ring_low(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_ib_ring_empty_wait; + +/* ib_send.c */ +void rds_ib_xmit_complete(struct rds_connection *conn); +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_init_ring(struct rds_ib_connection *ic); +void rds_ib_send_clear_ring(struct rds_ib_connection *ic); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int __init rds_ib_sysctl_init(void); +void rds_ib_sysctl_exit(void); +extern unsigned long rds_ib_sysctl_max_send_wr; +extern unsigned long rds_ib_sysctl_max_recv_wr; +extern unsigned long rds_ib_sysctl_max_unsig_wrs; +extern unsigned long rds_ib_sysctl_max_unsig_bytes; +extern unsigned long rds_ib_sysctl_max_recv_allocation; +extern unsigned int rds_ib_sysctl_flow_control; +extern ctl_table rds_ib_sysctl_table[]; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * +rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +static inline void rds_ib_set_64bit(u64 *ptr, u64 val) +{ +#if BITS_PER_LONG == 64 + *ptr = val; +#else + set_64bit(ptr, val); +#endif +} + +#endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 000000000000..0532237bd128 --- /dev/null +++ b/net/rds/ib_cm.c @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/vmalloc.h> + +#include "rds.h" +#include "ib.h" + +/* + * Set the selected protocol version + */ +static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (rds_ib_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_ib_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Tune RNR behavior. Without flow control, we use a rather + * low timeout, but not the absolute minimum - this should + * be tunable. + * + * We already set the RNR retry count to 7 (which is the + * smallest infinite number :-) above. + * If flow control is off, we want to change this back to 0 + * so that we learn quickly when our credit accounting is + * buggy. + * + * Caller passes in a qp_attr pointer - don't waste stack spacv + * by allocation this twice. + */ +static void +rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = NULL; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev; + struct ib_qp_attr qp_attr; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_laddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + /* Tune RNR behavior */ + rds_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); + + /* update ib_device with this local ipaddr & conn */ + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + if (err) + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); + err = rds_ib_add_conn(rds_ibdev, conn); + if (err) + printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + rds_connect_complete(conn); +} + +static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_ib_connect_private *dp, + u32 protocol_version) +{ + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? */ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + conn_param->retry_count = 7; + conn_param->rnr_retry_count = 7; + + if (dp) { + struct rds_ib_connection *ic = conn->c_transport_data; + + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_ib_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + printk(KERN_WARNING "RDS/ib: unhandled QP event %u " + "on connection to %pI4\n", event->event, + &conn->c_faddr); + break; + } +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_ib_setup_qp(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_ib_device *rds_ibdev; + int ret; + + /* rds_ib_add_one creates a rds_ib_device object per IB device, + * and allocates a protection domain, memory range and FMR pool + * for each. If that fails for any reason, it will not register + * the rds_ibdev at all. + */ + rds_ibdev = ib_get_client_data(dev, &rds_ib_client); + if (rds_ibdev == NULL) { + if (printk_ratelimit()) + printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + ic->i_mr = rds_ibdev->mr; + + ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, 0); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, 0); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + rdsdebug("ib_create_cq recv failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + + /* XXX negotiate max send/recv with remote? */ + memset(&attr, 0, sizeof(attr)); + attr.event_handler = rds_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDS_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (ic->i_send_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (ic->i_recv_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (ic->i_ack == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); + if (ic->i_sends == NULL) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_ib_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); + if (ic->i_recvs == NULL) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_ib_recv_init_ring(ic); + rds_ib_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; + __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + struct rds_ib_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_ib_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)be64_to_cpu(lguid), + (unsigned long long)be64_to_cpu(fguid)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_ib_set_protocol(conn, version); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. */ + destroy = 0; + + err = rds_ib_setup_qp(conn); + if (err) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_ib_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rds_ib_setup_qp(conn); + if (ret) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_ib_conn_connect(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_ib_conn_shutdown(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int err = 0; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. + */ + rdsdebug("failed to disconnect, cm: %p err %d\n", + ic->i_cm_id, err); + } + + wait_event(rds_ib_ring_empty_wait, + rds_ib_ring_empty(&ic->i_send_ring) && + rds_ib_ring_empty(&ic->i_recv_ring)); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_ib_send_clear_ring(ic); + if (ic->i_recvs) + rds_ib_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* + * Move connection back to the nodev list. + */ + if (ic->rds_ibdev) { + + spin_lock_irq(&ic->rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&ic->rds_ibdev->spinlock); + + spin_lock_irq(&ib_nodev_conns_lock); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irq(&ib_nodev_conns_lock); + ic->rds_ibdev = NULL; + } + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_ibdev); + + /* Clear pending transmit */ + if (ic->i_rm) { + rds_message_put(ic->i_rm); + ic->i_rm = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_set_64bit(&ic->i_ack_next, 0); + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + if (ic->i_ibinc) { + rds_inc_put(&ic->i_ibinc->ii_inc); + ic->i_ibinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; +} + +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); + if (ic == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&ic->ib_node); + mutex_init(&ic->i_recv_mutex); + + /* + * rds_ib_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&ib_nodev_conns_lock, flags); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +void rds_ib_conn_free(void *arg) +{ + struct rds_ib_connection *ic = arg; + rdsdebug("ic %p\n", ic); + list_del(&ic->ib_node); + kfree(ic); +} + + +/* + * An error occurred on the connection + */ +void +__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 000000000000..69a6289ed672 --- /dev/null +++ b/net/rds/ib_rdma.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "rdma.h" +#include "ib.h" + + +/* + * This is stored as mr->r_trans_private. + */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct ib_fmr *fmr; + struct list_head list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + u64 *dma; + int sg_dma_len; +}; + +/* + * Our own little FMR pool + */ +struct rds_ib_mr_pool { + struct mutex flush_lock; /* serialize fmr invalidate */ + struct work_struct flush_worker; /* flush worker */ + + spinlock_t list_lock; /* protect variables below */ + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + struct list_head drop_list; /* MRs that have reached their max_maps limit */ + struct list_head free_list; /* unused MRs */ + struct list_head clean_list; /* unused & unamapped MRs */ + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; +}; + +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); +static void rds_ib_mr_pool_flush_worker(struct work_struct *work); + +static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + spin_unlock_irq(&rds_ibdev->spinlock); + return rds_ibdev; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); + } + + return NULL; +} + +static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + + i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + if (!i_ipaddr) + return -ENOMEM; + + i_ipaddr->ipaddr = ipaddr; + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + return 0; +} + +static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr, *next; + + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + break; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); +} + +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev_old; + + rds_ibdev_old = rds_ib_get_device(ipaddr); + if (rds_ibdev_old) + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); +} + +int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&ib_nodev_conns_lock); + BUG_ON(list_empty(&ib_nodev_conns)); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&ib_nodev_conns_lock); + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + ic->rds_ibdev = rds_ibdev; + + return 0; +} + +void rds_ib_remove_nodev_conns(void) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + INIT_LIST_HEAD(&ib_nodev_conns); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_ibdev->spinlock); + list_splice(&rds_ibdev->conn_list, &tmp_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->drop_list); + INIT_LIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + spin_lock_init(&pool->list_lock); + INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + pool->fmr_attr.max_pages = fmr_message_size; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift; + pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. + * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; + pool->max_items = rds_ibdev->max_fmrs; + + return pool; +} + +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->fmr_attr.max_pages; +} + +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + flush_workqueue(rds_wq); + rds_ib_flush_mr_pool(pool, 1); + BUG_ON(atomic_read(&pool->item_count)); + BUG_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + if (!list_empty(&pool->clean_list)) { + ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); + list_del_init(&ibmr->list); + } + spin_unlock_irqrestore(&pool->list_lock, flags); + + return ibmr; +} + +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr *ibmr = NULL; + int err = 0, iter = 0; + + while (1) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + rds_ib_flush_mr_pool(pool, 0); + } + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE), + &pool->fmr_attr); + if (IS_ERR(ibmr->fmr)) { + err = PTR_ERR(ibmr->fmr); + ibmr->fmr = NULL; + printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); + goto out_no_cigar; + } + + rds_ib_stats_inc(s_ib_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + if (ibmr->fmr) + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~rds_ibdev->fmr_page_mask) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> rds_ibdev->fmr_page_shift; + if (page_cnt > fmr_message_size) + return -EINVAL; + + dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size) + dma_pages[page_cnt++] = + (dma_addr & rds_ibdev->fmr_page_mask) + j; + } + + ret = ib_map_phys_fmr(ibmr->fmr, + dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + rds_ib_stats_inc(s_ib_rdma_mr_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +void rds_ib_sync_mr(void *trans_private, int direction) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_device *rds_ibdev = ibmr->device; + + if (ibmr->sg_dma_len) { + ib_dma_unmap_sg(rds_ibdev->dev, + ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + } + + /* Release the s/g list */ + if (ibmr->sg_len) { + unsigned int i; + + for (i = 0; i < ibmr->sg_len; ++i) { + struct page *page = sg_page(&ibmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + set_page_dirty(page); + put_page(page); + } + kfree(ibmr->sg); + + ibmr->sg = NULL; + ibmr->sg_len = 0; + } +} + +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + unsigned int pinned = ibmr->sg_len; + + __rds_ib_teardown_mr(ibmr); + if (pinned) { + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + atomic_sub(pinned, &pool->free_pinned); + } +} + +static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) +{ + struct rds_ib_mr *ibmr, *next; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + unsigned long unpinned = 0; + unsigned long flags; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + + mutex_lock(&pool->flush_lock); + + spin_lock_irqsave(&pool->list_lock, flags); + /* Get the list of all MRs to be dropped. Ordering matters - + * we want to put drop_list ahead of free_list. */ + list_splice_init(&pool->free_list, &unmap_list); + list_splice_init(&pool->drop_list, &unmap_list); + if (free_all) + list_splice_init(&pool->clean_list, &unmap_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + free_goal = rds_ib_flush_goal(pool, free_all); + + if (list_empty(&unmap_list)) + goto out; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, &unmap_list, list) + list_add(&ibmr->fmr->list, &fmr_list); + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, &unmap_list, list) { + unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { + rds_ib_stats_inc(s_ib_rdma_mr_free); + list_del(&ibmr->list); + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + nfreed++; + } + ncleaned++; + } + + spin_lock_irqsave(&pool->list_lock, flags); + list_splice(&unmap_list, &pool->clean_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + +out: + mutex_unlock(&pool->flush_lock); + return ret; +} + +static void rds_ib_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); + + rds_ib_flush_mr_pool(pool, 0); +} + +void rds_ib_free_mr(void *trans_private, int invalidate) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + unsigned long flags; + + rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + + /* Return it to the pool's free list */ + spin_lock_irqsave(&pool->list_lock, flags); + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + list_add(&ibmr->list, &pool->drop_list); + else + list_add(&ibmr->list, &pool->free_list); + + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + spin_unlock_irqrestore(&pool->list_lock, flags); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned + || atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_work(rds_wq, &pool->flush_worker); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_ib_flush_mr_pool(pool, 0); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. */ + queue_work(rds_wq, &pool->flush_worker); + } + } +} + +void rds_ib_flush_mrs(void) +{ + struct rds_ib_device *rds_ibdev; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + if (pool) + rds_ib_flush_mr_pool(pool, 0); + } +} + +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; + int ret; + + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + if (!rds_ibdev) { + ret = -ENODEV; + goto out; + } + + if (!rds_ibdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_ib_alloc_fmr(rds_ibdev); + if (IS_ERR(ibmr)) + return ibmr; + + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->fmr->rkey; + else + printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); + + ibmr->device = rds_ibdev; + + out: + if (ret) { + if (ibmr) + rds_ib_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + return ibmr; +} diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 000000000000..5061b5502162 --- /dev/null +++ b/net/rds/ib_recv.c @@ -0,0 +1,869 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <rdma/rdma_cm.h> + +#include "rds.h" +#include "ib.h" + +static struct kmem_cache *rds_ib_incoming_slab; +static struct kmem_cache *rds_ib_frag_slab; +static atomic_t rds_ib_allocation = ATOMIC_INIT(0); + +static void rds_ib_frag_drop_page(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + __free_page(frag->f_page); + frag->f_page = NULL; +} + +static void rds_ib_frag_free(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + BUG_ON(frag->f_page != NULL); + kmem_cache_free(rds_ib_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. + * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. + */ +static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + struct rds_page_frag *frag = recv->r_frag; + + rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); + if (frag->f_mapped) + ib_dma_unmap_page(ic->i_cm_id->device, + frag->f_mapped, + RDS_FRAG_SIZE, DMA_FROM_DEVICE); + frag->f_mapped = 0; +} + +void rds_ib_recv_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + + sge = rds_ib_data_sge(ic, recv->r_sge); + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + } +} + +static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + rds_ib_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rds_ib_frag_drop_page(recv->r_frag); + rds_ib_frag_free(recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rds_ib_frag_drop_page(&ic->i_frag); +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, + gfp_t kptr_gfp, gfp_t page_gfp) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + dma_addr_t dma_addr; + struct ib_sge *sge; + int ret = -ENOMEM; + + if (recv->r_ibinc == NULL) { + if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + goto out; + } + recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, + kptr_gfp); + if (recv->r_ibinc == NULL) + goto out; + atomic_inc(&rds_ib_allocation); + INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); + rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); + } + + if (recv->r_frag == NULL) { + recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); + if (recv->r_frag == NULL) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (ic->i_frag.f_page == NULL) { + ic->i_frag.f_page = alloc_page(page_gfp); + if (ic->i_frag.f_page == NULL) + goto out; + ic->i_frag.f_offset = 0; + } + + dma_addr = ib_dma_map_page(ic->i_cm_id->device, + ic->i_frag.f_page, + ic->i_frag.f_offset, + RDS_FRAG_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + goto out; + + /* + * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. + */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = dma_addr; + + sge = rds_ib_data_sge(ic, recv->r_sge); + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + + sge = rds_ib_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + get_page(recv->r_frag->f_page); + + if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDS_FRAG_SIZE; + } else { + put_page(ic->i_frag.f_page); + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) + && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, + recv->r_ibinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_ib_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_ib_advertise_credits(conn, posted); + + if (ret) + rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +void rds_ib_inc_purge(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); + + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_drop_page(frag); + rds_ib_frag_free(frag); + } +} + +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + rds_ib_inc_purge(inc); + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + BUG_ON(!list_empty(&ibinc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, ibinc); + atomic_dec(&rds_ib_allocation); + BUG_ON(atomic_read(&rds_ib_allocation) < 0); +} + +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_ib_recv_init_ack(struct rds_ib_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IB_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + rds_ib_set_64bit(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return ic->i_ack_next; +} + +static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_ib_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_ib_stats_inc(s_ib_ack_send_failure); + /* Need to finesse this later. */ + BUG(); + } else + rds_ib_stats_inc(s_ib_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_ib_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_ib_attempt_ack(struct rds_ib_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) { + rds_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_ib_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_ib_ack_send_complete(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_ib_stats_inc(s_ib_ack_send_piggybacked); + return rds_ib_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_ib_cong_recv(struct rds_connection *conn, + struct rds_ib_incoming *ibinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_ib_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 byte_len, + struct rds_ib_ack_state *state) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_ib_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_ib_stats_inc(s_ib_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_ib_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. + */ + if (ibinc == NULL) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { + rds_ib_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&ibinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_ib_ack_state state = { 0, }; + struct rds_ib_recv_work *recv; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_rx_cq_call); + + ib_req_notify_cq(cq, IB_CQ_SOLICITED); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_rx_cq_event); + + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + + rds_ib_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, &state); + } else { + rds_ib_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_ib_ring_free(&ic->i_recv_ring, 1); + } + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_ib_ring_empty(&ic->i_recv_ring)) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. + */ + if (rds_ib_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_ib_recv(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_ib_stats_inc(s_ib_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + return ret; +} + +int __init rds_ib_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, 0, NULL); + if (rds_ib_incoming_slab == NULL) + goto out; + + rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (rds_ib_frag_slab == NULL) + kmem_cache_destroy(rds_ib_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_ib_recv_exit(void) +{ + kmem_cache_destroy(rds_ib_incoming_slab); + kmem_cache_destroy(rds_ib_frag_slab); +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 000000000000..99a6ccae964c --- /dev/null +++ b/net/rds/ib_ring.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "ib.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); + +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) == 0; +} + +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_ib_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_ib_ring_empty(ring) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); +} + +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_empty(ring); +} + +int rds_ib_ring_low(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 000000000000..cb6c52cb1c4c --- /dev/null +++ b/net/rds/ib_send.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/device.h> +#include <linux/dmapool.h> + +#include "rds.h" +#include "rdma.h" +#include "ib.h" + +static void rds_ib_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rds_rdma_op *op) +{ + if (op->r_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, + op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->r_mapped = 0; + } +} + +static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_nents, + DMA_TO_DEVICE); + + if (rm->m_rdma_op != NULL) { + rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_rdma_complete(rm, wc_status); + + if (rm->m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_ib_send_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_ib_data_sge(ic, send->s_sge); + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + } +} + +void rds_ib_send_clear_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_ib_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_ib_send_work *send; + u32 completed; + u32 oldest; + u32 i = 0; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_ib_stats_inc(s_ib_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc.wr_id == RDS_IB_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + continue; + } + + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_ib_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. */ + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (send->s_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_ib_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) + || test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the reciever's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_ib_stats_inc(s_ib_rx_credit_updates); +} + +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_ib_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = ic->i_mr->lkey; + + sge = rds_ib_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (ic->i_rm == NULL) { + /* + printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->m_nents) { + rm->m_count = ib_dma_map_sg(dev, + rm->m_sg, rm->m_nents, DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); + if (rm->m_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->m_count = 0; + } + + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->m_rdma_op) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_ib_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_ib_send_grab_credits(ic, 0, &posted, 1); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } else if (ic->i_rm != rm) + BUG(); + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->m_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. + */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_ib_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. */ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_ib_stats_inc(s_ib_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->m_sg[rm->m_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_ib_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + /* Finesse this later */ + BUG(); + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->r_remote_addr; + u32 pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + /* map the message the first time we see it */ + if (!op->r_mapped) { + op->r_count = ib_dma_map_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, (op->r_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); + if (op->r_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->r_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->r_count, rds_ibdev->max_sge); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &op->r_sg[0]; + sent = 0; + num_sge = op->r_count; + + for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->r_key; + send->s_op = op; + + if (num_sge > rds_ibdev->max_sge) { + send->s_wr.num_sge = rds_ibdev->max_sge; + num_sge -= rds_ibdev->max_sge; + } else { + send->s_wr.num_sge = num_sge; + } + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + send->s_sge[j].addr = + ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = ic->i_mr->lkey; + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + + remote_addr += len; + scat++; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->r_sg[op->r_count]) + prev->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + + if (unlikely(failed_wr != &first->s_wr)) { + printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &first->s_wr); + } + + +out: + return ret; +} + +void rds_ib_xmit_complete(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_ib_attempt_ack(ic); +} diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 000000000000..02e3e3d50d4a --- /dev/null +++ b/net/rds/ib_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" +#include "ib.h" + +DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; + +static char *rds_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "ib_tx_cq_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_call", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_alloc", + "ib_rdma_mr_free", + "ib_rdma_mr_used", + "ib_rdma_mr_pool_flush", + "ib_rdma_mr_pool_wait", + "ib_rdma_mr_pool_depleted", +}; + +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_ib_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, + ARRAY_SIZE(rds_ib_stat_names)); +out: + return ARRAY_SIZE(rds_ib_stat_names); +} diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 000000000000..d87830db93a0 --- /dev/null +++ b/net/rds/ib_sysctl.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + +#include "ib.h" + +static struct ctl_table_header *rds_ib_sysctl_hdr; + +unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; +unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; +unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_ib_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_ib_sysctl_max_unsig_wrs = 16; +static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_ib_sysctl_flow_control = 1; + +ctl_table rds_ib_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_send_wr", + .data = &rds_ib_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_wr", + .data = &rds_ib_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_wr", + .data = &rds_ib_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_wr_min, + .extra2 = &rds_ib_sysctl_max_unsig_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_bytes", + .data = &rds_ib_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, + .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_allocation", + .data = &rds_ib_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "flow_control", + .data = &rds_ib_sysctl_flow_control, + .maxlen = sizeof(rds_ib_sysctl_flow_control), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0} +}; + +static struct ctl_path rds_ib_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, + { .procname = "ib", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + +void rds_ib_sysctl_exit(void) +{ + if (rds_ib_sysctl_hdr) + unregister_sysctl_table(rds_ib_sysctl_hdr); +} + +int __init rds_ib_sysctl_init(void) +{ + rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); + if (rds_ib_sysctl_hdr == NULL) + return -ENOMEM; + return 0; +} diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 000000000000..1d885535214d --- /dev/null +++ b/net/rds/info.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. + * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +struct rds_info_iterator { + struct page **pages; + void *addr; + unsigned long offset; +}; + +static DEFINE_SPINLOCK(rds_info_lock); +static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; + +void rds_info_register_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != NULL); + rds_info_funcs[offset] = func; + spin_unlock(&rds_info_lock); +} + +void rds_info_deregister_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != func); + rds_info_funcs[offset] = NULL; + spin_unlock(&rds_info_lock); +} + +/* + * Typically we hold an atomic kmap across multiple rds_info_copy() calls + * because the kmap is so expensive. This must be called before using blocking + * operations while holding the mapping and as the iterator is torn down. + */ +void rds_info_iter_unmap(struct rds_info_iterator *iter) +{ + if (iter->addr != NULL) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + } +} + +/* + * get_user_pages() called flush_dcache_page() on the pages for us. + */ +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes) +{ + unsigned long this; + + while (bytes) { + if (iter->addr == NULL) + iter->addr = kmap_atomic(*iter->pages, KM_USER0); + + this = min(bytes, PAGE_SIZE - iter->offset); + + rdsdebug("page %p addr %p offset %lu this %lu data %p " + "bytes %lu\n", *iter->pages, iter->addr, + iter->offset, this, data, bytes); + + memcpy(iter->addr + iter->offset, data, this); + + data += this; + bytes -= this; + iter->offset += this; + + if (iter->offset == PAGE_SIZE) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + iter->offset = 0; + iter->pages++; + } + } +} + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. + */ +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen) +{ + struct rds_info_iterator iter; + struct rds_info_lengths lens; + unsigned long nr_pages = 0; + unsigned long start; + unsigned long i; + rds_info_func func; + struct page **pages = NULL; + int ret; + int len; + int total; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + /* check for all kinds of wrapping and the like */ + start = (unsigned long)optval; + if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + ret = -EINVAL; + goto out; + } + + /* a 0 len call is just trying to probe its length */ + if (len == 0) + goto call_func; + + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) + >> PAGE_SHIFT; + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0, + pages, NULL); + up_read(¤t->mm->mmap_sem); + if (ret != nr_pages) { + if (ret > 0) + nr_pages = ret; + else + nr_pages = 0; + ret = -EAGAIN; /* XXX ? */ + goto out; + } + + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); + +call_func: + func = rds_info_funcs[optname - RDS_INFO_FIRST]; + if (func == NULL) { + ret = -ENOPROTOOPT; + goto out; + } + + iter.pages = pages; + iter.addr = NULL; + iter.offset = start & (PAGE_SIZE - 1); + + func(sock, len, &iter, &lens); + BUG_ON(lens.each == 0); + + total = lens.nr * lens.each; + + rds_info_iter_unmap(&iter); + + if (total > len) { + len = total; + ret = -ENOSPC; + } else { + len = total; + ret = lens.each; + } + + if (put_user(len, optlen)) + ret = -EFAULT; + +out: + for (i = 0; pages != NULL && i < nr_pages; i++) + put_page(pages[i]); + kfree(pages); + + return ret; +} diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 000000000000..b6c052ca7d22 --- /dev/null +++ b/net/rds/info.h @@ -0,0 +1,30 @@ +#ifndef _RDS_INFO_H +#define _RDS_INFO_H + +struct rds_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rds_info_iterator; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. + */ +typedef void (*rds_info_func)(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens); + +void rds_info_register_func(int optname, rds_info_func func); +void rds_info_deregister_func(int optname, rds_info_func func); +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen); +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes); +void rds_info_iter_unmap(struct rds_info_iterator *iter); + + +#endif diff --git a/net/rds/iw.c b/net/rds/iw.c new file mode 100644 index 000000000000..1b56905c4c08 --- /dev/null +++ b/net/rds/iw.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/if_arp.h> +#include <linux/delay.h> + +#include "rds.h" +#include "iw.h" + +unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; +unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fastreg_pool_size, int, 0444); +MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); +module_param(fastreg_message_size, int, 0444); +MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); + +struct list_head rds_iw_devices; + +DEFINE_SPINLOCK(iw_nodev_conns_lock); +LIST_HEAD(iw_nodev_conns); + +void rds_iw_add_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct ib_device_attr *dev_attr; + + /* Only handle iwarp devices */ + if (device->node_type != RDMA_NODE_RNIC) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); + if (!rds_iwdev) + goto free_attr; + + spin_lock_init(&rds_iwdev->spinlock); + + rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = dev_attr->max_qp_wr; + rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + + rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1); + + rds_iwdev->dev = device; + rds_iwdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_iwdev->pd)) + goto free_dev; + + if (!rds_iwdev->dma_local_lkey) { + if (device->node_type != RDMA_NODE_RNIC) { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_LOCAL_WRITE); + } else { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); + } + if (IS_ERR(rds_iwdev->mr)) + goto err_pd; + } else + rds_iwdev->mr = NULL; + + rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); + if (IS_ERR(rds_iwdev->mr_pool)) { + rds_iwdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_iwdev->cm_id_list); + INIT_LIST_HEAD(&rds_iwdev->conn_list); + list_add_tail(&rds_iwdev->list, &rds_iw_devices); + + ib_set_client_data(device, &rds_iw_client, rds_iwdev); + + goto free_attr; + +err_mr: + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); +err_pd: + ib_dealloc_pd(rds_iwdev->pd); +free_dev: + kfree(rds_iwdev); +free_attr: + kfree(dev_attr); +} + +void rds_iw_remove_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_cm_id *i_cm_id, *next; + + rds_iwdev = ib_get_client_data(device, &rds_iw_client); + if (!rds_iwdev) + return; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + } + spin_unlock_irq(&rds_iwdev->spinlock); + + rds_iw_remove_conns(rds_iwdev); + + if (rds_iwdev->mr_pool) + rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); + + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); + + while (ib_dealloc_pd(rds_iwdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); + msleep(1); + } + + list_del(&rds_iwdev->list); + kfree(rds_iwdev); +} + +struct ib_client rds_iw_client = { + .name = "rds_iw", + .add = rds_iw_add_one, + .remove = rds_iw_remove_one +}; + +static int rds_iw_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_iw_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_iw_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_iw_device *rds_iwdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_iwdev->max_sge; + rds_iw_get_mr_info(rds_iwdev, iinfo); + } + return 1; +} + +static void rds_iw_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_iw_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_iw_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + if (!cm_id) + return -EADDRNOTAVAIL; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support IB devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_iw_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + rds_iw_remove_nodev_conns(); + ib_unregister_client(&rds_iw_client); + rds_iw_sysctl_exit(); + rds_iw_recv_exit(); + rds_trans_unregister(&rds_iw_transport); +} + +struct rds_transport rds_iw_transport = { + .laddr_check = rds_iw_laddr_check, + .xmit_complete = rds_iw_xmit_complete, + .xmit = rds_iw_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rds_iw_xmit_rdma, + .recv = rds_iw_recv, + .conn_alloc = rds_iw_conn_alloc, + .conn_free = rds_iw_conn_free, + .conn_connect = rds_iw_conn_connect, + .conn_shutdown = rds_iw_conn_shutdown, + .inc_copy_to_user = rds_iw_inc_copy_to_user, + .inc_purge = rds_iw_inc_purge, + .inc_free = rds_iw_inc_free, + .cm_initiate_connect = rds_iw_cm_initiate_connect, + .cm_handle_connect = rds_iw_cm_handle_connect, + .cm_connect_complete = rds_iw_cm_connect_complete, + .stats_info_copy = rds_iw_stats_info_copy, + .exit = rds_iw_exit, + .get_mr = rds_iw_get_mr, + .sync_mr = rds_iw_sync_mr, + .free_mr = rds_iw_free_mr, + .flush_mrs = rds_iw_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "iwarp", + .t_prefer_loopback = 1, +}; + +int __init rds_iw_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_iw_devices); + + ret = ib_register_client(&rds_iw_client); + if (ret) + goto out; + + ret = rds_iw_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_iw_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_iw_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + + goto out; + +out_recv: + rds_iw_recv_exit(); +out_sysctl: + rds_iw_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_iw_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/iw.h b/net/rds/iw.h new file mode 100644 index 000000000000..0ddda34f2a1c --- /dev/null +++ b/net/rds/iw.h @@ -0,0 +1,395 @@ +#ifndef _RDS_IW_H +#define _RDS_IW_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FASTREG_SIZE 20 +#define RDS_FASTREG_POOL_SIZE 2048 + +#define RDS_IW_MAX_SGE 8 +#define RDS_IW_RECV_SGE 2 + +#define RDS_IW_DEFAULT_RECV_WR 1024 +#define RDS_IW_DEFAULT_SEND_WR 256 + +#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_iw_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_iw_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_iw_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_iw_scatterlist { + struct scatterlist *list; + unsigned int len; + int dma_len; + unsigned int dma_npages; + unsigned int bytes; +}; + +struct rds_iw_mapping { + spinlock_t m_lock; /* protect the mapping struct */ + struct list_head m_list; + struct rds_iw_mr *m_mr; + uint32_t m_rkey; + struct rds_iw_scatterlist m_sg; +}; + +struct rds_iw_send_work { + struct rds_message *s_rm; + + /* We should really put these into a union: */ + struct rds_rdma_op *s_op; + struct rds_iw_mapping *s_mapping; + struct ib_mr *s_mr; + struct ib_fast_reg_page_list *s_page_list; + unsigned char s_remap_count; + + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IW_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_iw_recv_work { + struct rds_iw_incoming *r_iwinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_iw_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_iw_device; + +struct rds_iw_connection { + + struct list_head iw_node; + struct rds_iw_device *rds_iwdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_iw_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_iw_send_work *i_sends; + + /* rx */ + struct mutex i_recv_mutex; + struct rds_iw_work_ring i_recv_ring; + struct rds_iw_incoming *i_iwinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_iw_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK received */ + + /* sending acks */ + unsigned long i_ack_flags; + u64 i_ack_next; /* next ACK to send */ + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. + * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + unsigned int i_dma_local_lkey:1; + unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_iw_cm_id { + struct list_head list; + struct rdma_cm_id *cm_id; +}; + +struct rds_iw_device { + struct list_head list; + struct list_head cm_id_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_iw_mr_pool *mr_pool; + int page_shift; + int max_sge; + unsigned int max_wrs; + unsigned int dma_local_lkey:1; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) +#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) +#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) + +struct rds_iw_statistics { + uint64_t s_iw_connect_raced; + uint64_t s_iw_listen_closed_stale; + uint64_t s_iw_tx_cq_call; + uint64_t s_iw_tx_cq_event; + uint64_t s_iw_tx_ring_full; + uint64_t s_iw_tx_throttle; + uint64_t s_iw_tx_sg_mapping_failure; + uint64_t s_iw_tx_stalled; + uint64_t s_iw_tx_credit_updates; + uint64_t s_iw_rx_cq_call; + uint64_t s_iw_rx_cq_event; + uint64_t s_iw_rx_ring_empty; + uint64_t s_iw_rx_refill_from_cq; + uint64_t s_iw_rx_refill_from_thread; + uint64_t s_iw_rx_alloc_limit; + uint64_t s_iw_rx_credit_updates; + uint64_t s_iw_ack_sent; + uint64_t s_iw_ack_send_failure; + uint64_t s_iw_ack_send_delayed; + uint64_t s_iw_ack_send_piggybacked; + uint64_t s_iw_ack_received; + uint64_t s_iw_rdma_mr_alloc; + uint64_t s_iw_rdma_mr_free; + uint64_t s_iw_rdma_mr_used; + uint64_t s_iw_rdma_mr_pool_flush; + uint64_t s_iw_rdma_mr_pool_wait; + uint64_t s_iw_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_iw_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu + +static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device + +static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) +{ + return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; +} + +/* ib.c */ +extern struct rds_transport rds_iw_transport; +extern void rds_iw_add_one(struct ib_device *device); +extern void rds_iw_remove_one(struct ib_device *device); +extern struct ib_client rds_iw_client; + +extern unsigned int fastreg_pool_size; +extern unsigned int fastreg_message_size; + +extern spinlock_t iw_nodev_conns_lock; +extern struct list_head iw_nodev_conns; + +/* ib_cm.c */ +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_iw_conn_free(void *arg); +int rds_iw_conn_connect(struct rds_connection *conn); +void rds_iw_conn_shutdown(struct rds_connection *conn); +void rds_iw_state_change(struct sock *sk); +int __init rds_iw_listen_init(void); +void rds_iw_listen_stop(void); +void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_iw_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_iw_conn_error(conn, fmt...) \ + __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) + +/* ib_rdma.c */ +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); +int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void rds_iw_remove_nodev_conns(void); +void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev); +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_iw_sync_mr(void *trans_private, int dir); +void rds_iw_free_mr(void *trans_private, int invalidate); +void rds_iw_flush_mrs(void); +void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); + +/* ib_recv.c */ +int __init rds_iw_recv_init(void); +void rds_iw_recv_exit(void); +int rds_iw_recv(struct rds_connection *conn); +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_iw_inc_purge(struct rds_incoming *inc); +void rds_iw_inc_free(struct rds_incoming *inc); +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_recv_init_ring(struct rds_iw_connection *ic); +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); +void rds_iw_recv_init_ack(struct rds_iw_connection *ic); +void rds_iw_attempt_ack(struct rds_iw_connection *ic); +void rds_iw_ack_send_complete(struct rds_iw_connection *ic); +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); + +/* ib_ring.c */ +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); +int rds_iw_ring_empty(struct rds_iw_work_ring *ring); +int rds_iw_ring_low(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_iw_ring_empty_wait; + +/* ib_send.c */ +void rds_iw_xmit_complete(struct rds_connection *conn); +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_send_init_ring(struct rds_iw_connection *ic); +void rds_iw_send_clear_ring(struct rds_iw_connection *ic); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); +#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int __init rds_iw_sysctl_init(void); +void rds_iw_sysctl_exit(void); +extern unsigned long rds_iw_sysctl_max_send_wr; +extern unsigned long rds_iw_sysctl_max_recv_wr; +extern unsigned long rds_iw_sysctl_max_unsig_wrs; +extern unsigned long rds_iw_sysctl_max_unsig_bytes; +extern unsigned long rds_iw_sysctl_max_recv_allocation; +extern unsigned int rds_iw_sysctl_flow_control; +extern ctl_table rds_iw_sysctl_table[]; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * +rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +static inline void rds_iw_set_64bit(u64 *ptr, u64 val) +{ +#if BITS_PER_LONG == 64 + *ptr = val; +#else + set_64bit(ptr, val); +#endif +} + +#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c new file mode 100644 index 000000000000..57ecb3d4b8a5 --- /dev/null +++ b/net/rds/iw_cm.c @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/vmalloc.h> + +#include "rds.h" +#include "iw.h" + +/* + * Set the selected protocol version + */ +static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (rds_iw_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_iw_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = NULL; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_iw_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + /* update ib_device with this local ipaddr & conn */ + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); + if (err) + printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); + err = rds_iw_add_conn(rds_iwdev, conn); + if (err) + printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + rds_connect_complete(conn); +} + +static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_iw_connect_private *dp, + u32 protocol_version) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? */ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_iw_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_iw_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_iw_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_FATAL: + default: + rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", + event->event, &conn->c_laddr, + &conn->c_faddr); + break; + } +} + +/* + * Create a QP + */ +static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, + struct rds_iw_device *rds_iwdev, + struct rds_iw_work_ring *send_ring, + void (*send_cq_handler)(struct ib_cq *, void *), + struct rds_iw_work_ring *recv_ring, + void (*recv_cq_handler)(struct ib_cq *, void *), + void *context) +{ + struct ib_device *dev = rds_iwdev->dev; + unsigned int send_size, recv_size; + int ret; + + /* The offset of 1 is to accomodate the additional ACK WR. */ + send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); + recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); + rds_iw_ring_resize(send_ring, send_size - 1); + rds_iw_ring_resize(recv_ring, recv_size - 1); + + memset(attr, 0, sizeof(*attr)); + attr->event_handler = rds_iw_qp_event_handler; + attr->qp_context = context; + attr->cap.max_send_wr = send_size; + attr->cap.max_recv_wr = recv_size; + attr->cap.max_send_sge = rds_iwdev->max_sge; + attr->cap.max_recv_sge = RDS_IW_RECV_SGE; + attr->sq_sig_type = IB_SIGNAL_REQ_WR; + attr->qp_type = IB_QPT_RC; + + attr->send_cq = ib_create_cq(dev, send_cq_handler, + rds_iw_cq_event_handler, + context, send_size, 0); + if (IS_ERR(attr->send_cq)) { + ret = PTR_ERR(attr->send_cq); + attr->send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + attr->recv_cq = ib_create_cq(dev, recv_cq_handler, + rds_iw_cq_event_handler, + context, recv_size, 0); + if (IS_ERR(attr->recv_cq)) { + ret = PTR_ERR(attr->recv_cq); + attr->recv_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + +out: + if (ret) { + if (attr->send_cq) + ib_destroy_cq(attr->send_cq); + if (attr->recv_cq) + ib_destroy_cq(attr->recv_cq); + } + return ret; +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_iw_setup_qp(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_iw_device *rds_iwdev; + int ret; + + /* rds_iw_add_one creates a rds_iw_device object per IB device, + * and allocates a protection domain, memory range and MR pool + * for each. If that fails for any reason, it will not register + * the rds_iwdev at all. + */ + rds_iwdev = ib_get_client_data(dev, &rds_iw_client); + if (rds_iwdev == NULL) { + if (printk_ratelimit()) + printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + /* Protection domain and memory range */ + ic->i_pd = rds_iwdev->pd; + ic->i_mr = rds_iwdev->mr; + + ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, + &ic->i_send_ring, rds_iw_send_cq_comp_handler, + &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, + conn); + if (ret < 0) + goto out; + + ic->i_send_cq = attr.send_cq; + ic->i_recv_cq = attr.recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (ic->i_send_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (ic->i_recv_hdrs == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (ic->i_ack == NULL) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); + if (ic->i_sends == NULL) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_iw_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); + if (ic->i_recvs == NULL) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_iw_recv_init_ring(ic); + rds_iw_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = event->param.conn.private_data; + struct rds_iw_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_iw_connection *ic = NULL; + struct rdma_conn_param conn_param; + struct rds_iw_device *rds_iwdev; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_iw_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", + &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_iw_stats_inc(s_iw_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_iw_stats_inc(s_iw_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_iw_set_protocol(conn, version); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. */ + destroy = 0; + + err = rds_iw_setup_qp(conn); + if (err) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_iw_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ + + ret = rds_iw_setup_qp(conn); + if (ret) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + struct rds_iw_connection *ic = conn->c_transport_data; + + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_iw_conn_connect(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + /* First, bind to the local address and device. */ + ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); + if (ret) { + rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", + &conn->c_laddr, ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + goto out; + } + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_iw_conn_shutdown(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int err = 0; + struct ib_qp_attr qp_attr; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. + */ + rdsdebug("rds_iw_conn_shutdown: failed to disconnect," + " cm: %p err %d\n", ic->i_cm_id, err); + } + + if (ic->i_cm_id->qp) { + qp_attr.qp_state = IB_QPS_ERR; + ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + } + + wait_event(rds_iw_ring_empty_wait, + rds_iw_ring_empty(&ic->i_send_ring) && + rds_iw_ring_empty(&ic->i_recv_ring)); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_iw_send_clear_ring(ic); + if (ic->i_recvs) + rds_iw_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + + /* + * If associated with an rds_iw_device: + * Move connection back to the nodev list. + * Remove cm_id from the device cm_id list. + */ + if (ic->rds_iwdev) { + + spin_lock_irq(&ic->rds_iwdev->spinlock); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + spin_unlock_irq(&ic->rds_iwdev->spinlock); + + spin_lock_irq(&iw_nodev_conns_lock); + list_add_tail(&ic->iw_node, &iw_nodev_conns); + spin_unlock_irq(&iw_nodev_conns_lock); + rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); + ic->rds_iwdev = NULL; + } + + rdma_destroy_id(ic->i_cm_id); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_iwdev); + + /* Clear pending transmit */ + if (ic->i_rm) { + rds_message_put(ic->i_rm); + ic->i_rm = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_iw_set_64bit(&ic->i_ack_next, 0); + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + if (ic->i_iwinc) { + rds_inc_put(&ic->i_iwinc->ii_inc); + ic->i_iwinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; + rdsdebug("shutdown complete\n"); +} + +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_iw_connection *ic; + unsigned long flags; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); + if (ic == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&ic->iw_node); + mutex_init(&ic->i_recv_mutex); + + /* + * rds_iw_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&iw_nodev_conns_lock, flags); + list_add_tail(&ic->iw_node, &iw_nodev_conns); + spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +void rds_iw_conn_free(void *arg) +{ + struct rds_iw_connection *ic = arg; + rdsdebug("ic %p\n", ic); + list_del(&ic->iw_node); + kfree(ic); +} + +/* + * An error occurred on the connection + */ +void +__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c new file mode 100644 index 000000000000..1c02a8f952d0 --- /dev/null +++ b/net/rds/iw_rdma.c @@ -0,0 +1,888 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "rdma.h" +#include "iw.h" + + +/* + * This is stored as mr->r_trans_private. + */ +struct rds_iw_mr { + struct rds_iw_device *device; + struct rds_iw_mr_pool *pool; + struct rdma_cm_id *cm_id; + + struct ib_mr *mr; + struct ib_fast_reg_page_list *page_list; + + struct rds_iw_mapping mapping; + unsigned char remap_count; +}; + +/* + * Our own little MR pool + */ +struct rds_iw_mr_pool { + struct rds_iw_device *device; /* back ptr to the device that owns us */ + + struct mutex flush_lock; /* serialize fmr invalidate */ + struct work_struct flush_worker; /* flush worker */ + + spinlock_t list_lock; /* protect variables below */ + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + struct list_head dirty_list; /* dirty mappings */ + struct list_head clean_list; /* unused & unamapped MRs */ + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_message_size; /* in pages */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + int max_pages; +}; + +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); +static void rds_iw_mr_pool_flush_worker(struct work_struct *work); +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, unsigned int nents); +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list); +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); + +static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +{ + struct rds_iw_device *iwdev; + struct rds_iw_cm_id *i_cm_id; + + *rds_iwdev = NULL; + *cm_id = NULL; + + list_for_each_entry(iwdev, &rds_iw_devices, list) { + spin_lock_irq(&iwdev->spinlock); + list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { + struct sockaddr_in *src_addr, *dst_addr; + + src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; + + rdsdebug("local ipaddr = %x port %d, " + "remote ipaddr = %x port %d" + "..looking for %x port %d, " + "remote ipaddr = %x port %d\n", + src_addr->sin_addr.s_addr, + src_addr->sin_port, + dst_addr->sin_addr.s_addr, + dst_addr->sin_port, + rs->rs_bound_addr, + rs->rs_bound_port, + rs->rs_conn_addr, + rs->rs_conn_port); +#ifdef WORKING_TUPLE_DETECTION + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && + src_addr->sin_port == rs->rs_bound_port && + dst_addr->sin_addr.s_addr == rs->rs_conn_addr && + dst_addr->sin_port == rs->rs_conn_port) { +#else + /* FIXME - needs to compare the local and remote + * ipaddr/port tuple, but the ipaddr is the only + * available infomation in the rds_sock (as the rest are + * zero'ed. It doesn't appear to be properly populated + * during connection setup... + */ + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { +#endif + spin_unlock_irq(&iwdev->spinlock); + *rds_iwdev = iwdev; + *cm_id = i_cm_id->cm_id; + return 0; + } + } + spin_unlock_irq(&iwdev->spinlock); + } + + return 1; +} + +static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); + if (!i_cm_id) + return -ENOMEM; + + i_cm_id->cm_id = cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); + spin_unlock_irq(&rds_iwdev->spinlock); + + return 0; +} + +void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { + if (i_cm_id->cm_id == cm_id) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + break; + } + } + spin_unlock_irq(&rds_iwdev->spinlock); +} + + +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct sockaddr_in *src_addr, *dst_addr; + struct rds_iw_device *rds_iwdev_old; + struct rds_sock rs; + struct rdma_cm_id *pcm_id; + int rc; + + src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; + + rs.rs_bound_addr = src_addr->sin_addr.s_addr; + rs.rs_bound_port = src_addr->sin_port; + rs.rs_conn_addr = dst_addr->sin_addr.s_addr; + rs.rs_conn_port = dst_addr->sin_port; + + rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + if (rc) + rds_iw_remove_cm_id(rds_iwdev, cm_id); + + return rds_iw_add_cm_id(rds_iwdev, cm_id); +} + +int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&iw_nodev_conns_lock); + BUG_ON(list_empty(&iw_nodev_conns)); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + spin_unlock_irq(&iw_nodev_conns_lock); + + spin_lock_irq(&rds_iwdev->spinlock); + list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); + spin_unlock_irq(&rds_iwdev->spinlock); + + ic->rds_iwdev = rds_iwdev; + + return 0; +} + +void rds_iw_remove_nodev_conns(void) +{ + struct rds_iw_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&iw_nodev_conns_lock); + list_splice(&iw_nodev_conns, &tmp_list); + INIT_LIST_HEAD(&iw_nodev_conns); + spin_unlock_irq(&iw_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_iwdev->spinlock); + list_splice(&rds_iwdev->conn_list, &tmp_list); + INIT_LIST_HEAD(&rds_iwdev->conn_list); + spin_unlock_irq(&rds_iwdev->spinlock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { + if (ic->conn->c_passive) + rds_conn_destroy(ic->conn->c_passive); + rds_conn_destroy(ic->conn); + } +} + +static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, + struct scatterlist *list, unsigned int sg_len) +{ + sg->list = list; + sg->len = sg_len; + sg->dma_len = 0; + sg->dma_npages = 0; + sg->bytes = 0; +} + +static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, + struct rds_iw_scatterlist *sg, + unsigned int dma_page_shift) +{ + struct ib_device *dev = rds_iwdev->dev; + u64 *dma_pages = NULL; + u64 dma_mask; + unsigned int dma_page_size; + int i, j, ret; + + dma_page_size = 1 << dma_page_shift; + dma_mask = dma_page_size - 1; + + WARN_ON(sg->dma_len); + + sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + if (unlikely(!sg->dma_len)) { + printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); + return ERR_PTR(-EBUSY); + } + + sg->bytes = 0; + sg->dma_npages = 0; + + ret = -EINVAL; + for (i = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + sg->bytes += dma_len; + + end_addr = dma_addr + dma_len; + if (dma_addr & dma_mask) { + if (i > 0) + goto out_unmap; + dma_addr &= ~dma_mask; + } + if (end_addr & dma_mask) { + if (i < sg->dma_len - 1) + goto out_unmap; + end_addr = (end_addr + dma_mask) & ~dma_mask; + } + + sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift; + } + + /* Now gather the dma addrs into one list */ + if (sg->dma_npages > fastreg_message_size) + goto out_unmap; + + dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC); + if (!dma_pages) { + ret = -ENOMEM; + goto out_unmap; + } + + for (i = j = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + end_addr = dma_addr + dma_len; + dma_addr &= ~dma_mask; + for (; dma_addr < end_addr; dma_addr += dma_page_size) + dma_pages[j++] = dma_addr; + BUG_ON(j > sg->dma_npages); + } + + return dma_pages; + +out_unmap: + ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + sg->dma_len = 0; + kfree(dma_pages); + return ERR_PTR(ret); +} + + +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); + return ERR_PTR(-ENOMEM); + } + + pool->device = rds_iwdev; + INIT_LIST_HEAD(&pool->dirty_list); + INIT_LIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + spin_lock_init(&pool->list_lock); + INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); + + pool->max_message_size = fastreg_message_size; + pool->max_items = fastreg_pool_size; + pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; + pool->max_pages = fastreg_message_size; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. + * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = pool->max_items * 3 / 4; + + return pool; +} + +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->max_pages; +} + +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) +{ + flush_workqueue(rds_wq); + rds_iw_flush_mr_pool(pool, 1); + BUG_ON(atomic_read(&pool->item_count)); + BUG_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) +{ + struct rds_iw_mr *ibmr = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + if (!list_empty(&pool->clean_list)) { + ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); + list_del_init(&ibmr->mapping.m_list); + } + spin_unlock_irqrestore(&pool->list_lock, flags); + + return ibmr; +} + +static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + struct rds_iw_mr *ibmr = NULL; + int err = 0, iter = 0; + + while (1) { + ibmr = rds_iw_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); + rds_iw_flush_mr_pool(pool, 0); + } + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + spin_lock_init(&ibmr->mapping.m_lock); + INIT_LIST_HEAD(&ibmr->mapping.m_list); + ibmr->mapping.m_mr = ibmr; + + err = rds_iw_init_fastreg(pool, ibmr); + if (err) + goto out_no_cigar; + + rds_iw_stats_inc(s_iw_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +void rds_iw_sync_mr(void *trans_private, int direction) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_device *rds_iwdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) +{ + struct rds_iw_mr *ibmr, *next; + LIST_HEAD(unmap_list); + LIST_HEAD(kill_list); + unsigned long flags; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); + + mutex_lock(&pool->flush_lock); + + spin_lock_irqsave(&pool->list_lock, flags); + /* Get the list of all mappings to be destroyed */ + list_splice_init(&pool->dirty_list, &unmap_list); + if (free_all) + list_splice_init(&pool->clean_list, &kill_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + free_goal = rds_iw_flush_goal(pool, free_all); + + /* Batched invalidate of dirty MRs. + * For FMR based MRs, the mappings on the unmap list are + * actually members of an ibmr (ibmr->mapping). They either + * migrate to the kill_list, or have been cleaned and should be + * moved to the clean_list. + * For fastregs, they will be dynamically allocated, and + * will be destroyed by the unmap function. + */ + if (!list_empty(&unmap_list)) { + ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list); + /* If we've been asked to destroy all MRs, move those + * that were simply cleaned to the kill list */ + if (free_all) + list_splice_init(&unmap_list, &kill_list); + } + + /* Destroy any MRs that are past their best before date */ + list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { + rds_iw_stats_inc(s_iw_rdma_mr_free); + list_del(&ibmr->mapping.m_list); + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + nfreed++; + } + + /* Anything that remains are laundered ibmrs, which we can add + * back to the clean list. */ + if (!list_empty(&unmap_list)) { + spin_lock_irqsave(&pool->list_lock, flags); + list_splice(&unmap_list, &pool->clean_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + + mutex_unlock(&pool->flush_lock); + return ret; +} + +static void rds_iw_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); + + rds_iw_flush_mr_pool(pool, 0); +} + +void rds_iw_free_mr(void *trans_private, int invalidate) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; + + rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); + if (!pool) + return; + + /* Return it to the pool's free list */ + rds_iw_free_fastreg(pool, ibmr); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned + || atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_work(rds_wq, &pool->flush_worker); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_iw_flush_mr_pool(pool, 0); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. */ + queue_work(rds_wq, &pool->flush_worker); + } + } +} + +void rds_iw_flush_mrs(void) +{ + struct rds_iw_device *rds_iwdev; + + list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + if (pool) + rds_iw_flush_mr_pool(pool, 0); + } +} + +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_mr *ibmr = NULL; + struct rdma_cm_id *cm_id; + int ret; + + ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + if (ret || !cm_id) { + ret = -ENODEV; + goto out; + } + + if (!rds_iwdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_iw_alloc_mr(rds_iwdev); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->cm_id = cm_id; + ibmr->device = rds_iwdev; + + ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->mr->rkey; + else + printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); + +out: + if (ret) { + if (ibmr) + rds_iw_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + return ibmr; +} + +/* + * iWARP fastreg handling + * + * The life cycle of a fastreg registration is a bit different from + * FMRs. + * The idea behind fastreg is to have one MR, to which we bind different + * mappings over time. To avoid stalling on the expensive map and invalidate + * operations, these operations are pipelined on the same send queue on + * which we want to send the message containing the r_key. + * + * This creates a bit of a problem for us, as we do not have the destination + * IP in GET_MR, so the connection must be setup prior to the GET_MR call for + * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit + * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request + * before queuing the SEND. When completions for these arrive, they are + * dispatched to the MR has a bit set showing that RDMa can be performed. + * + * There is another interesting aspect that's related to invalidation. + * The application can request that a mapping is invalidated in FREE_MR. + * The expectation there is that this invalidation step includes ALL + * PREVIOUSLY FREED MRs. + */ +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct ib_fast_reg_page_list *page_list = NULL; + struct ib_mr *mr; + int err; + + mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); + return err; + } + + /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages + * is not filled in. + */ + page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size); + if (IS_ERR(page_list)) { + err = PTR_ERR(page_list); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err); + ib_dereg_mr(mr); + return err; + } + + ibmr->page_list = page_list; + ibmr->mr = mr; + return 0; +} + +static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) +{ + struct rds_iw_mr *ibmr = mapping->m_mr; + struct ib_send_wr f_wr, *failed_wr; + int ret; + + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. + */ + ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); + mapping->m_rkey = ibmr->mr->rkey; + + memset(&f_wr, 0, sizeof(f_wr)); + f_wr.wr_id = RDS_IW_FAST_REG_WR_ID; + f_wr.opcode = IB_WR_FAST_REG_MR; + f_wr.wr.fast_reg.length = mapping->m_sg.bytes; + f_wr.wr.fast_reg.rkey = mapping->m_rkey; + f_wr.wr.fast_reg.page_list = ibmr->page_list; + f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; + f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift; + f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + f_wr.wr.fast_reg.iova_start = 0; + f_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &f_wr; + ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); + BUG_ON(failed_wr != &f_wr); + if (ret && printk_ratelimit()) + printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + return ret; +} + +static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) +{ + struct ib_send_wr s_wr, *failed_wr; + int ret = 0; + + if (!ibmr->cm_id->qp || !ibmr->mr) + goto out; + + memset(&s_wr, 0, sizeof(s_wr)); + s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; + s_wr.opcode = IB_WR_LOCAL_INV; + s_wr.ex.invalidate_rkey = ibmr->mr->rkey; + s_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &s_wr; + ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); + if (ret && printk_ratelimit()) { + printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + goto out; + } +out: + return ret; +} + +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, + unsigned int sg_len) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct rds_iw_mapping *mapping = &ibmr->mapping; + u64 *dma_pages; + int i, ret = 0; + + rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); + + dma_pages = rds_iw_map_scatterlist(rds_iwdev, + &mapping->m_sg, + rds_iwdev->page_shift); + if (IS_ERR(dma_pages)) { + ret = PTR_ERR(dma_pages); + dma_pages = NULL; + goto out; + } + + if (mapping->m_sg.dma_len > pool->max_message_size) { + ret = -EMSGSIZE; + goto out; + } + + for (i = 0; i < mapping->m_sg.dma_npages; ++i) + ibmr->page_list->page_list[i] = dma_pages[i]; + + ret = rds_iw_rdma_build_fastreg(mapping); + if (ret) + goto out; + + rds_iw_stats_inc(s_iw_rdma_mr_used); + +out: + kfree(dma_pages); + + return ret; +} + +/* + * "Free" a fastreg MR. + */ +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + unsigned long flags; + int ret; + + if (!ibmr->mapping.m_sg.dma_len) + return; + + ret = rds_iw_rdma_fastreg_inv(ibmr); + if (ret) + return; + + /* Try to post the LOCAL_INV WR to the queue. */ + spin_lock_irqsave(&pool->list_lock, flags); + + list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); + atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + spin_unlock_irqrestore(&pool->list_lock, flags); +} + +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list) +{ + struct rds_iw_mapping *mapping, *next; + unsigned int ncleaned = 0; + LIST_HEAD(laundered); + + /* Batched invalidation of fastreg MRs. + * Why do we do it this way, even though we could pipeline unmap + * and remap? The reason is the application semantics - when the + * application requests an invalidation of MRs, it expects all + * previously released R_Keys to become invalid. + * + * If we implement MR reuse naively, we risk memory corruption + * (this has actually been observed). So the default behavior + * requires that a MR goes through an explicit unmap operation before + * we can reuse it again. + * + * We could probably improve on this a little, by allowing immediate + * reuse of a MR on the same socket (eg you could add small + * cache of unused MRs to strct rds_socket - GET_MR could grab one + * of these without requiring an explicit invalidate). + */ + while (!list_empty(unmap_list)) { + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + list_for_each_entry_safe(mapping, next, unmap_list, m_list) { + list_move(&mapping->m_list, &laundered); + ncleaned++; + } + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + /* Move all laundered mappings back to the unmap list. + * We do not kill any WRs right now - it doesn't seem the + * fastreg API has a max_remap limit. */ + list_splice_init(&laundered, unmap_list); + + return ncleaned; +} + +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + if (ibmr->page_list) + ib_free_fast_reg_page_list(ibmr->page_list); + if (ibmr->mr) + ib_dereg_mr(ibmr->mr); +} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c new file mode 100644 index 000000000000..a1931f0027a2 --- /dev/null +++ b/net/rds/iw_recv.c @@ -0,0 +1,869 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <rdma/rdma_cm.h> + +#include "rds.h" +#include "iw.h" + +static struct kmem_cache *rds_iw_incoming_slab; +static struct kmem_cache *rds_iw_frag_slab; +static atomic_t rds_iw_allocation = ATOMIC_INIT(0); + +static void rds_iw_frag_drop_page(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + __free_page(frag->f_page); + frag->f_page = NULL; +} + +static void rds_iw_frag_free(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + BUG_ON(frag->f_page != NULL); + kmem_cache_free(rds_iw_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. + * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. + */ +static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + struct rds_page_frag *frag = recv->r_frag; + + rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); + if (frag->f_mapped) + ib_dma_unmap_page(ic->i_cm_id->device, + frag->f_mapped, + RDS_FRAG_SIZE, DMA_FROM_DEVICE); + frag->f_mapped = 0; +} + +void rds_iw_recv_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_iwinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IW_RECV_SGE; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + } +} + +static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + if (recv->r_iwinc) { + rds_inc_put(&recv->r_iwinc->ii_inc); + recv->r_iwinc = NULL; + } + if (recv->r_frag) { + rds_iw_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rds_iw_frag_drop_page(recv->r_frag); + rds_iw_frag_free(recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rds_iw_frag_drop_page(&ic->i_frag); +} + +static int rds_iw_recv_refill_one(struct rds_connection *conn, + struct rds_iw_recv_work *recv, + gfp_t kptr_gfp, gfp_t page_gfp) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + dma_addr_t dma_addr; + struct ib_sge *sge; + int ret = -ENOMEM; + + if (recv->r_iwinc == NULL) { + if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) { + rds_iw_stats_inc(s_iw_rx_alloc_limit); + goto out; + } + recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, + kptr_gfp); + if (recv->r_iwinc == NULL) + goto out; + atomic_inc(&rds_iw_allocation); + INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); + rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); + } + + if (recv->r_frag == NULL) { + recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); + if (recv->r_frag == NULL) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (ic->i_frag.f_page == NULL) { + ic->i_frag.f_page = alloc_page(page_gfp); + if (ic->i_frag.f_page == NULL) + goto out; + ic->i_frag.f_offset = 0; + } + + dma_addr = ib_dma_map_page(ic->i_cm_id->device, + ic->i_frag.f_page, + ic->i_frag.f_offset, + RDS_FRAG_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + goto out; + + /* + * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. + */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = dma_addr; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + get_page(recv->r_frag->f_page); + + if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDS_FRAG_SIZE; + } else { + put_page(ic->i_frag.f_page); + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) + && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, + recv->r_iwinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_iw_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_iw_advertise_credits(conn, posted); + + if (ret) + rds_iw_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +void rds_iw_inc_purge(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); + + list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_iw_frag_drop_page(frag); + rds_iw_frag_free(frag); + } +} + +void rds_iw_inc_free(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + + rds_iw_inc_purge(inc); + rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); + BUG_ON(!list_empty(&iwinc->ii_frags)); + kmem_cache_free(rds_iw_incoming_slab, iwinc); + atomic_dec(&rds_iw_allocation); + BUG_ON(atomic_read(&rds_iw_allocation) < 0); +} + +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_iw_recv_init_ack(struct rds_iw_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IW_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + rds_iw_set_64bit(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return ic->i_ack_next; +} + +static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_iw_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_iw_stats_inc(s_iw_ack_send_failure); + /* Need to finesse this later. */ + BUG(); + } else + rds_iw_stats_inc(s_iw_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_iw_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_iw_attempt_ack(struct rds_iw_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_iw_stats_inc(s_iw_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) { + rds_iw_stats_inc(s_iw_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_iw_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_iw_ack_send_complete(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_iw_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_iw_stats_inc(s_iw_ack_send_piggybacked); + return rds_iw_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_iw_cong_recv(struct rds_connection *conn, + struct rds_iw_incoming *iwinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_iw_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_iw_process_recv(struct rds_connection *conn, + struct rds_iw_recv_work *recv, u32 byte_len, + struct rds_iw_ack_state *state) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_incoming *iwinc = ic->i_iwinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_iw_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_iw_stats_inc(s_iw_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_iw_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. + */ + if (iwinc == NULL) { + iwinc = recv->r_iwinc; + recv->r_iwinc = NULL; + ic->i_iwinc = iwinc; + + hdr = &iwinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &iwinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { + rds_iw_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_iwinc = NULL; + + if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_iw_cong_recv(conn, iwinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &iwinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&iwinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_ack_state state = { 0, }; + struct rds_iw_recv_work *recv; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_iw_stats_inc(s_iw_rx_cq_call); + + ib_req_notify_cq(cq, IB_CQ_SOLICITED); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_rx_cq_event); + + recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; + + rds_iw_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_iw_process_recv(conn, recv, wc.byte_len, &state); + } else { + rds_iw_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_iw_ring_free(&ic->i_recv_ring, 1); + } + + if (state.ack_next_valid) + rds_iw_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_iw_ring_empty(&ic->i_recv_ring)) + rds_iw_stats_inc(s_iw_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. + */ + if (rds_iw_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_iw_recv(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_iw_stats_inc(s_iw_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + return ret; +} + +int __init rds_iw_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", + sizeof(struct rds_iw_incoming), + 0, 0, NULL); + if (rds_iw_incoming_slab == NULL) + goto out; + + rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (rds_iw_frag_slab == NULL) + kmem_cache_destroy(rds_iw_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_iw_recv_exit(void) +{ + kmem_cache_destroy(rds_iw_incoming_slab); + kmem_cache_destroy(rds_iw_frag_slab); +} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c new file mode 100644 index 000000000000..d422d4b5deef --- /dev/null +++ b/net/rds/iw_ring.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "iw.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); + +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_iw_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) == 0; +} + +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_iw_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_iw_ring_empty(ring) && + waitqueue_active(&rds_iw_ring_empty_wait)) + wake_up(&rds_iw_ring_empty_wait); +} + +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_empty(ring); +} + +int rds_iw_ring_low(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2); +} + + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c new file mode 100644 index 000000000000..22dd38ffd608 --- /dev/null +++ b/net/rds/iw_send.c @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/device.h> +#include <linux/dmapool.h> + +#include "rds.h" +#include "rdma.h" +#include "iw.h" + +static void rds_iw_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, + struct rds_rdma_op *op) +{ + if (op->r_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, + op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->r_mapped = 0; + } +} + +static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_nents, + DMA_TO_DEVICE); + + if (rm->m_rdma_op != NULL) { + rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_iw_send_rdma_complete(rm, wc_status); + + if (rm->m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_iw_send_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + send->s_mapping = NULL; + + send->s_wr.next = NULL; + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_iw_data_sge(ic, send->s_sge); + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + + send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + if (IS_ERR(send->s_mr)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); + break; + } + + send->s_page_list = ib_alloc_fast_reg_page_list( + ic->i_cm_id->device, fastreg_message_size); + if (IS_ERR(send->s_page_list)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); + break; + } + } +} + +void rds_iw_send_clear_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + BUG_ON(!send->s_mr); + ib_dereg_mr(send->s_mr); + BUG_ON(!send->s_page_list); + ib_free_fast_reg_page_list(send->s_page_list); + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_iw_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_send_work *send; + u32 completed; + u32 oldest; + u32 i; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_iw_stats_inc(s_iw_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_tx_cq_event); + + if (wc.status != IB_WC_SUCCESS) { + printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); + break; + } + + if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { + ic->i_fastreg_posted = 0; + continue; + } + + if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { + ic->i_fastreg_posted = 1; + continue; + } + + if (wc.wr_id == RDS_IW_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + rds_iw_ack_send_complete(ic); + continue; + } + + oldest = rds_iw_ring_oldest(&ic->i_send_ring); + + completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_FAST_REG_MR: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. */ + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (send->s_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_iw_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_iw_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) + || test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_iw_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the reciever's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_iw_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_iw_stats_inc(s_iw_rx_credit_updates); +} + +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_iw_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = rds_iw_local_dma_lkey(ic); + + sge = rds_iw_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Fastreg support */ + if (rds_rdma_cookie_key(rm->m_rdma_cookie) + && !ic->i_fastreg_posted) { + ret = -EAGAIN; + goto out; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (ic->i_rm == NULL) { + /* + printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->m_nents) { + rm->m_count = ib_dma_map_sg(dev, + rm->m_sg, rm->m_nents, DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); + if (rm->m_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->m_count = 0; + } + + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->m_rdma_op) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_iw_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_iw_send_grab_credits(ic, 0, &posted, 1); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } else if (ic->i_rm != rm) + BUG(); + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->m_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. + */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_iw_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. */ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_iw_stats_inc(s_iw_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->m_sg[rm->m_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_iw_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) +{ + BUG_ON(nent > send->s_page_list->max_page_list_len); + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. + */ + send->s_wr.opcode = IB_WR_FAST_REG_MR; + send->s_wr.wr.fast_reg.length = len; + send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; + send->s_wr.wr.fast_reg.page_list = send->s_page_list; + send->s_wr.wr.fast_reg.page_list_len = nent; + send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift; + send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; + send->s_wr.wr.fast_reg.iova_start = sg_addr; + + ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); +} + +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_iw_device *rds_iwdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->r_remote_addr; + u32 pos, fr_pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + + /* map the message the first time we see it */ + if (!op->r_mapped) { + op->r_count = ib_dma_map_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, (op->r_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); + if (op->r_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->r_mapped = 1; + } + + if (!op->r_write) { + /* Alloc space on the send queue for the fastreg */ + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); + if (work_alloc != 1) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->r_count, rds_iwdev->max_sge); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + if (!op->r_write) { + first = prev = &ic->i_sends[fr_pos]; + } else { + first = send; + prev = NULL; + } + scat = &op->r_sg[0]; + sent = 0; + num_sge = op->r_count; + + for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + /* To avoid the need to have the plumbing to invalidate the fastreg_mr used + * for local access after RDS is finished with it, using + * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. + */ + if (op->r_write) + send->s_wr.opcode = IB_WR_RDMA_WRITE; + else + send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->r_key; + send->s_op = op; + + if (num_sge > rds_iwdev->max_sge) { + send->s_wr.num_sge = rds_iwdev->max_sge; + num_sge -= rds_iwdev->max_sge; + } else + send->s_wr.num_sge = num_sge; + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) + send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); + else { + send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); + } + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + remote_addr += len; + + scat++; + } + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { + send->s_wr.num_sge = 1; + send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; + send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; + send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->r_sg[op->r_count]) + first->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not + * recommended. Putting the lkey on the wire is a security hole, as it can + * allow for memory access to all of memory on the remote system. Some + * adapters do not allow using the lkey for this at all. To bypass this use a + * fastreg_mr (or possibly a dma_mr) + */ + if (!op->r_write) { + rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], + op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + work_alloc++; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + +out: + return ret; +} + +void rds_iw_xmit_complete(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_iw_attempt_ack(ic); +} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c new file mode 100644 index 000000000000..ccc7e8f0bf0e --- /dev/null +++ b/net/rds/iw_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" +#include "iw.h" + +DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; + +static char *rds_iw_stat_names[] = { + "iw_connect_raced", + "iw_listen_closed_stale", + "iw_tx_cq_call", + "iw_tx_cq_event", + "iw_tx_ring_full", + "iw_tx_throttle", + "iw_tx_sg_mapping_failure", + "iw_tx_stalled", + "iw_tx_credit_updates", + "iw_rx_cq_call", + "iw_rx_cq_event", + "iw_rx_ring_empty", + "iw_rx_refill_from_cq", + "iw_rx_refill_from_thread", + "iw_rx_alloc_limit", + "iw_rx_credit_updates", + "iw_ack_sent", + "iw_ack_send_failure", + "iw_ack_send_delayed", + "iw_ack_send_piggybacked", + "iw_ack_received", + "iw_rdma_mr_alloc", + "iw_rdma_mr_free", + "iw_rdma_mr_used", + "iw_rdma_mr_pool_flush", + "iw_rdma_mr_pool_wait", + "iw_rdma_mr_pool_depleted", +}; + +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_iw_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_iw_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, + ARRAY_SIZE(rds_iw_stat_names)); +out: + return ARRAY_SIZE(rds_iw_stat_names); +} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c new file mode 100644 index 000000000000..9590678cd616 --- /dev/null +++ b/net/rds/iw_sysctl.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + +#include "iw.h" + +static struct ctl_table_header *rds_iw_sysctl_hdr; + +unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; +unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; +unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_iw_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_iw_sysctl_max_unsig_wrs = 16; +static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_iw_sysctl_flow_control = 1; + +ctl_table rds_iw_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_send_wr", + .data = &rds_iw_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_wr", + .data = &rds_iw_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_wr", + .data = &rds_iw_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_wr_min, + .extra2 = &rds_iw_sysctl_max_unsig_wr_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unsignaled_bytes", + .data = &rds_iw_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, + .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_recv_allocation", + .data = &rds_iw_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "flow_control", + .data = &rds_iw_sysctl_flow_control, + .maxlen = sizeof(rds_iw_sysctl_flow_control), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0} +}; + +static struct ctl_path rds_iw_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, + { .procname = "iw", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + +void rds_iw_sysctl_exit(void) +{ + if (rds_iw_sysctl_hdr) + unregister_sysctl_table(rds_iw_sysctl_hdr); +} + +int __init rds_iw_sysctl_init(void) +{ + rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); + if (rds_iw_sysctl_hdr == NULL) + return -ENOMEM; + return 0; +} diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 000000000000..4a61997f554d --- /dev/null +++ b/net/rds/loop.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> + +#include "rds.h" +#include "loop.h" + +static DEFINE_SPINLOCK(loop_conns_lock); +static LIST_HEAD(loop_conns); + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. + * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. + */ +static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + BUG_ON(hdr_off || sg || off); + + rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + rds_message_addref(rm); /* for the inc */ + + rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + GFP_KERNEL, KM_USER0); + + rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), + NULL); + + rds_inc_put(&rm->m_inc); + + return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +} + +static int rds_loop_xmit_cong_map(struct rds_connection *conn, + struct rds_cong_map *map, + unsigned long offset) +{ + unsigned long i; + + BUG_ON(offset); + BUG_ON(map != conn->c_lcong); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + memcpy((void *)conn->c_fcong->m_page_addrs[i], + (void *)map->m_page_addrs[i], PAGE_SIZE); + } + + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; +} + +/* we need to at least give the thread something to succeed */ +static int rds_loop_recv(struct rds_connection *conn) +{ + return 0; +} + +struct rds_loop_connection { + struct list_head loop_node; + struct rds_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rds_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. + */ +static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_loop_connection *lc; + unsigned long flags; + + lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); + if (lc == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + spin_lock_irqsave(&loop_conns_lock, flags); + list_add_tail(&lc->loop_node, &loop_conns); + spin_unlock_irqrestore(&loop_conns_lock, flags); + + return 0; +} + +static void rds_loop_conn_free(void *arg) +{ + struct rds_loop_connection *lc = arg; + rdsdebug("lc %p\n", lc); + list_del(&lc->loop_node); + kfree(lc); +} + +static int rds_loop_conn_connect(struct rds_connection *conn) +{ + rds_connect_complete(conn); + return 0; +} + +static void rds_loop_conn_shutdown(struct rds_connection *conn) +{ +} + +void rds_loop_exit(void) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + INIT_LIST_HEAD(&loop_conns); + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rds_loop_transport. + */ +struct rds_transport rds_loop_transport = { + .xmit = rds_loop_xmit, + .xmit_cong_map = rds_loop_xmit_cong_map, + .recv = rds_loop_recv, + .conn_alloc = rds_loop_conn_alloc, + .conn_free = rds_loop_conn_free, + .conn_connect = rds_loop_conn_connect, + .conn_shutdown = rds_loop_conn_shutdown, + .inc_copy_to_user = rds_message_inc_copy_to_user, + .inc_purge = rds_message_inc_purge, + .inc_free = rds_message_inc_free, + .t_name = "loopback", +}; diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 000000000000..f32b0939a04d --- /dev/null +++ b/net/rds/loop.h @@ -0,0 +1,9 @@ +#ifndef _RDS_LOOP_H +#define _RDS_LOOP_H + +/* loop.c */ +extern struct rds_transport rds_loop_transport; + +void rds_loop_exit(void); + +#endif diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 000000000000..5a15dc8d0cd7 --- /dev/null +++ b/net/rds/message.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> + +#include "rds.h" +#include "rdma.h" + +static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); + +static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { +[RDS_EXTHDR_NONE] = 0, +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), +[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), +[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +}; + + +void rds_message_addref(struct rds_message *rm) +{ + rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + atomic_inc(&rm->m_refcount); +} + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. + */ +static void rds_message_purge(struct rds_message *rm) +{ + unsigned long i; + + if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) + return; + + for (i = 0; i < rm->m_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + /* XXX will have to put_page for page refs */ + __free_page(sg_page(&rm->m_sg[i])); + } + rm->m_nents = 0; + + if (rm->m_rdma_op) + rds_rdma_free_op(rm->m_rdma_op); + if (rm->m_rdma_mr) + rds_mr_put(rm->m_rdma_mr); +} + +void rds_message_inc_purge(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_purge(rm); +} + +void rds_message_put(struct rds_message *rm) +{ + rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + + if (atomic_dec_and_test(&rm->m_refcount)) { + BUG_ON(!list_empty(&rm->m_sock_item)); + BUG_ON(!list_empty(&rm->m_conn_item)); + rds_message_purge(rm); + + kfree(rm); + } +} + +void rds_message_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); +} + +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = cpu_to_be64(seq); + hdr->h_exthdr[0] = RDS_EXTHDR_NONE; +} + +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof(u8) + len; + unsigned char *dst; + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) + return 0; + + if (type >= __RDS_EXTHDR_MAX + || len != rds_exthdr_size[type]) + return 0; + + if (ext_len >= RDS_HEADER_EXT_SPACE) + return 0; + dst = hdr->h_exthdr; + + *dst++ = type; + memcpy(dst, data, len); + + dst[len] = RDS_EXTHDR_NONE; + return 1; +} + +/* + * If a message has extension headers, retrieve them here. + * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDS_EXTHDR_NONE) + * break; + * ... + * } + */ +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + u8 *src = hdr->h_exthdr; + + offset = *pos; + if (offset >= RDS_HEADER_EXT_SPACE) + goto none; + + /* Get the extension type and length. For now, the + * length is implied by the extension type. */ + ext_type = src[offset++]; + + if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) + goto none; + ext_len = rds_exthdr_size[ext_type]; + if (offset + ext_len > RDS_HEADER_EXT_SPACE) + goto none; + + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + memcpy(buf, src + offset, *buflen); + return ext_type; + +none: + *pos = RDS_HEADER_EXT_SPACE; + *buflen = 0; + return RDS_EXTHDR_NONE; +} + +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) +{ + struct rds_ext_header_version ext_hdr; + + ext_hdr.h_version = cpu_to_be32(version); + return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); +} + +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) +{ + struct rds_ext_header_version ext_hdr; + unsigned int pos = 0, len = sizeof(ext_hdr); + + /* We assume the version extension is the only one present */ + if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) + return 0; + *version = be32_to_cpu(ext_hdr.h_version); + return 1; +} + +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) +{ + struct rds_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); + ext_hdr.h_rdma_offset = cpu_to_be32(offset); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); +} + +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +{ + struct rds_message *rm; + + rm = kzalloc(sizeof(struct rds_message) + + (nents * sizeof(struct scatterlist)), gfp); + if (!rm) + goto out; + + if (nents) + sg_init_table(rm->m_sg, nents); + atomic_set(&rm->m_refcount, 1); + INIT_LIST_HEAD(&rm->m_sock_item); + INIT_LIST_HEAD(&rm->m_conn_item); + spin_lock_init(&rm->m_rs_lock); + +out: + return rm; +} + +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rds_message *rm; + unsigned int i; + + rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + if (rm == NULL) + return ERR_PTR(-ENOMEM); + + set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->m_nents = ceil(total_len, PAGE_SIZE); + + for (i = 0; i < rm->m_nents; ++i) { + sg_set_page(&rm->m_sg[i], + virt_to_page(page_addrs[i]), + PAGE_SIZE, 0); + } + + return rm; +} + +struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, + size_t total_len) +{ + unsigned long to_copy; + unsigned long iov_off; + unsigned long sg_off; + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + int ret; + + rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + if (rm == NULL) { + ret = -ENOMEM; + goto out; + } + + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + + /* + * now allocate and copy in the data payload. + */ + sg = rm->m_sg; + iov = first_iov; + iov_off = 0; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + + while (total_len) { + if (sg_page(sg) == NULL) { + ret = rds_page_remainder_alloc(sg, total_len, + GFP_HIGHUSER); + if (ret) + goto out; + rm->m_nents++; + sg_off = 0; + } + + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); + to_copy = min_t(size_t, to_copy, total_len); + + rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + (void *)sg_page(sg), sg->offset, sg->length, sg_off); + + ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, + iov->iov_base + iov_off, + to_copy); + if (ret) + goto out; + + iov_off += to_copy; + total_len -= to_copy; + sg_off += to_copy; + + if (sg_off == sg->length) + sg++; + } + + ret = 0; +out: + if (ret) { + if (rm) + rds_message_put(rm); + rm = ERR_PTR(ret); + } + return rm; +} + +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size) +{ + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + unsigned long to_copy; + unsigned long iov_off; + unsigned long vec_off; + int copied; + int ret; + u32 len; + + rm = container_of(inc, struct rds_message, m_inc); + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + iov = first_iov; + iov_off = 0; + sg = rm->m_sg; + vec_off = 0; + copied = 0; + + while (copied < size && copied < len) { + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(sg), sg->offset, sg->length, vec_off); + + ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return copied; +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void rds_message_wait(struct rds_message *rm) +{ + wait_event(rds_message_flush_waitq, + !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); +} + +void rds_message_unmapped(struct rds_message *rm) +{ + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + if (waitqueue_active(&rds_message_flush_waitq)) + wake_up(&rds_message_flush_waitq); +} + diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 000000000000..c460743a89ad --- /dev/null +++ b/net/rds/page.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/highmem.h> + +#include "rds.h" + +struct rds_page_remainder { + struct page *r_page; + unsigned long r_offset; +}; + +DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; + +/* + * returns 0 on success or -errno on failure. + * + * We don't have to worry about flush_dcache_page() as this only works + * with private pages. If, say, we were to do directed receive to pinned + * user pages we'd have to worry more about cache coherence. (Though + * the flush_dcache_page() in get_user_pages() would probably be enough). + */ +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user) +{ + unsigned long ret; + void *addr; + + if (to_user) + rds_stats_add(s_copy_to_user, bytes); + else + rds_stats_add(s_copy_from_user, bytes); + + addr = kmap_atomic(page, KM_USER0); + if (to_user) + ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); + else + ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); + kunmap_atomic(addr, KM_USER0); + + if (ret) { + addr = kmap(page); + if (to_user) + ret = copy_to_user(ptr, addr + offset, bytes); + else + ret = copy_from_user(addr + offset, ptr, bytes); + kunmap(page); + if (ret) + return -EFAULT; + } + + return 0; +} + +/* + * Message allocation uses this to build up regions of a message. + * + * @bytes - the number of bytes needed. + * @gfp - the waiting behaviour of the allocation + * + * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to + * kmap the pages, etc. + * + * If @bytes is at least a full page then this just returns a page from + * alloc_page(). + * + * If @bytes is a partial page then this stores the unused region of the + * page in a per-cpu structure. Future partial-page allocations may be + * satisfied from that cached region. This lets us waste less memory on + * small allocations with minimal complexity. It works because the transmit + * path passes read-only page regions down to devices. They hold a page + * reference until they are done with the region. + */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp) +{ + struct rds_page_remainder *rem; + unsigned long flags; + struct page *page; + int ret; + + gfp |= __GFP_HIGHMEM; + + /* jump straight to allocation if we're trying for a huge page */ + if (bytes >= PAGE_SIZE) { + page = alloc_page(gfp); + if (page == NULL) { + ret = -ENOMEM; + } else { + sg_set_page(scat, page, PAGE_SIZE, 0); + ret = 0; + } + goto out; + } + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + while (1) { + /* avoid a tiny region getting stuck by tossing it */ + if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { + rds_stats_inc(s_page_remainder_miss); + __free_page(rem->r_page); + rem->r_page = NULL; + } + + /* hand out a fragment from the cached page */ + if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { + sg_set_page(scat, rem->r_page, bytes, rem->r_offset); + get_page(sg_page(scat)); + + if (rem->r_offset != 0) + rds_stats_inc(s_page_remainder_hit); + + rem->r_offset += bytes; + if (rem->r_offset == PAGE_SIZE) { + __free_page(rem->r_page); + rem->r_page = NULL; + } + ret = 0; + break; + } + + /* alloc if there is nothing for us to use */ + local_irq_restore(flags); + put_cpu(); + + page = alloc_page(gfp); + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + if (page == NULL) { + ret = -ENOMEM; + break; + } + + /* did someone race to fill the remainder before us? */ + if (rem->r_page) { + __free_page(page); + continue; + } + + /* otherwise install our page and loop around to alloc */ + rem->r_page = page; + rem->r_offset = 0; + } + + local_irq_restore(flags); + put_cpu(); +out: + rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, + ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, + ret ? 0 : scat->length); + return ret; +} + +static int rds_page_remainder_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct rds_page_remainder *rem; + long cpu = (long)hcpu; + + rem = &per_cpu(rds_page_remainders, cpu); + + rdsdebug("cpu %ld action 0x%lx\n", cpu, action); + + switch (action) { + case CPU_DEAD: + if (rem->r_page) + __free_page(rem->r_page); + rem->r_page = NULL; + break; + } + + return 0; +} + +static struct notifier_block rds_page_remainder_nb = { + .notifier_call = rds_page_remainder_cpu_notify, +}; + +void rds_page_exit(void) +{ + int i; + + for_each_possible_cpu(i) + rds_page_remainder_cpu_notify(&rds_page_remainder_nb, + (unsigned long)CPU_DEAD, + (void *)(long)i); +} diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 000000000000..eaeeb91e1119 --- /dev/null +++ b/net/rds/rdma.c @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ + +#include "rdma.h" + +/* + * XXX + * - build with sparse + * - should we limit the size of a mr region? let transport return failure? + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. + */ +static unsigned int rds_pages_in_vec(struct rds_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (u64)UINT_MAX)) + return 0; + + return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (vec->addr >> PAGE_SHIFT); +} + +static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, + struct rds_mr *insert) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rds_mr *mr; + + while (*p) { + parent = *p; + mr = rb_entry(parent, struct rds_mr, r_rb_node); + + if (key < mr->r_key) + p = &(*p)->rb_left; + else if (key > mr->r_key) + p = &(*p)->rb_right; + else + return mr; + } + + if (insert) { + rb_link_node(&insert->r_rb_node, parent, p); + rb_insert_color(&insert->r_rb_node, root); + atomic_inc(&insert->r_refcount); + } + return NULL; +} + +/* + * Destroy the transport-specific part of a MR. + */ +static void rds_destroy_mr(struct rds_mr *mr) +{ + struct rds_sock *rs = mr->r_sock; + void *trans_private = NULL; + unsigned long flags; + + rdsdebug("RDS: destroy mr key is %x refcnt %u\n", + mr->r_key, atomic_read(&mr->r_refcount)); + + if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) + return; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + if (!RB_EMPTY_NODE(&mr->r_rb_node)) + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} + +void __rds_put_mr_final(struct rds_mr *mr) +{ + rds_destroy_mr(mr); + kfree(mr); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. + */ +void rds_rdma_drop_keys(struct rds_sock *rs) +{ + struct rds_mr *mr; + struct rb_node *node; + + /* Release any MRs associated with this socket */ + while ((node = rb_first(&rs->rs_rdma_keys))) { + mr = container_of(node, struct rds_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + rds_mr_put(mr); + } + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. + */ +static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, user_addr, + nr_pages, write, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + + if (0 <= ret && (unsigned) ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, + u64 *cookie_ret, struct rds_mr **mr_ret) +{ + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; + struct page **pages = NULL; + struct scatterlist *sg; + void *trans_private; + unsigned long flags; + rds_rdma_cookie_t cookie; + unsigned int nents; + long i; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (rs->rs_transport->get_mr == NULL) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_pages = rds_pages_in_vec(&args->vec); + if (nr_pages == 0) { + ret = -EINVAL; + goto out; + } + + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", + args->vec.addr, args->vec.bytes, nr_pages); + + /* XXX clamp nr_pages to limit the size of this alloc? */ + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + + mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + if (mr == NULL) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&mr->r_refcount, 1); + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDS_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDS_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Pin the pages that make up the user buffer and transfer the page + * pointers to the mr's sg array. We check to see if we've mapped + * the whole region after transferring the partial page references + * to the sg array so that we can have one page ref cleanup path. + * + * For now we have no flag that tells us whether the mapping is + * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to + * the zero page. + */ + ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + if (ret < 0) + goto out; + + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (sg == NULL) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); + + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + rdsdebug("RDS: trans_private nents is %u\n", nents); + + /* Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. */ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", + mr->r_key, (void *)(unsigned long) args->cookie_addr); + + /* The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing <R_Key, offset> and pass that + * around. */ + cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (cookie_ret) + *cookie_ret = cookie; + + if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + ret = -EFAULT; + goto out; + } + + /* Inserting the new MR into the rbtree bumps its + * reference count. */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + BUG_ON(found && found != mr); + + rdsdebug("RDS: get_mr key is %x\n", mr->r_key); + if (mr_ret) { + atomic_inc(&mr->r_refcount); + *mr_ret = mr; + } + + ret = 0; +out: + kfree(pages); + if (mr) + rds_mr_put(mr); + return ret; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_args args; + + if (optlen != sizeof(struct rds_get_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, + sizeof(struct rds_get_mr_args))) + return -EFAULT; + + return __rds_rdma_map(rs, &args, NULL, NULL); +} + +/* + * Free the MR indicated by the given R_Key + */ +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_free_mr_args args; + struct rds_mr *mr; + unsigned long flags; + + if (optlen != sizeof(struct rds_free_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, + sizeof(struct rds_free_mr_args))) + return -EFAULT; + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return -EINVAL; + rs->rs_transport->flush_mrs(); + return 0; + } + + /* Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rds_rdma_unuse. + */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); + if (mr) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (!mr) + return -EINVAL; + + /* + * call rds_destroy_mr() ourselves so that we're sure it's done by the time + * we return. If we let rds_mr_put() do it it might not happen until + * someone else drops their ref. + */ + rds_destroy_mr(mr); + rds_mr_put(mr); + return 0; +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. It allows us to implement + * use_once semantics + */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) +{ + struct rds_mr *mr; + unsigned long flags; + int zot_me = 0; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr && (mr->r_use_once || force)) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } else if (mr) + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + /* May have to issue a dma_sync on this memory region. + * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. */ + if (mr != NULL) { + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); + } +} + +void rds_rdma_free_op(struct rds_rdma_op *ro) +{ + unsigned int i; + + for (i = 0; i < ro->r_nents; i++) { + struct page *page = sg_page(&ro->r_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + if (!ro->r_write) + set_page_dirty(page); + put_page(page); + } + + kfree(ro->r_notifier); + kfree(ro); +} + +/* + * args is a pointer to an in-kernel copy in the sendmsg cmsg. + */ +static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, + struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_rdma_op *op = NULL; + unsigned int nr_pages; + unsigned int max_pages; + unsigned int nr_bytes; + struct page **pages = NULL; + struct rds_iovec __user *local_vec; + struct scatterlist *sg; + unsigned int nr; + unsigned int i, j; + int ret; + + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (args->nr_local > (u64)UINT_MAX) { + ret = -EMSGSIZE; + goto out; + } + + nr_pages = 0; + max_pages = 0; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) { + ret = -EFAULT; + goto out; + } + + nr = rds_pages_in_vec(&vec); + if (nr == 0) { + ret = -EINVAL; + goto out; + } + + max_pages = max(nr, max_pages); + nr_pages += nr; + } + + pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + ret = -ENOMEM; + goto out; + } + + op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); + if (op == NULL) { + ret = -ENOMEM; + goto out; + } + + op->r_write = !!(args->flags & RDS_RDMA_READWRITE); + op->r_fence = !!(args->flags & RDS_RDMA_FENCE); + op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->r_recverr = rs->rs_recverr; + WARN_ON(!nr_pages); + sg_init_table(op->r_sg, nr_pages); + + if (op->r_notify || op->r_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->r_notifier) { + ret = -ENOMEM; + goto out; + } + op->r_notifier->n_user_token = args->user_token; + op->r_notifier->n_status = RDS_RDMA_SUCCESS; + } + + /* The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. + * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->r_key = rds_rdma_cookie_key(args->cookie); + op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->r_key); + + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) { + ret = -EFAULT; + goto out; + } + + nr = rds_pages_in_vec(&vec); + if (nr == 0) { + ret = -EINVAL; + goto out; + } + + rs->rs_user_addr = vec.addr; + rs->rs_user_bytes = vec.bytes; + + /* did the user change the vec under us? */ + if (nr > max_pages || op->r_nents + nr > nr_pages) { + ret = -EINVAL; + goto out; + } + /* If it's a WRITE operation, we want to pin the pages for reading. + * If it's a READ operation, we need to pin the pages for writing. + */ + ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + if (ret < 0) + goto out; + + rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", + nr_bytes, nr, vec.bytes, vec.addr); + + nr_bytes += vec.bytes; + + for (j = 0; j < nr; j++) { + unsigned int offset = vec.addr & ~PAGE_MASK; + + sg = &op->r_sg[op->r_nents + j]; + sg_set_page(sg, pages[j], + min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + offset); + + rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", + sg->offset, sg->length, vec.addr, vec.bytes); + + vec.addr += sg->length; + vec.bytes -= sg->length; + } + + op->r_nents += nr; + } + + + if (nr_bytes > args->remote_vec.bytes) { + rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", + nr_bytes, + (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->r_bytes = nr_bytes; + + ret = 0; +out: + kfree(pages); + if (ret) { + if (op) + rds_rdma_free_op(op); + op = ERR_PTR(ret); + } + return op; +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_op *op; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->m_rdma_op != NULL) + return -EINVAL; + + op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); + if (IS_ERR(op)) + return PTR_ERR(op); + rds_stats_inc(s_send_rdma); + rm->m_rdma_op = op; + return 0; +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + unsigned long flags; + struct rds_mr *mr; + u32 r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) + || rm->m_rdma_cookie != 0) + return -EINVAL; + + memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); + + /* We are reusing a previously mapped MR here. Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (mr == NULL) + err = -EINVAL; /* invalid r_key */ + else + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->m_rdma_mr = mr; + } + return err; +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the <R_Key,offset> pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. + */ +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) + || rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); +} diff --git a/net/rds/rdma.h b/net/rds/rdma.h new file mode 100644 index 000000000000..425512098b0b --- /dev/null +++ b/net/rds/rdma.h @@ -0,0 +1,84 @@ +#ifndef _RDS_RDMA_H +#define _RDS_RDMA_H + +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/scatterlist.h> + +#include "rds.h" + +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. + */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +struct rds_rdma_op { + u32 r_key; + u64 r_remote_addr; + unsigned int r_write:1; + unsigned int r_fence:1; + unsigned int r_notify:1; + unsigned int r_recverr:1; + unsigned int r_mapped:1; + struct rds_notifier *r_notifier; + unsigned int r_bytes; + unsigned int r_nents; + unsigned int r_count; + struct scatterlist r_sg[0]; +}; + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rds_rdma_op *ro); +void rds_rdma_send_complete(struct rds_message *rm, int); + +extern void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} + +#endif diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 000000000000..7b19024f9706 --- /dev/null +++ b/net/rds/rdma_transport.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2009 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <rdma/rdma_cm.h> + +#include "rdma_transport.h" + +static struct rdma_cm_id *rds_iw_listen_id; + +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rds_connection *conn = cm_id->context; + struct rds_transport *trans; + int ret = 0; + + rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, + event->event); + + if (cm_id->device->node_type == RDMA_NODE_RNIC) + trans = &rds_iw_transport; + else + trans = &rds_ib_transport; + + /* Prevent shutdown from tearing down the connection + * while we're executing. */ + if (conn) { + mutex_lock(&conn->c_cm_lock); + + /* If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely */ + if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { + /* Reject incoming connections while we're tearing + * down an existing one. */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* XXX worry about racing with listen acceptance */ + ret = trans->cm_initiate_connect(cm_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection " + "%pI4->%pI4\n", &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + + default: + /* things like device disconnect? */ + printk(KERN_ERR "unknown event %u\n", event->event); + BUG(); + break; + } + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + + rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + + return ret; +} + +static int __init rds_rdma_listen_init(void) +{ + struct sockaddr_in sin; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_create_id() returned %d\n", ret); + goto out; + } + + sin.sin_family = PF_INET, + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_PORT); + + /* + * XXX I bet this binds the cm_id to a device. If we want to support + * fail-over we'll have to take this into consideration. + */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_bind_addr() returned %d\n", ret); + goto out; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + printk(KERN_ERR "RDS/IW: failed to setup listener, " + "rdma_listen() returned %d\n", ret); + goto out; + } + + rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + + rds_iw_listen_id = cm_id; + cm_id = NULL; +out: + if (cm_id) + rdma_destroy_id(cm_id); + return ret; +} + +static void rds_rdma_listen_stop(void) +{ + if (rds_iw_listen_id) { + rdsdebug("cm %p\n", rds_iw_listen_id); + rdma_destroy_id(rds_iw_listen_id); + rds_iw_listen_id = NULL; + } +} + +int __init rds_rdma_init(void) +{ + int ret; + + ret = rds_rdma_listen_init(); + if (ret) + goto out; + + ret = rds_iw_init(); + if (ret) + goto err_iw_init; + + ret = rds_ib_init(); + if (ret) + goto err_ib_init; + + goto out; + +err_ib_init: + rds_iw_exit(); +err_iw_init: + rds_rdma_listen_stop(); +out: + return ret; +} + +void rds_rdma_exit(void) +{ + /* stop listening first to ensure no new connections are attempted */ + rds_rdma_listen_stop(); + rds_ib_exit(); + rds_iw_exit(); +} + diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 000000000000..2f2c7d976c21 --- /dev/null +++ b/net/rds/rdma_transport.h @@ -0,0 +1,28 @@ +#ifndef _RDMA_TRANSPORT_H +#define _RDMA_TRANSPORT_H + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include "rds.h" + +#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rds_rdma_conn_connect(struct rds_connection *conn); +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from rdma_transport.c */ +int rds_rdma_init(void); +void rds_rdma_exit(void); + +/* from ib.c */ +extern struct rds_transport rds_ib_transport; +int rds_ib_init(void); +void rds_ib_exit(void); + +/* from iw.c */ +extern struct rds_transport rds_iw_transport; +int rds_iw_init(void); +void rds_iw_exit(void); + +#endif diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 000000000000..060400704979 --- /dev/null +++ b/net/rds/rds.h @@ -0,0 +1,686 @@ +#ifndef _RDS_RDS_H +#define _RDS_RDS_H + +#include <net/sock.h> +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <rdma/rdma_cm.h> +#include <linux/mutex.h> +#include <linux/rds.h> + +#include "info.h" + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + */ +#define RDS_PORT 18634 + +#ifdef DEBUG +#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) +#else +/* sigh, pr_debug() causes unused variable warnings */ +static inline void __attribute__ ((format (printf, 1, 2))) +rdsdebug(char *fmt, ...) +{ +} +#endif + +/* XXX is there one of these somewhere? */ +#define ceil(x, y) \ + ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) + +#define RDS_FRAG_SHIFT 12 +#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) + +#define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) +#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) +#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rds_cong_map { + struct rb_node m_rb_node; + __be32 m_addr; + wait_queue_head_t m_waitq; + struct list_head m_conn_list; + unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. + */ +enum { + RDS_CONN_DOWN = 0, + RDS_CONN_CONNECTING, + RDS_CONN_DISCONNECTING, + RDS_CONN_UP, + RDS_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDS_LL_SEND_FULL 0 +#define RDS_RECONNECT_PENDING 1 + +struct rds_connection { + struct hlist_node c_hash_node; + __be32 c_laddr; + __be32 c_faddr; + unsigned int c_loopback:1; + struct rds_connection *c_passive; + + struct rds_cong_map *c_lcong; + struct rds_cong_map *c_fcong; + + struct mutex c_send_lock; /* protect send ring */ + struct rds_message *c_xmit_rm; + unsigned long c_xmit_sg; + unsigned int c_xmit_hdr_off; + unsigned int c_xmit_data_off; + unsigned int c_xmit_rdma_sent; + + spinlock_t c_lock; /* protect msg queues */ + u64 c_next_tx_seq; + struct list_head c_send_queue; + struct list_head c_retrans; + + u64 c_next_rx_seq; + + struct rds_transport *c_trans; + void *c_transport_data; + + atomic_t c_state; + unsigned long c_flags; + unsigned long c_reconnect_jiffies; + struct delayed_work c_send_w; + struct delayed_work c_recv_w; + struct delayed_work c_conn_w; + struct work_struct c_down_w; + struct mutex c_cm_lock; /* protect conn state & cm */ + + struct list_head c_map_item; + unsigned long c_map_queued; + unsigned long c_map_offset; + unsigned long c_map_bytes; + + unsigned int c_unacked_packets; + unsigned int c_unacked_bytes; + + /* Protocol version */ + unsigned int c_version; +}; + +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_MAX_ADV_CREDIT 127 + +/* + * Maximum space available for extension headers. + */ +#define RDS_HEADER_EXT_SPACE 16 + +struct rds_header { + __be64 h_sequence; + __be64 h_ack; + __be32 h_len; + __be16 h_sport; + __be16 h_dport; + u8 h_flags; + u8 h_credit; + u8 h_padding[4]; + __sum16 h_csum; + + u8 h_exthdr[RDS_HEADER_EXT_SPACE]; +}; + +/* + * Reserved - indicates end of extensions + */ +#define RDS_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + * NB: This is no longer true for IB, where we do a version + * negotiation during the connection setup phase (protocol + * version information is included in the RDMA CM private data). + */ +#define RDS_EXTHDR_VERSION 1 +struct rds_ext_header_version { + __be32 h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDS_EXTHDR_RDMA 2 +struct rds_ext_header_rdma { + __be32 h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination <R_Key,offset> of the requested RDMA + * operation. + */ +#define RDS_EXTHDR_RDMA_DEST 3 +struct rds_ext_header_rdma_dest { + __be32 h_rdma_rkey; + __be32 h_rdma_offset; +}; + +#define __RDS_EXTHDR_MAX 16 /* for now */ + +struct rds_incoming { + atomic_t i_refcount; + struct list_head i_item; + struct rds_connection *i_conn; + struct rds_header i_hdr; + unsigned long i_rx_jiffies; + __be32 i_saddr; + + rds_rdma_cookie_t i_rdma_cookie; +}; + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rds_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDS_MSG_ON_SOCK 1 +#define RDS_MSG_ON_CONN 2 +#define RDS_MSG_HAS_ACK_SEQ 3 +#define RDS_MSG_ACK_REQUIRED 4 +#define RDS_MSG_RETRANSMITTED 5 +#define RDS_MSG_MAPPED 6 +#define RDS_MSG_PAGEVEC 7 + +struct rds_message { + atomic_t m_refcount; + struct list_head m_sock_item; + struct list_head m_conn_item; + struct rds_incoming m_inc; + u64 m_ack_seq; + __be32 m_daddr; + unsigned long m_flags; + + /* Never access m_rs without holding m_rs_lock. + * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + spinlock_t m_rs_lock; + struct rds_sock *m_rs; + struct rds_rdma_op *m_rdma_op; + rds_rdma_cookie_t m_rdma_cookie; + struct rds_mr *m_rdma_mr; + unsigned int m_nents; + unsigned int m_count; + struct scatterlist m_sg[0]; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rds_notifier { + struct list_head n_list; + uint64_t n_user_token; + int n_status; +}; + +/** + * struct rds_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. + * + * @xmit_cong_map: This asks the transport to send the local bitmap down the + * given connection. XXX get a better story about the bitmap + * flag and header. + */ + +struct rds_transport { + char t_name[TRANSNAMSIZ]; + struct list_head t_item; + struct module *t_owner; + unsigned int t_prefer_loopback:1; + + int (*laddr_check)(__be32 addr); + int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); + void (*conn_free)(void *data); + int (*conn_connect)(struct rds_connection *conn); + void (*conn_shutdown)(struct rds_connection *conn); + void (*xmit_prepare)(struct rds_connection *conn); + void (*xmit_complete)(struct rds_connection *conn); + int (*xmit)(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_cong_map)(struct rds_connection *conn, + struct rds_cong_map *map, unsigned long offset); + int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*recv)(struct rds_connection *conn); + int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, + size_t size); + void (*inc_purge)(struct rds_incoming *inc); + void (*inc_free)(struct rds_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rds_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, + struct rds_sock *rs, u32 *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +}; + +struct rds_sock { + struct sock rs_sk; + + u64 rs_user_addr; + u64 rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. + */ + struct rb_node rs_bound_node; + __be32 rs_bound_addr; + __be32 rs_conn_addr; + __be16 rs_bound_port; + __be16 rs_conn_port; + + /* + * This is only used to communicate the transport between bind and + * initiating connections. All other trans use is referenced through + * the connection. + */ + struct rds_transport *rs_transport; + + /* + * rds_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rds_connection *rs_conn; + + /* flag indicating we were congested or not */ + int rs_congested; + + /* rs_lock protects all these adjacent members before the newline */ + spinlock_t rs_lock; + struct list_head rs_send_queue; + u32 rs_snd_bytes; + int rs_rcv_bytes; + struct list_head rs_notify_queue; /* currently used for failed RDMAs */ + + /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived. + */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_head rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rds_release. + */ + rwlock_t rs_recv_lock; + struct list_head rs_recv_queue; + + /* just for stats reporting */ + struct list_head rs_item; + + /* these have their own lock */ + spinlock_t rs_rdma_lock; + struct rb_root rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; +}; + +static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) +{ + return container_of(sk, struct rds_sock, rs_sk); +} +static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) +{ + return &rs->rs_sk; +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. + */ +static inline int rds_sk_sndbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_sndbuf / 2; +} +static inline int rds_sk_rcvbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_rcvbuf / 2; +} + +struct rds_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_sem_contention; + uint64_t s_send_sem_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; +}; + +/* af_rds.c */ +void rds_sock_addref(struct rds_sock *rs); +void rds_sock_put(struct rds_sock *rs); +void rds_wake_sk_sleep(struct rds_sock *rs); +static inline void __rds_wake_sk_sleep(struct sock *sk) +{ + wait_queue_head_t *waitq = sk->sk_sleep; + + if (!sock_flag(sk, SOCK_DEAD) && waitq) + wake_up(waitq); +} +extern wait_queue_head_t rds_poll_waitq; + + +/* bind.c */ +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +void rds_remove_bound(struct rds_sock *rs); +struct rds_sock *rds_find_bound(__be32 addr, __be16 port); + +/* cong.c */ +int rds_cong_get_maps(struct rds_connection *conn); +void rds_cong_add_conn(struct rds_connection *conn); +void rds_cong_remove_conn(struct rds_connection *conn); +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); +void rds_cong_queue_updates(struct rds_cong_map *map); +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); +int rds_cong_updated_since(unsigned long *recent); +void rds_cong_add_socket(struct rds_sock *); +void rds_cong_remove_socket(struct rds_sock *); +void rds_cong_exit(void); +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); + +/* conn.c */ +int __init rds_conn_init(void); +void rds_conn_exit(void); +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_reset(struct rds_connection *conn); +void rds_conn_drop(struct rds_connection *conn); +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len); +void __rds_conn_error(struct rds_connection *conn, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +#define rds_conn_error(conn, fmt...) \ + __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_transition(struct rds_connection *conn, int old, int new) +{ + return atomic_cmpxchg(&conn->c_state, old, new) == old; +} + +static inline int +rds_conn_state(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state); +} + +static inline int +rds_conn_up(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_UP; +} + +static inline int +rds_conn_connecting(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; +} + +/* message.c */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); +struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, + size_t total_len); +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq); +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size); +void rds_message_inc_purge(struct rds_incoming *inc); +void rds_message_inc_free(struct rds_incoming *inc); +void rds_message_addref(struct rds_message *rm); +void rds_message_put(struct rds_message *rm); +void rds_message_wait(struct rds_message *rm); +void rds_message_unmapped(struct rds_message *rm); + +static inline void rds_message_make_checksum(struct rds_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); +} + +static inline int rds_message_verify_checksum(const struct rds_header *hdr) +{ + return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; +} + + +/* page.c */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp); +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user); +#define rds_page_copy_to_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 1) +#define rds_page_copy_from_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 0) +void rds_page_exit(void); + +/* recv.c */ +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr); +void rds_inc_addref(struct rds_incoming *inc); +void rds_inc_put(struct rds_incoming *inc); +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp, enum km_type km); +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags); +void rds_clear_recv_queue(struct rds_sock *rs); +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip); + +/* send.c */ +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len); +void rds_send_reset(struct rds_connection *conn); +int rds_send_xmit(struct rds_connection *conn); +struct sockaddr_in; +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked); +int rds_send_acked_before(struct rds_connection *conn, u64 seq); +void rds_send_remove_from_sock(struct list_head *messages, int status); +int rds_send_pong(struct rds_connection *conn, __be16 dport); +struct rds_message *rds_send_get_message(struct rds_connection *, + struct rds_rdma_op *); + +/* rdma.c */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); + +/* stats.c */ +DECLARE_PER_CPU(struct rds_statistics, rds_stats); +#define rds_stats_inc_which(which, member) do { \ + per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) +#define rds_stats_add_which(which, member, count) do { \ + per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) +int __init rds_stats_init(void); +void rds_stats_exit(void); +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, char **names, size_t nr); + +/* sysctl.c */ +int __init rds_sysctl_init(void); +void rds_sysctl_exit(void); +extern unsigned long rds_sysctl_sndbuf_min; +extern unsigned long rds_sysctl_sndbuf_default; +extern unsigned long rds_sysctl_sndbuf_max; +extern unsigned long rds_sysctl_reconnect_min_jiffies; +extern unsigned long rds_sysctl_reconnect_max_jiffies; +extern unsigned int rds_sysctl_max_unacked_packets; +extern unsigned int rds_sysctl_max_unacked_bytes; +extern unsigned int rds_sysctl_ping_enable; +extern unsigned long rds_sysctl_trace_flags; +extern unsigned int rds_sysctl_trace_level; + +/* threads.c */ +int __init rds_threads_init(void); +void rds_threads_exit(void); +extern struct workqueue_struct *rds_wq; +void rds_connect_worker(struct work_struct *); +void rds_shutdown_worker(struct work_struct *); +void rds_send_worker(struct work_struct *); +void rds_recv_worker(struct work_struct *); +void rds_connect_complete(struct rds_connection *conn); + +/* transport.c */ +int rds_trans_register(struct rds_transport *trans); +void rds_trans_unregister(struct rds_transport *trans); +struct rds_transport *rds_trans_get_preferred(__be32 addr); +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); +int __init rds_trans_init(void); +void rds_trans_exit(void); + +#endif diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 000000000000..f2118c51cfa3 --- /dev/null +++ b/net/rds/recv.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <net/sock.h> +#include <linux/in.h> + +#include "rds.h" +#include "rdma.h" + +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr) +{ + atomic_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = conn; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; +} + +void rds_inc_addref(struct rds_incoming *inc) +{ + rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + atomic_inc(&inc->i_refcount); +} + +void rds_inc_put(struct rds_incoming *inc) +{ + rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + if (atomic_dec_and_test(&inc->i_refcount)) { + BUG_ON(!list_empty(&inc->i_item)); + + inc->i_conn->c_trans->inc_free(inc); + } +} + +static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, + struct rds_cong_map *map, + int delta, __be16 port) +{ + int now_congested; + + if (delta == 0) + return; + + rs->rs_rcv_bytes += delta; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); + + rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + "now_cong %d delta %d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, + rds_sk_rcvbuf(rs), now_congested, delta); + + /* wasn't -> am congested */ + if (!rs->rs_congested && now_congested) { + rs->rs_congested = 1; + rds_cong_set_bit(map, port); + rds_cong_queue_updates(map); + } + /* was -> aren't congested */ + /* Require more free space before reporting uncongested to prevent + bouncing cong/uncong state too often */ + else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { + rs->rs_congested = 0; + rds_cong_clear_bit(map, port); + rds_cong_queue_updates(map); + } + + /* do nothing if no change in cong state */ +} + +/* + * Process all extension headers that come with this message. + */ +static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) +{ + struct rds_header *hdr = &inc->i_hdr; + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + struct rds_ext_header_rdma rdma; + struct rds_ext_header_rdma_dest rdma_dest; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_RDMA: + rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); + break; + + case RDS_EXTHDR_RDMA_DEST: + /* We ignore the size for now. We could stash it + * somewhere and use it for error checking. */ + inc->i_rdma_cookie = rds_rdma_make_cookie( + be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), + be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); + + break; + } + } +} + +/* + * The transport must make sure that this is serialized against other + * rx and conn reset on this specific conn. + * + * We currently assert that only one fragmented message will be sent + * down a connection at a time. This lets us reassemble in the conn + * instead of per-flow which means that we don't have to go digging through + * flows to tear down partial reassembly progress on conn failure and + * we save flow lookup and locking for each frag arrival. It does mean + * that small messages will wait behind large ones. Fragmenting at all + * is only to reduce the memory consumption of pre-posted buffers. + * + * The caller passes in saddr and daddr instead of us getting it from the + * conn. This lets loopback, who only has one conn for both directions, + * tell us which roles the addrs in the conn are playing for this message. + */ +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp, enum km_type km) +{ + struct rds_sock *rs = NULL; + struct sock *sk; + unsigned long flags; + + inc->i_conn = conn; + inc->i_rx_jiffies = jiffies; + + rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " + "flags 0x%x rx_jiffies %lu\n", conn, + (unsigned long long)conn->c_next_rx_seq, + inc, + (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), + be32_to_cpu(inc->i_hdr.h_len), + be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport), + inc->i_hdr.h_flags, + inc->i_rx_jiffies); + + /* + * Sequence numbers should only increase. Messages get their + * sequence number as they're queued in a sending conn. They + * can be dropped, though, if the sending socket is closed before + * they hit the wire. So sequence numbers can skip forward + * under normal operation. They can also drop back in the conn + * failover case as previously sent messages are resent down the + * new instance of a conn. We drop those, otherwise we have + * to assume that the next valid seq does not come after a + * hole in the fragment stream. + * + * The headers don't give us a way to realize if fragments of + * a message have been dropped. We assume that frags that arrive + * to a flow are part of the current message on the flow that is + * being reassembled. This means that senders can't drop messages + * from the sending conn until all their frags are sent. + * + * XXX we could spend more on the wire to get more robust failure + * detection, arguably worth it to avoid data corruption. + */ + if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq + && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + rds_stats_inc(s_recv_drop_old_seq); + goto out; + } + conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + + if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + rds_stats_inc(s_recv_ping); + rds_send_pong(conn, inc->i_hdr.h_sport); + goto out; + } + + rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + if (rs == NULL) { + rds_stats_inc(s_recv_drop_no_sock); + goto out; + } + + /* Process extension headers */ + rds_recv_incoming_exthdrs(inc, rs); + + /* We can be racing with rds_release() which marks the socket dead. */ + sk = rds_rs_to_sk(rs); + + /* serialize with rds_release -> sock_orphan */ + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!sock_flag(sk, SOCK_DEAD)) { + rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); + rds_stats_inc(s_recv_queued); + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + rds_inc_addref(inc); + list_add_tail(&inc->i_item, &rs->rs_recv_queue); + __rds_wake_sk_sleep(sk); + } else { + rds_stats_inc(s_recv_drop_dead_sock); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + +out: + if (rs) + rds_sock_put(rs); +} + +/* + * be very careful here. This is being called as the condition in + * wait_event_*() needs to cope with being called many times. + */ +static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) +{ + unsigned long flags; + + if (*inc == NULL) { + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&rs->rs_recv_queue)) { + *inc = list_entry(rs->rs_recv_queue.next, + struct rds_incoming, + i_item); + rds_inc_addref(*inc); + } + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + } + + return *inc != NULL; +} + +static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, + int drop) +{ + struct sock *sk = rds_rs_to_sk(rs); + int ret = 0; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&inc->i_item)) { + ret = 1; + if (drop) { + /* XXX make sure this i_conn is reliable */ + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); + return ret; +} + +/* + * Pull errors off the error queue. + * If msghdr is NULL, we will just purge the error queue. + */ +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) +{ + struct rds_notifier *notifier; + struct rds_rdma_notify cmsg; + unsigned int count = 0, max_messages = ~0U; + unsigned long flags; + LIST_HEAD(copy); + int err = 0; + + + /* put_cmsg copies to user space and thus may sleep. We can't do this + * with rs_lock held, so first grab as many notifications as we can stuff + * in the user provided cmsg buffer. We don't try to copy more, to avoid + * losing notifications - except when the buffer is so small that it wouldn't + * even hold a single notification. Then we give him as much of this single + * msg as we can squeeze in, and set MSG_CTRUNC. + */ + if (msghdr) { + max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); + if (!max_messages) + max_messages = 1; + } + + spin_lock_irqsave(&rs->rs_lock, flags); + while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { + notifier = list_entry(rs->rs_notify_queue.next, + struct rds_notifier, n_list); + list_move(¬ifier->n_list, ©); + count++; + } + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (!count) + return 0; + + while (!list_empty(©)) { + notifier = list_entry(copy.next, struct rds_notifier, n_list); + + if (msghdr) { + cmsg.user_token = notifier->n_user_token; + cmsg.status = notifier->n_status; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, + sizeof(cmsg), &cmsg); + if (err) + break; + } + + list_del_init(¬ifier->n_list); + kfree(notifier); + } + + /* If we bailed out because of an error in put_cmsg, + * we may be left with one or more notifications that we + * didn't process. Return them to the head of the list. */ + if (!list_empty(©)) { + spin_lock_irqsave(&rs->rs_lock, flags); + list_splice(©, &rs->rs_notify_queue); + spin_unlock_irqrestore(&rs->rs_lock, flags); + } + + return err; +} + +/* + * Queue a congestion notification + */ +static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) +{ + uint64_t notify = rs->rs_cong_notify; + unsigned long flags; + int err; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, + sizeof(notify), ¬ify); + if (err) + return err; + + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_notify &= ~notify; + spin_unlock_irqrestore(&rs->rs_lock, flags); + + return 0; +} + +/* + * Receive any control messages. + */ +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +{ + int ret = 0; + + if (inc->i_rdma_cookie) { + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, + sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + if (ret) + return ret; + } + + return 0; +} + +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + long timeo; + int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + struct sockaddr_in *sin; + struct rds_incoming *inc = NULL; + + /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ + timeo = sock_rcvtimeo(sk, nonblock); + + rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + + if (msg_flags & MSG_OOB) + goto out; + + /* If there are pending notifications, do those - and nothing else */ + if (!list_empty(&rs->rs_notify_queue)) { + ret = rds_notify_queue_get(rs, msg); + goto out; + } + + if (rs->rs_cong_notify) { + ret = rds_notify_cong(rs, msg); + goto out; + } + + while (1) { + if (!rds_next_incoming(rs, &inc)) { + if (nonblock) { + ret = -EAGAIN; + break; + } + + timeo = wait_event_interruptible_timeout(*sk->sk_sleep, + rds_next_incoming(rs, &inc), + timeo); + rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, + timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + break; + } + + rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + &inc->i_conn->c_faddr, + ntohs(inc->i_hdr.h_sport)); + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov, + size); + if (ret < 0) + break; + + /* + * if the message we just copied isn't at the head of the + * recv queue then someone else raced us to return it, try + * to get the next message. + */ + if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { + rds_inc_put(inc); + inc = NULL; + rds_stats_inc(s_recv_deliver_raced); + continue; + } + + if (ret < be32_to_cpu(inc->i_hdr.h_len)) { + if (msg_flags & MSG_TRUNC) + ret = be32_to_cpu(inc->i_hdr.h_len); + msg->msg_flags |= MSG_TRUNC; + } + + if (rds_cmsg_recv(inc, msg)) { + ret = -EFAULT; + goto out; + } + + rds_stats_inc(s_recv_delivered); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + break; + } + + if (inc) + rds_inc_put(inc); + +out: + return ret; +} + +/* + * The socket is being shut down and we're asked to drop messages that were + * queued for recvmsg. The caller has unbound the socket so the receive path + * won't queue any more incoming fragments or messages on the socket. + */ +void rds_clear_recv_queue(struct rds_sock *rs) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct rds_incoming *inc, *tmp; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +/* + * inc->i_saddr isn't used here because it is only set in the receive + * path. + */ +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip) +{ + struct rds_info_message minfo; + + minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo.laddr = daddr; + minfo.faddr = saddr; + minfo.lport = inc->i_hdr.h_dport; + minfo.fport = inc->i_hdr.h_sport; + } else { + minfo.laddr = saddr; + minfo.faddr = daddr; + minfo.lport = inc->i_hdr.h_sport; + minfo.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo, sizeof(minfo)); +} diff --git a/net/rds/send.c b/net/rds/send.c new file mode 100644 index 000000000000..1b37364656f0 --- /dev/null +++ b/net/rds/send.c @@ -0,0 +1,1003 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <net/sock.h> +#include <linux/in.h> +#include <linux/list.h> + +#include "rds.h" +#include "rdma.h" + +/* When transmitting messages in rds_send_xmit, we need to emerge from + * time to time and briefly release the CPU. Otherwise the softlock watchdog + * will kick our shin. + * Also, it seems fairer to not let one busy connection stall all the + * others. + * + * send_batch_count is the number of times we'll loop in send_xmit. Setting + * it to 0 will restore the old behavior (where we looped until we had + * drained the queue). + */ +static int send_batch_count = 64; +module_param(send_batch_count, int, 0444); +MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); + +/* + * Reset the send state. Caller must hold c_send_lock when calling here. + */ +void rds_send_reset(struct rds_connection *conn) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + + if (conn->c_xmit_rm) { + /* Tell the user the RDMA op is no longer mapped by the + * transport. This isn't entirely true (it's flushed out + * independently) but as the connection is down, there's + * no ongoing RDMA to/from that memory */ + rds_message_unmapped(conn->c_xmit_rm); + rds_message_put(conn->c_xmit_rm); + conn->c_xmit_rm = NULL; + } + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + + conn->c_map_queued = 0; + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + + /* Mark messages as retransmissions, and move them to the send q */ + spin_lock_irqsave(&conn->c_lock, flags); + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); + } + list_splice_init(&conn->c_retrans, &conn->c_send_queue); + spin_unlock_irqrestore(&conn->c_lock, flags); +} + +/* + * We're making the concious trade-off here to only send one message + * down the connection at a time. + * Pro: + * - tx queueing is a simple fifo list + * - reassembly is optional and easily done by transports per conn + * - no per flow rx lookup at all, straight to the socket + * - less per-frag memory and wire overhead + * Con: + * - queued acks can be delayed behind large messages + * Depends: + * - small message latency is higher behind queued large messages + * - large message latency isn't starved by intervening small sends + */ +int rds_send_xmit(struct rds_connection *conn) +{ + struct rds_message *rm; + unsigned long flags; + unsigned int tmp; + unsigned int send_quota = send_batch_count; + struct scatterlist *sg; + int ret = 0; + int was_empty = 0; + LIST_HEAD(to_be_dropped); + + /* + * sendmsg calls here after having queued its message on the send + * queue. We only have one task feeding the connection at a time. If + * another thread is already feeding the queue then we back off. This + * avoids blocking the caller and trading per-connection data between + * caches per message. + * + * The sem holder will issue a retry if they notice that someone queued + * a message after they stopped walking the send queue but before they + * dropped the sem. + */ + if (!mutex_trylock(&conn->c_send_lock)) { + rds_stats_inc(s_send_sem_contention); + ret = -ENOMEM; + goto out; + } + + if (conn->c_trans->xmit_prepare) + conn->c_trans->xmit_prepare(conn); + + /* + * spin trying to push headers and data down the connection until + * the connection doens't make forward progress. + */ + while (--send_quota) { + /* + * See if need to send a congestion map update if we're + * between sending messages. The send_sem protects our sole + * use of c_map_offset and _bytes. + * Note this is used only by transports that define a special + * xmit_cong_map function. For all others, we create allocate + * a cong_map message and treat it just like any other send. + */ + if (conn->c_map_bytes) { + ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, + conn->c_map_offset); + if (ret <= 0) + break; + + conn->c_map_offset += ret; + conn->c_map_bytes -= ret; + if (conn->c_map_bytes) + continue; + } + + /* If we're done sending the current message, clear the + * offset and S/G temporaries. + */ + rm = conn->c_xmit_rm; + if (rm != NULL && + conn->c_xmit_hdr_off == sizeof(struct rds_header) && + conn->c_xmit_sg == rm->m_nents) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + + /* Release the reference to the previous message. */ + rds_message_put(rm); + rm = NULL; + } + + /* If we're asked to send a cong map update, do so. + */ + if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { + if (conn->c_trans->xmit_cong_map != NULL) { + conn->c_map_offset = 0; + conn->c_map_bytes = sizeof(struct rds_header) + + RDS_CONG_MAP_BYTES; + continue; + } + + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; + } + + conn->c_xmit_rm = rm; + } + + /* + * Grab the next message from the send queue, if there is one. + * + * c_xmit_rm holds a ref while we're sending this message down + * the connction. We can use this ref while holding the + * send_sem.. rds_send_reset() is serialized with it. + */ + if (rm == NULL) { + unsigned int len; + + spin_lock_irqsave(&conn->c_lock, flags); + + if (!list_empty(&conn->c_send_queue)) { + rm = list_entry(conn->c_send_queue.next, + struct rds_message, + m_conn_item); + rds_message_addref(rm); + + /* + * Move the message from the send queue to the retransmit + * list right away. + */ + list_move_tail(&rm->m_conn_item, &conn->c_retrans); + } + + spin_unlock_irqrestore(&conn->c_lock, flags); + + if (rm == NULL) { + was_empty = 1; + break; + } + + /* Unfortunately, the way Infiniband deals with + * RDMA to a bad MR key is by moving the entire + * queue pair to error state. We cold possibly + * recover from that, but right now we drop the + * connection. + * Therefore, we never retransmit messages with RDMA ops. + */ + if (rm->m_rdma_op + && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + spin_lock_irqsave(&conn->c_lock, flags); + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + list_move(&rm->m_conn_item, &to_be_dropped); + spin_unlock_irqrestore(&conn->c_lock, flags); + rds_message_put(rm); + continue; + } + + /* Require an ACK every once in a while */ + len = ntohl(rm->m_inc.i_hdr.h_len); + if (conn->c_unacked_packets == 0 + || conn->c_unacked_bytes < len) { + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + rds_stats_inc(s_send_ack_required); + } else { + conn->c_unacked_bytes -= len; + conn->c_unacked_packets--; + } + + conn->c_xmit_rm = rm; + } + + /* + * Try and send an rdma message. Let's see if we can + * keep this simple and require that the transport either + * send the whole rdma or none of it. + */ + if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); + if (ret) + break; + conn->c_xmit_rdma_sent = 1; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || + conn->c_xmit_sg < rm->m_nents) { + ret = conn->c_trans->xmit(conn, rm, + conn->c_xmit_hdr_off, + conn->c_xmit_sg, + conn->c_xmit_data_off); + if (ret <= 0) + break; + + if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { + tmp = min_t(int, ret, + sizeof(struct rds_header) - + conn->c_xmit_hdr_off); + conn->c_xmit_hdr_off += tmp; + ret -= tmp; + } + + sg = &rm->m_sg[conn->c_xmit_sg]; + while (ret) { + tmp = min_t(int, ret, sg->length - + conn->c_xmit_data_off); + conn->c_xmit_data_off += tmp; + ret -= tmp; + if (conn->c_xmit_data_off == sg->length) { + conn->c_xmit_data_off = 0; + sg++; + conn->c_xmit_sg++; + BUG_ON(ret != 0 && + conn->c_xmit_sg == rm->m_nents); + } + } + } + } + + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + + if (conn->c_trans->xmit_complete) + conn->c_trans->xmit_complete(conn); + + /* + * We might be racing with another sender who queued a message but + * backed off on noticing that we held the c_send_lock. If we check + * for queued messages after dropping the sem then either we'll + * see the queued message or the queuer will get the sem. If we + * notice the queued message then we trigger an immediate retry. + * + * We need to be careful only to do this when we stopped processing + * the send queue because it was empty. It's the only way we + * stop processing the loop when the transport hasn't taken + * responsibility for forward progress. + */ + mutex_unlock(&conn->c_send_lock); + + if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { + /* We exhausted the send quota, but there's work left to + * do. Return and (re-)schedule the send worker. + */ + ret = -EAGAIN; + } + + if (ret == 0 && was_empty) { + /* A simple bit test would be way faster than taking the + * spin lock */ + spin_lock_irqsave(&conn->c_lock, flags); + if (!list_empty(&conn->c_send_queue)) { + rds_stats_inc(s_send_sem_queue_raced); + ret = -EAGAIN; + } + spin_unlock_irqrestore(&conn->c_lock, flags); + } +out: + return ret; +} + +static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) +{ + u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + assert_spin_locked(&rs->rs_lock); + + BUG_ON(rs->rs_snd_bytes < len); + rs->rs_snd_bytes -= len; + + if (rs->rs_snd_bytes == 0) + rds_stats_inc(s_send_queue_empty); +} + +static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, + is_acked_func is_acked) +{ + if (is_acked) + return is_acked(rm, ack); + return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; +} + +/* + * Returns true if there are no messages on the send and retransmit queues + * which have a sequence number greater than or equal to the given sequence + * number. + */ +int rds_send_acked_before(struct rds_connection *conn, u64 seq) +{ + struct rds_message *rm, *tmp; + int ret = 1; + + spin_lock(&conn->c_lock); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) + ret = 0; + break; + } + + list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { + if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) + ret = 0; + break; + } + + spin_unlock(&conn->c_lock); + + return ret; +} + +/* + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. + */ +void rds_rdma_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rds_rdma_op *ro; + struct rds_notifier *notifier; + + spin_lock(&rm->m_rs_lock); + + ro = rm->m_rdma_op; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ro && ro->r_notify && ro->r_notifier) { + notifier = ro->r_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->r_notifier = NULL; + } + + spin_unlock(&rm->m_rs_lock); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} + +/* + * This is the same as rds_rdma_send_complete except we + * don't do any locking - we have all the ingredients (message, + * socket, socket lock) and can just move the notifier. + */ +static inline void +__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +{ + struct rds_rdma_op *ro; + + ro = rm->m_rdma_op; + if (ro && ro->r_notify && ro->r_notifier) { + ro->r_notifier->n_status = status; + list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); + ro->r_notifier = NULL; + } + + /* No need to wake the app - caller does this */ +} + +/* + * This is called from the IB send completion when we detect + * a RDMA operation that failed with remote access error. + * So speed is not an issue here. + */ +struct rds_message *rds_send_get_message(struct rds_connection *conn, + struct rds_rdma_op *op) +{ + struct rds_message *rm, *tmp, *found = NULL; + unsigned long flags; + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (rm->m_rdma_op == op) { + atomic_inc(&rm->m_refcount); + found = rm; + goto out; + } + } + + list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { + if (rm->m_rdma_op == op) { + atomic_inc(&rm->m_refcount); + found = rm; + break; + } + } + +out: + spin_unlock_irqrestore(&conn->c_lock, flags); + + return found; +} + +/* + * This removes messages from the socket's list if they're on it. The list + * argument must be private to the caller, we must be able to modify it + * without locks. The messages must have a reference held for their + * position on the list. This function will drop that reference after + * removing the messages from the 'messages' list regardless of if it found + * the messages on the socket list or not. + */ +void rds_send_remove_from_sock(struct list_head *messages, int status) +{ + unsigned long flags = 0; /* silence gcc :P */ + struct rds_sock *rs = NULL; + struct rds_message *rm; + + local_irq_save(flags); + while (!list_empty(messages)) { + rm = list_entry(messages->next, struct rds_message, + m_conn_item); + list_del_init(&rm->m_conn_item); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the sock. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + * + * The message spinlock makes sure nobody clears rm->m_rs + * while we're messing with it. It does not prevent the + * message from being removed from the socket, though. + */ + spin_lock(&rm->m_rs_lock); + if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) + goto unlock_and_drop; + + if (rs != rm->m_rs) { + if (rs) { + spin_unlock(&rs->rs_lock); + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } + rs = rm->m_rs; + spin_lock(&rs->rs_lock); + sock_hold(rds_rs_to_sk(rs)); + } + + if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { + struct rds_rdma_op *ro = rm->m_rdma_op; + struct rds_notifier *notifier; + + list_del_init(&rm->m_sock_item); + rds_send_sndbuf_remove(rs, rm); + + if (ro && ro->r_notifier + && (status || ro->r_notify)) { + notifier = ro->r_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + rm->m_rdma_op->r_notifier = NULL; + } + rds_message_put(rm); + rm->m_rs = NULL; + } + +unlock_and_drop: + spin_unlock(&rm->m_rs_lock); + rds_message_put(rm); + } + + if (rs) { + spin_unlock(&rs->rs_lock); + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } + local_irq_restore(flags); +} + +/* + * Transports call here when they've determined that the receiver queued + * messages up to, and including, the given sequence number. Messages are + * moved to the retrans queue when rds_send_xmit picks them off the send + * queue. This means that in the TCP case, the message may not have been + * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked + * checks the RDS_MSG_HAS_ACK_SEQ bit. + * + * XXX It's not clear to me how this is safely serialized with socket + * destruction. Maybe it should bail if it sees SOCK_DEAD. + */ +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (!rds_send_is_acked(rm, ack, is_acked)) + break; + + list_move(&rm->m_conn_item, &list); + clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); + } + + /* order flag updates with spin locks */ + if (!list_empty(&list)) + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* now remove the messages from the sock list as needed */ + rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); +} + +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +{ + struct rds_message *rm, *tmp; + struct rds_connection *conn; + unsigned long flags; + LIST_HEAD(list); + int wake = 0; + + /* get all the messages we're dropping under the rs lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + wake = 1; + list_move(&rm->m_sock_item, &list); + rds_send_sndbuf_remove(rs, rm); + clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + + /* If this is a RDMA operation, notify the app. */ + __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); + } + + /* order flag updates with the rs lock */ + if (wake) + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (wake) + rds_wake_sk_sleep(rs); + + conn = NULL; + + /* now remove the messages from the conn list as needed */ + list_for_each_entry(rm, &list, m_sock_item) { + /* We do this here rather than in the loop above, so that + * we don't have to nest m_rs_lock under rs->rs_lock */ + spin_lock(&rm->m_rs_lock); + rm->m_rs = NULL; + spin_unlock(&rm->m_rs_lock); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the conn. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + */ + if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + continue; + + if (conn != rm->m_inc.i_conn) { + if (conn) + spin_unlock_irqrestore(&conn->c_lock, flags); + conn = rm->m_inc.i_conn; + spin_lock_irqsave(&conn->c_lock, flags); + } + + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + list_del_init(&rm->m_conn_item); + rds_message_put(rm); + } + } + + if (conn) + spin_unlock_irqrestore(&conn->c_lock, flags); + + while (!list_empty(&list)) { + rm = list_entry(list.next, struct rds_message, m_sock_item); + list_del_init(&rm->m_sock_item); + + rds_message_wait(rm); + rds_message_put(rm); + } +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDS_CANCEL_SENT_TO. + */ +static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_message *rm, __be16 sport, + __be16 dport, int *queued) +{ + unsigned long flags; + u32 len; + + if (*queued) + goto out; + + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* this is the only place which holds both the socket's rs_lock + * and the connection's c_lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. + */ + if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); + set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + rds_message_addref(rm); + rm->m_rs = rs; + + /* The code ordering is a little weird, but we're + trying to minimize the time we hold c_lock */ + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); + rm->m_inc.i_conn = conn; + rds_message_addref(rm); + + spin_lock(&conn->c_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + spin_unlock(&conn->c_lock); + + rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + spin_unlock_irqrestore(&rs->rs_lock, flags); +out: + return *queued; +} + +static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + /* As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->m_rdma_cookie and rm->m_rdma_mr. + */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + ret = rds_cmsg_rdma_args(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_DEST: + ret = rds_cmsg_rdma_dest(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_MAP: + ret = rds_cmsg_rdma_map(rs, rm, cmsg); + if (!ret) + *allocated_mr = 1; + break; + + default: + return -EINVAL; + } + + if (ret) + break; + } + + return ret; +} + +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + __be32 daddr; + __be16 dport; + struct rds_message *rm = NULL; + struct rds_connection *conn; + int ret = 0; + int queued = 0, allocated_mr = 0; + int nonblock = msg->msg_flags & MSG_DONTWAIT; + long timeo = sock_rcvtimeo(sk, nonblock); + + /* Mirror Linux UDP mirror of BSD error message compatibility */ + /* XXX: Perhaps MSG_MORE someday */ + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); + ret = -EOPNOTSUPP; + goto out; + } + + if (msg->msg_namelen) { + /* XXX fail non-unicast destination IPs? */ + if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + ret = -EINVAL; + goto out; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } else { + /* We only care about consistency with ->connect() */ + lock_sock(sk); + daddr = rs->rs_conn_addr; + dport = rs->rs_conn_port; + release_sock(sk); + } + + /* racing with another thread binding seems ok here */ + if (daddr == 0 || rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + rm = rds_message_copy_from_user(msg->msg_iov, payload_len); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + rm = NULL; + goto out; + } + + rm->m_daddr = daddr; + + /* Parse any control messages the user may have included. */ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) + goto out; + + /* rds_conn_create has a spinlock that runs with IRQ off. + * Caching the conn in the socket helps a lot. */ + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + conn = rs->rs_conn; + else { + conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + rs->rs_transport, + sock->sk->sk_allocation); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + rs->rs_conn = conn; + } + + if ((rm->m_rdma_cookie || rm->m_rdma_op) + && conn->c_trans->xmit_rdma == NULL) { + if (printk_ratelimit()) + printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + rm->m_rdma_op, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + /* If the connection is down, trigger a connect. We may + * have scheduled a delayed reconnect however - in this case + * we should not interfere. + */ + if (rds_conn_state(conn) == RDS_CONN_DOWN + && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + + ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); + if (ret) + goto out; + + while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + dport, &queued)) { + rds_stats_inc(s_send_queue_full); + /* XXX make sure this is reasonable */ + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + if (nonblock) { + ret = -EAGAIN; + goto out; + } + + timeo = wait_event_interruptible_timeout(*sk->sk_sleep, + rds_send_queue_rm(rs, conn, rm, + rs->rs_bound_port, + dport, + &queued), + timeo); + rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + goto out; + } + + /* + * By now we've committed to the send. We reuse rds_send_worker() + * to retry sends in the rds thread if the transport asks us to. + */ + rds_stats_inc(s_send_queued); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_worker(&conn->c_send_w.work); + + rds_message_put(rm); + return payload_len; + +out: + /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. + * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN + * or in any other way, we need to destroy the MR again */ + if (allocated_mr) + rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); + + if (rm) + rds_message_put(rm); + return ret; +} + +/* + * Reply to a ping packet. + */ +int +rds_send_pong(struct rds_connection *conn, __be16 dport) +{ + struct rds_message *rm; + unsigned long flags; + int ret = 0; + + rm = rds_message_alloc(0, GFP_ATOMIC); + if (rm == NULL) { + ret = -ENOMEM; + goto out; + } + + rm->m_daddr = conn->c_faddr; + + /* If the connection is down, trigger a connect. We may + * have scheduled a delayed reconnect however - in this case + * we should not interfere. + */ + if (rds_conn_state(conn) == RDS_CONN_DOWN + && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + + ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); + if (ret) + goto out; + + spin_lock_irqsave(&conn->c_lock, flags); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + rds_message_addref(rm); + rm->m_inc.i_conn = conn; + + rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, + conn->c_next_tx_seq); + conn->c_next_tx_seq++; + spin_unlock_irqrestore(&conn->c_lock, flags); + + rds_stats_inc(s_send_queued); + rds_stats_inc(s_send_pong); + + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_message_put(rm); + return 0; + +out: + if (rm) + rds_message_put(rm); + return ret; +} diff --git a/net/rds/stats.c b/net/rds/stats.c new file mode 100644 index 000000000000..637146893cf3 --- /dev/null +++ b/net/rds/stats.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "rds.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); + +/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ + +static char *rds_stat_names[] = { + "conn_reset", + "recv_drop_bad_checksum", + "recv_drop_old_seq", + "recv_drop_no_sock", + "recv_drop_dead_sock", + "recv_deliver_raced", + "recv_delivered", + "recv_queued", + "recv_immediate_retry", + "recv_delayed_retry", + "recv_ack_required", + "recv_rdma_bytes", + "recv_ping", + "send_queue_empty", + "send_queue_full", + "send_sem_contention", + "send_sem_queue_raced", + "send_immediate_retry", + "send_delayed_retry", + "send_drop_acked", + "send_ack_required", + "send_queued", + "send_rdma", + "send_rdma_bytes", + "send_pong", + "page_remainder_hit", + "page_remainder_miss", + "copy_to_user", + "copy_from_user", + "cong_update_queued", + "cong_update_received", + "cong_send_error", + "cong_send_blocked", +}; + +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, char **names, size_t nr) +{ + struct rds_info_counter ctr; + size_t i; + + for (i = 0; i < nr; i++) { + BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); + strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.value = values[i]; + + rds_info_copy(iter, &ctr, sizeof(ctr)); + } +} + +/* + * This gives global counters across all the transports. The strings + * are copied in so that the tool doesn't need knowledge of the specific + * stats that we're exporting. Some are pretty implementation dependent + * and may change over time. That doesn't stop them from being useful. + * + * This is the only function in the chain that knows about the byte granular + * length in userspace. It converts it to number of stat entries that the + * rest of the functions operate in. + */ +static void rds_stats_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof(struct rds_info_counter); + + if (avail < ARRAY_SIZE(rds_stat_names)) { + avail = 0; + goto trans; + } + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, + ARRAY_SIZE(rds_stat_names)); + avail -= ARRAY_SIZE(rds_stat_names); + +trans: + lens->each = sizeof(struct rds_info_counter); + lens->nr = rds_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rds_stat_names); +} + +void rds_stats_exit(void) +{ + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); +} + +int __init rds_stats_init(void) +{ + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); + return 0; +} diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 000000000000..307dc5c1be15 --- /dev/null +++ b/net/rds/sysctl.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + +#include "rds.h" + +static struct ctl_table_header *rds_sysctl_reg_table; + +static unsigned long rds_sysctl_reconnect_min = 1; +static unsigned long rds_sysctl_reconnect_max = ~0UL; + +unsigned long rds_sysctl_reconnect_min_jiffies; +unsigned long rds_sysctl_reconnect_max_jiffies = HZ; + +unsigned int rds_sysctl_max_unacked_packets = 8; +unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); + +unsigned int rds_sysctl_ping_enable = 1; + +static ctl_table rds_sysctl_rds_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "reconnect_min_delay_ms", + .data = &rds_sysctl_reconnect_min_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min, + .extra2 = &rds_sysctl_reconnect_max_jiffies, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "reconnect_max_delay_ms", + .data = &rds_sysctl_reconnect_max_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min_jiffies, + .extra2 = &rds_sysctl_reconnect_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unacked_packets", + .data = &rds_sysctl_max_unacked_packets, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_unacked_bytes", + .data = &rds_sysctl_max_unacked_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ping_enable", + .data = &rds_sysctl_ping_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0} +}; + +static struct ctl_path rds_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, + { } +}; + + +void rds_sysctl_exit(void) +{ + if (rds_sysctl_reg_table) + unregister_sysctl_table(rds_sysctl_reg_table); +} + +int __init rds_sysctl_init(void) +{ + rds_sysctl_reconnect_min = msecs_to_jiffies(1); + rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; + + rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); + if (rds_sysctl_reg_table == NULL) + return -ENOMEM; + return 0; +} diff --git a/net/rds/threads.c b/net/rds/threads.c new file mode 100644 index 000000000000..828a1bf9ea92 --- /dev/null +++ b/net/rds/threads.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/random.h> + +#include "rds.h" + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. + */ + +/* Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through c_send_lock, and with connection management callbacks + * via c_cm_lock. + * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. + */ +struct workqueue_struct *rds_wq; + +void rds_connect_complete(struct rds_connection *conn) +{ + if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { + printk(KERN_WARNING "%s: Cannot transition to state UP, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); + return; + } + + rdsdebug("conn %p for %pI4 to %pI4 complete\n", + conn, &conn->c_laddr, &conn->c_faddr); + + conn->c_reconnect_jiffies = 0; + set_bit(0, &conn->c_map_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. + */ +static void rds_queue_reconnect(struct rds_connection *conn) +{ + unsigned long rand; + + rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + conn->c_reconnect_jiffies); + + set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (conn->c_reconnect_jiffies == 0) { + conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + return; + } + + get_random_bytes(&rand, sizeof(rand)); + rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + conn, &conn->c_laddr, &conn->c_faddr); + queue_delayed_work(rds_wq, &conn->c_conn_w, + rand % conn->c_reconnect_jiffies); + + conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + rds_sysctl_reconnect_max_jiffies); +} + +void rds_connect_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + int ret; + + clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + ret = conn->c_trans->conn_connect(conn); + rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); + + if (ret) { + if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) + rds_queue_reconnect(conn); + else + rds_conn_error(conn, "RDS: connect failed\n"); + } + } +} + +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + mutex_lock(&conn->c_send_lock); + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + mutex_unlock(&conn->c_send_lock); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work(&conn->c_conn_w); + if (!hlist_unhashed(&conn->c_hash_node)) + rds_queue_reconnect(conn); +} + +void rds_send_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = rds_send_xmit(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_send_immediate_retry); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_send_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_send_w, 2); + default: + break; + } + } +} + +void rds_recv_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = conn->c_trans->recv(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_recv_immediate_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_recv_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 2); + default: + break; + } + } +} + +void rds_threads_exit(void) +{ + destroy_workqueue(rds_wq); +} + +int __init rds_threads_init(void) +{ + rds_wq = create_singlethread_workqueue("krdsd"); + if (rds_wq == NULL) + return -ENOMEM; + + return 0; +} diff --git a/net/rds/transport.c b/net/rds/transport.c new file mode 100644 index 000000000000..767da61ad2f3 --- /dev/null +++ b/net/rds/transport.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/in.h> + +#include "rds.h" +#include "loop.h" + +static LIST_HEAD(rds_transports); +static DECLARE_RWSEM(rds_trans_sem); + +int rds_trans_register(struct rds_transport *trans) +{ + BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); + + down_write(&rds_trans_sem); + + list_add_tail(&trans->t_item, &rds_transports); + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); + + return 0; +} + +void rds_trans_unregister(struct rds_transport *trans) +{ + down_write(&rds_trans_sem); + + list_del_init(&trans->t_item); + printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); +} + +struct rds_transport *rds_trans_get_preferred(__be32 addr) +{ + struct rds_transport *trans; + struct rds_transport *ret = NULL; + + if (IN_LOOPBACK(ntohl(addr))) + return &rds_loop_transport; + + down_read(&rds_trans_sem); + list_for_each_entry(trans, &rds_transports, t_item) { + if (trans->laddr_check(addr) == 0) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +/* + * This returns the number of stats entries in the snapshot and only + * copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. + */ +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) + +{ + struct rds_transport *trans; + unsigned int total = 0; + unsigned int part; + + rds_info_iter_unmap(iter); + down_read(&rds_trans_sem); + + list_for_each_entry(trans, &rds_transports, t_item) { + if (trans->stats_info_copy == NULL) + continue; + + part = trans->stats_info_copy(iter, avail); + avail -= min(avail, part); + total += part; + } + + up_read(&rds_trans_sem); + + return total; +} + diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 01392649b462..650139626581 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1587,8 +1587,7 @@ static int __init rose_proto_init(void) char name[IFNAMSIZ]; sprintf(name, "rose%d", i); - dev = alloc_netdev(sizeof(struct net_device_stats), - name, rose_setup); + dev = alloc_netdev(0, name, rose_setup); if (!dev) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); rc = -ENOMEM; diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 12cfcf09556b..7dcf2569613b 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -57,7 +57,7 @@ static int rose_rebuild_header(struct sk_buff *skb) { #ifdef CONFIG_INET struct net_device *dev = skb->dev; - struct net_device_stats *stats = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; unsigned char *bp = (unsigned char *)skb->data; struct sk_buff *skbn; unsigned int len; @@ -133,7 +133,7 @@ static int rose_close(struct net_device *dev) static int rose_xmit(struct sk_buff *skb, struct net_device *dev) { - struct net_device_stats *stats = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); @@ -144,30 +144,28 @@ static int rose_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } -static struct net_device_stats *rose_get_stats(struct net_device *dev) -{ - return netdev_priv(dev); -} - static const struct header_ops rose_header_ops = { .create = rose_header, .rebuild= rose_rebuild_header, }; +static const struct net_device_ops rose_netdev_ops = { + .ndo_open = rose_open, + .ndo_stop = rose_close, + .ndo_start_xmit = rose_xmit, + .ndo_set_mac_address = rose_set_mac_address, +}; + void rose_setup(struct net_device *dev) { dev->mtu = ROSE_MAX_PACKET_SIZE - 2; - dev->hard_start_xmit = rose_xmit; - dev->open = rose_open; - dev->stop = rose_close; + dev->netdev_ops = &rose_netdev_ops; dev->header_ops = &rose_header_ops; dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; dev->addr_len = ROSE_ADDR_LEN; dev->type = ARPHRD_ROSE; - dev->set_mac_address = rose_set_mac_address; /* New-style flags. */ dev->flags = IFF_NOARP; - dev->get_stats = rose_get_stats; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0fc4a18fd96f..32009793307b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -444,6 +444,17 @@ out: } EXPORT_SYMBOL(qdisc_calculate_pkt_len); +void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + printk(KERN_WARNING + "%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} +EXPORT_SYMBOL(qdisc_warn_nonwc); + static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 9e43ed949167..d728d8111732 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1960,8 +1960,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) cbq_rmprio(q, cl); sch_tree_unlock(sch); - if (--cl->refcnt == 0) - cbq_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ return 0; } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e36e94ab4e10..7597fe146866 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -155,8 +155,11 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg) drr_purge_queue(cl); qdisc_class_hash_remove(&q->clhash, &cl->common); - if (--cl->refcnt == 0) - drr_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 45c31b1a4e1d..5022f9c1f34b 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -887,8 +887,7 @@ qdisc_peek_len(struct Qdisc *sch) skb = sch->ops->peek(sch); if (skb == NULL) { - if (net_ratelimit()) - printk("qdisc_peek_len: non work-conserving qdisc ?\n"); + qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } len = qdisc_pkt_len(skb); @@ -1140,8 +1139,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) hfsc_purge_queue(sch, cl); qdisc_class_hash_remove(&q->clhash, &cl->cl_common); - if (--cl->refcnt == 0) - hfsc_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; @@ -1642,8 +1644,7 @@ hfsc_dequeue(struct Qdisc *sch) skb = qdisc_dequeue_peeked(cl->qdisc); if (skb == NULL) { - if (net_ratelimit()) - printk("HFSC: Non-work-conserving qdisc ?\n"); + qdisc_warn_nonwc("HFSC", cl->qdisc); return NULL; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2f0f0b04d3fb..88cd02626621 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -35,6 +35,7 @@ #include <linux/list.h> #include <linux/compiler.h> #include <linux/rbtree.h> +#include <linux/workqueue.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -114,8 +115,6 @@ struct htb_class { struct tcf_proto *filter_list; int filter_cnt; - int warned; /* only one warning about non work conserving .. */ - /* token bucket parameters */ struct qdisc_rate_table *rate; /* rate table of the class itself */ struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ @@ -155,6 +154,10 @@ struct htb_sched { int direct_qlen; /* max qlen of above */ long direct_pkts; + +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + struct work_struct work; }; /* find class in global hash table using given handle */ @@ -658,7 +661,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, * htb_do_events - make mode changes to classes at the level * * Scans event queue for pending events and applies them. Returns time of - * next pending event (0 for no event in pq). + * next pending event (0 for no event in pq, q->now for too many events). * Note: Applied are events whose have cl->pq_key <= q->now. */ static psched_time_t htb_do_events(struct htb_sched *q, int level, @@ -686,8 +689,14 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } - /* too much load - let's continue on next jiffie (including above) */ - return q->now + 2 * PSCHED_TICKS_PER_SEC / HZ; + + /* too much load - let's continue after a break for scheduling */ + if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { + printk(KERN_WARNING "htb: too many events!\n"); + q->warned |= HTB_WARN_TOOMANYEVENTS; + } + + return q->now; } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL @@ -809,13 +818,8 @@ next: skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); if (likely(skb != NULL)) break; - if (!cl->warned) { - printk(KERN_WARNING - "htb: class %X isn't work conserving ?!\n", - cl->common.classid); - cl->warned = 1; - } + qdisc_warn_nonwc("htb", cl->un.leaf.q); htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> ptr[0]) + prio); cl = htb_lookup_leaf(q->row[level] + prio, prio, @@ -892,7 +896,10 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) } } sch->qstats.overlimits++; - qdisc_watchdog_schedule(&q->watchdog, next_event); + if (likely(next_event > q->now)) + qdisc_watchdog_schedule(&q->watchdog, next_event); + else + schedule_work(&q->work); fin: return skb; } @@ -962,6 +969,14 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, }; +static void htb_work_func(struct work_struct *work) +{ + struct htb_sched *q = container_of(work, struct htb_sched, work); + struct Qdisc *sch = q->watchdog.qdisc; + + __netif_schedule(qdisc_root(sch)); +} + static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); @@ -996,6 +1011,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) INIT_LIST_HEAD(q->drops + i); qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); skb_queue_head_init(&q->direct_queue); q->direct_qlen = qdisc_dev(sch)->tx_queue_len; @@ -1188,7 +1204,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) kfree(cl); } -/* always caled under BH & queue lock */ static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); @@ -1196,6 +1211,7 @@ static void htb_destroy(struct Qdisc *sch) struct htb_class *cl; unsigned int i; + cancel_work_sync(&q->work); qdisc_watchdog_cancel(&q->watchdog); /* This line used to be after htb_destroy_class call below and surprisingly it worked in 2.4. But it must precede it @@ -1259,8 +1275,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) if (last_child) htb_parent_to_leaf(q, cl, new_q); - if (--cl->refcnt == 0) - htb_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 7e151861794b..912731203047 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -202,7 +202,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) int i; if (!netif_is_multiqueue(qdisc_dev(sch))) - return -EINVAL; + return -EOPNOTSUPP; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a2f93c09f3cc..e22dfe85e43e 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -236,7 +236,6 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) struct tc_tbf_qopt *qopt; struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; - struct qdisc_rate_table *tmp; struct Qdisc *child = NULL; int max_size,n; @@ -295,13 +294,9 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - tmp = q->R_tab; - q->R_tab = rtab; - rtab = tmp; + swap(q->R_tab, rtab); + swap(q->P_tab, ptab); - tmp = q->P_tab; - q->P_tab = ptab; - ptab = tmp; sch_tree_unlock(sch); err = 0; done: diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 67715f4eb849..7ff548a30cfb 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -86,6 +86,9 @@ const char *sctp_cname(const sctp_subtype_t cid) case SCTP_CID_FWD_TSN: return "FWD_TSN"; + case SCTP_CID_AUTH: + return "AUTH"; + default: break; } @@ -135,6 +138,7 @@ static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = { "PRIMITIVE_ABORT", "PRIMITIVE_SEND", "PRIMITIVE_REQUESTHEARTBEAT", + "PRIMITIVE_ASCONF", }; /* Lookup primitive debug name. */ diff --git a/net/sctp/input.c b/net/sctp/input.c index 2e4a8646dbc3..d2e98803ffe3 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -83,14 +83,15 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) { struct sk_buff *list = skb_shinfo(skb)->frag_list; struct sctphdr *sh = sctp_hdr(skb); - __be32 cmp = sh->checksum; - __be32 val = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); + __le32 cmp = sh->checksum; + __le32 val; + __u32 tmp = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); for (; list; list = list->next) - val = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), - val); + tmp = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), + tmp); - val = sctp_end_cksum(val); + val = sctp_end_cksum(tmp); if (val != cmp) { /* CRC failure, dump it. */ @@ -142,7 +143,8 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (!skb_csum_unnecessary(skb) && sctp_rcv_checksum(skb) < 0) + if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && + sctp_rcv_checksum(skb) < 0) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ceaa4aa066ea..a63de3f7f185 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -97,8 +97,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_port = 0; - memcpy(&addr->a.v6.sin6_addr, &ifa->addr, - sizeof(struct in6_addr)); + ipv6_addr_copy(&addr->a.v6.sin6_addr, &ifa->addr); addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&sctp_local_addr_lock); @@ -628,9 +627,7 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { - struct inet_sock *inet = inet_sk(sk); struct sock *newsk; - struct inet_sock *newinet; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; @@ -640,17 +637,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, sock_init_data(NULL, newsk); - newsk->sk_type = SOCK_STREAM; - - newsk->sk_prot = sk->sk_prot; - newsk->sk_no_check = sk->sk_no_check; - newsk->sk_reuse = sk->sk_reuse; - - newsk->sk_destruct = inet_sock_destruct; - newsk->sk_family = PF_INET6; - newsk->sk_protocol = IPPROTO_SCTP; - newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; - newsk->sk_shutdown = sk->sk_shutdown; + sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; @@ -658,7 +645,6 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; - newinet = inet_sk(newsk); newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); @@ -666,26 +652,8 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ - newinet->sport = inet->sport; - newnp->saddr = np->saddr; - newnp->rcv_saddr = np->rcv_saddr; - newinet->dport = htons(asoc->peer.port); sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); - /* Init the ipv4 part of the socket since we can have sockets - * using v6 API for ipv4. - */ - newinet->uc_ttl = -1; - newinet->mc_loop = 1; - newinet->mc_ttl = 1; - newinet->mc_index = 0; - newinet->mc_list = NULL; - - if (ipv4_config.no_pmtu_disc) - newinet->pmtudisc = IP_PMTUDISC_DONT; - else - newinet->pmtudisc = IP_PMTUDISC_WANT; - sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { diff --git a/net/sctp/output.c b/net/sctp/output.c index 73639355157e..7d08f522ec84 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -49,13 +49,10 @@ #include <linux/ipv6.h> #include <linux/init.h> #include <net/inet_ecn.h> +#include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> -#ifndef TEST_FRAME -#include <net/tcp.h> -#endif /* TEST_FRAME (not defined) */ - #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> @@ -367,7 +364,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - __be32 crc32 = __constant_cpu_to_be32(0); struct sk_buff *nskb; struct sctp_chunk *chunk, *tmp; struct sock *sk; @@ -531,17 +527,16 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. */ - if (!(dst->dev->features & NETIF_F_NO_CSUM)) { - crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); - crc32 = sctp_end_cksum(crc32); + if (!sctp_checksum_disable && !(dst->dev->features & NETIF_F_NO_CSUM)) { + __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); + + /* 3) Put the resultant value into the checksum field in the + * common header, and leave the rest of the bits unchanged. + */ + sh->checksum = sctp_end_cksum(crc32); } else nskb->ip_summed = CHECKSUM_UNNECESSARY; - /* 3) Put the resultant value into the checksum field in the - * common header, and leave the rest of the bits unchanged. - */ - sh->checksum = crc32; - /* IP layer ECN support * From RFC 2481 * "The ECN-Capable Transport (ECT) bit would be set by the diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index bc411c896216..d765fc53e74d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -428,7 +428,8 @@ void sctp_retransmit_mark(struct sctp_outq *q, * retransmitting due to T3 timeout. */ if (reason == SCTP_RTXR_T3_RTX && - (jiffies - chunk->sent_at) < transport->last_rto) + time_before(jiffies, chunk->sent_at + + transport->last_rto)) continue; /* RFC 2960 6.2.1 Processing a Received SACK @@ -1757,6 +1758,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) struct sctp_chunk *chunk; struct list_head *lchunk, *temp; + if (!asoc->peer.prsctp_capable) + return; + /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. * diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c4986d0f7419..cb198af8887c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -589,46 +589,21 @@ static int sctp_v4_is_ce(const struct sk_buff *skb) static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { - struct inet_sock *inet = inet_sk(sk); - struct inet_sock *newinet; struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot); + struct inet_sock *newinet; if (!newsk) goto out; sock_init_data(NULL, newsk); - newsk->sk_type = SOCK_STREAM; - - newsk->sk_no_check = sk->sk_no_check; - newsk->sk_reuse = sk->sk_reuse; - newsk->sk_shutdown = sk->sk_shutdown; - - newsk->sk_destruct = inet_sock_destruct; - newsk->sk_family = PF_INET; - newsk->sk_protocol = IPPROTO_SCTP; - newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(newsk, SOCK_ZAPPED); newinet = inet_sk(newsk); - /* Initialize sk's sport, dport, rcv_saddr and daddr for - * getsockname() and getpeername() - */ - newinet->sport = inet->sport; - newinet->saddr = inet->saddr; - newinet->rcv_saddr = inet->rcv_saddr; - newinet->dport = htons(asoc->peer.port); newinet->daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; - newinet->pmtudisc = inet->pmtudisc; - newinet->id = asoc->next_tsn ^ jiffies; - - newinet->uc_ttl = -1; - newinet->mc_loop = 1; - newinet->mc_ttl = 1; - newinet->mc_index = 0; - newinet->mc_list = NULL; sk_refcnt_debug_inc(newsk); @@ -1413,4 +1388,6 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>"); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); +module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); +MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index fd8acb48c3f2..6851ee94e974 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -100,11 +100,11 @@ int sctp_chunk_iif(const struct sctp_chunk *chunk) */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, - __constant_htons(sizeof(struct sctp_paramhdr)), + cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, - __constant_htons(sizeof(struct sctp_paramhdr)), + cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize to initialize an op error inside a @@ -224,7 +224,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_ext += 2; } - chunksize += sizeof(aiparam); + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); + chunksize += vparam_len; /* Account for AUTH related parameters */ @@ -304,10 +306,12 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, if (sctp_prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); - aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; - aiparam.param_hdr.length = htons(sizeof(aiparam)); - aiparam.adaptation_ind = htonl(sp->adaptation_ind); - sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } /* Add SCTP-AUTH chunks to the parameter list */ if (sctp_auth_enable) { @@ -332,6 +336,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_inithdr_t initack; struct sctp_chunk *retval; union sctp_params addrs; + struct sctp_sock *sp; int addrs_len; sctp_cookie_param_t *cookie; int cookie_len; @@ -366,22 +371,24 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ + sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. */ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); - if (sctp_prsctp_enable) + if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); - if (sctp_addip_enable) { + if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } - chunksize += sizeof(aiparam); + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); if (asoc->peer.auth_capable) { auth_random = (sctp_paramhdr_t *)asoc->c.auth_random; @@ -432,10 +439,12 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); - aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; - aiparam.param_hdr.length = htons(sizeof(aiparam)); - aiparam.adaptation_ind = htonl(sctp_sk(asoc->base.sk)->adaptation_ind); - sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b5495aecab60..e2020eb2c8ca 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -434,7 +434,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { * */ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, - struct sctp_transport *transport) + struct sctp_transport *transport, + int is_hb) { /* The check for association's overall error counter exceeding the * threshold is done in the state function. @@ -461,9 +462,15 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be * used to provide an upper bound to this doubling operation. + * + * Special Case: the first HB doesn't trigger exponential backoff. + * The first unacknowleged HB triggers it. We do this with a flag + * that indicates that we have an outstanding HB. */ - transport->last_rto = transport->rto; - transport->rto = min((transport->rto * 2), transport->asoc->rto_max); + if (!is_hb || transport->hb_sent) { + transport->last_rto = transport->rto; + transport->rto = min((transport->rto * 2), transport->asoc->rto_max); + } } /* Worker routine to handle INIT command failure. */ @@ -621,6 +628,11 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, t->error_count = 0; t->asoc->overall_error_count = 0; + /* Clear the hb_sent flag to signal that we had a good + * acknowledgement. + */ + t->hb_sent = 0; + /* Mark the destination transport address as active if it is not so * marked. */ @@ -646,18 +658,6 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, sctp_transport_hold(t); } -/* Helper function to do a transport reset at the expiry of the hearbeat - * timer. - */ -static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds, - struct sctp_association *asoc, - struct sctp_transport *t) -{ - sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); - - /* Mark one strike against a transport. */ - sctp_do_8_2_transport_strike(asoc, t); -} /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, @@ -1458,12 +1458,19 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_STRIKE: /* Mark one strike against a transport. */ - sctp_do_8_2_transport_strike(asoc, cmd->obj.transport); + sctp_do_8_2_transport_strike(asoc, cmd->obj.transport, + 0); + break; + + case SCTP_CMD_TRANSPORT_IDLE: + t = cmd->obj.transport; + sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); break; - case SCTP_CMD_TRANSPORT_RESET: + case SCTP_CMD_TRANSPORT_HB_SENT: t = cmd->obj.transport; - sctp_cmd_transport_reset(commands, asoc, t); + sctp_do_8_2_transport_strike(asoc, t, 1); + t->hb_sent = 1; break; case SCTP_CMD_TRANSPORT_ON: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f88dfded0e3a..55a61aa69662 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -988,7 +988,9 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, /* Set transport error counter and association error counter * when sending heartbeat. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE, + SCTP_TRANSPORT(transport)); + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, SCTP_TRANSPORT(transport)); } sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE, @@ -4955,7 +4957,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( * to that address and not acknowledged within one RTO. * */ - sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, SCTP_TRANSPORT(arg)); return SCTP_DISPOSITION_CONSUME; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ff0a8f88de04..5fb3a8c9792e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3069,9 +3069,6 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int val; int assoc_id = 0; - if (optlen < sizeof(int)) - return -EINVAL; - if (optlen == sizeof(int)) { printk(KERN_WARNING "SCTP: Use of int in max_burst socket option deprecated\n"); @@ -3939,7 +3936,6 @@ SCTP_STATIC int sctp_do_peeloff(struct sctp_association *asoc, { struct sock *sk = asoc->base.sk; struct socket *sock; - struct inet_sock *inetsk; struct sctp_af *af; int err = 0; @@ -3954,18 +3950,18 @@ SCTP_STATIC int sctp_do_peeloff(struct sctp_association *asoc, if (err < 0) return err; - /* Populate the fields of the newsk from the oldsk and migrate the - * asoc to the newsk. - */ - sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); + sctp_copy_sock(sock->sk, sk, asoc); /* Make peeled-off sockets more like 1-1 accepted sockets. * Set the daddr and initialize id to something more random */ af = sctp_get_af_specific(asoc->peer.primary_addr.sa.sa_family); af->to_sk_daddr(&asoc->peer.primary_addr, sk); - inetsk = inet_sk(sock->sk); - inetsk->id = asoc->next_tsn ^ jiffies; + + /* Populate the fields of the newsk from the oldsk and migrate the + * asoc to the newsk. + */ + sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); *sockp = sock; @@ -5284,16 +5280,14 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_sock *sp; struct sctp_association *asoc; - if (len < sizeof(int)) - return -EINVAL; - if (len == sizeof(int)) { printk(KERN_WARNING "SCTP: Use of int in max_burst socket option deprecated\n"); printk(KERN_WARNING "SCTP: Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; - } else if (len == sizeof (struct sctp_assoc_value)) { + } else if (len >= sizeof(struct sctp_assoc_value)) { + len = sizeof(struct sctp_assoc_value); if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else @@ -5849,37 +5843,28 @@ static int sctp_get_port(struct sock *sk, unsigned short snum) } /* - * 3.1.3 listen() - UDP Style Syntax - * - * By default, new associations are not accepted for UDP style sockets. - * An application uses listen() to mark a socket as being able to - * accept new associations. + * Move a socket to LISTENING state. */ -SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) +SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; + struct crypto_hash *tfm = NULL; - /* Only UDP style sockets that are not peeled off are allowed to - * listen(). - */ - if (!sctp_style(sk, UDP)) - return -EINVAL; - - /* If backlog is zero, disable listening. */ - if (!backlog) { - if (sctp_sstate(sk, CLOSED)) - return 0; - - sctp_unhash_endpoint(ep); - sk->sk_state = SCTP_SS_CLOSED; - return 0; + /* Allocate HMAC for generating cookie. */ + if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { + tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + if (net_ratelimit()) { + printk(KERN_INFO + "SCTP: failed to load transform for %s: %ld\n", + sctp_hmac_alg, PTR_ERR(tfm)); + } + return -ENOSYS; + } + sctp_sk(sk)->hmac = tfm; } - /* Return if we are already listening. */ - if (sctp_sstate(sk, LISTENING)) - return 0; - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -5890,7 +5875,6 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) * extensions draft, but follows the practice as seen in TCP * sockets. * - * Additionally, turn off fastreuse flag since we are not listening */ sk->sk_state = SCTP_SS_LISTENING; if (!ep->base.bind_addr.port) { @@ -5901,113 +5885,71 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) sk->sk_state = SCTP_SS_CLOSED; return -EADDRINUSE; } - sctp_sk(sk)->bind_hash->fastreuse = 0; } - sctp_hash_endpoint(ep); - return 0; -} - -/* - * 4.1.3 listen() - TCP Style Syntax - * - * Applications uses listen() to ready the SCTP endpoint for accepting - * inbound associations. - */ -SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog) -{ - struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; - - /* If backlog is zero, disable listening. */ - if (!backlog) { - if (sctp_sstate(sk, CLOSED)) - return 0; - - sctp_unhash_endpoint(ep); - sk->sk_state = SCTP_SS_CLOSED; - return 0; - } - - if (sctp_sstate(sk, LISTENING)) - return 0; - - /* - * If a bind() or sctp_bindx() is not called prior to a listen() - * call that allows new associations to be accepted, the system - * picks an ephemeral port and will choose an address set equivalent - * to binding with a wildcard address. - * - * This is not currently spelled out in the SCTP sockets - * extensions draft, but follows the practice as seen in TCP - * sockets. - */ - sk->sk_state = SCTP_SS_LISTENING; - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) - return -EAGAIN; - } else - sctp_sk(sk)->bind_hash->fastreuse = 0; - sk->sk_max_ack_backlog = backlog; sctp_hash_endpoint(ep); return 0; } /* + * 4.1.3 / 5.1.3 listen() + * + * By default, new associations are not accepted for UDP style sockets. + * An application uses listen() to mark a socket as being able to + * accept new associations. + * + * On TCP style sockets, applications use listen() to ready the SCTP + * endpoint for accepting inbound associations. + * + * On both types of endpoints a backlog of '0' disables listening. + * * Move a socket to LISTENING state. */ int sctp_inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; - struct crypto_hash *tfm = NULL; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; int err = -EINVAL; if (unlikely(backlog < 0)) - goto out; + return err; sctp_lock_sock(sk); + /* Peeled-off sockets are not allowed to listen(). */ + if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) + goto out; + if (sock->state != SS_UNCONNECTED) goto out; - /* Allocate HMAC for generating cookie. */ - if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { - tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) { - if (net_ratelimit()) { - printk(KERN_INFO - "SCTP: failed to load transform for %s: %ld\n", - sctp_hmac_alg, PTR_ERR(tfm)); - } - err = -ENOSYS; + /* If backlog is zero, disable listening. */ + if (!backlog) { + if (sctp_sstate(sk, CLOSED)) goto out; - } - } - switch (sock->type) { - case SOCK_SEQPACKET: - err = sctp_seqpacket_listen(sk, backlog); - break; - case SOCK_STREAM: - err = sctp_stream_listen(sk, backlog); - break; - default: - break; + err = 0; + sctp_unhash_endpoint(ep); + sk->sk_state = SCTP_SS_CLOSED; + if (sk->sk_reuse) + sctp_sk(sk)->bind_hash->fastreuse = 1; + goto out; } - if (err) - goto cleanup; + /* If we are already listening, just update the backlog */ + if (sctp_sstate(sk, LISTENING)) + sk->sk_max_ack_backlog = backlog; + else { + err = sctp_listen_start(sk, backlog); + if (err) + goto out; + } - /* Store away the transform reference. */ - if (!sctp_sk(sk)->hmac) - sctp_sk(sk)->hmac = tfm; + err = 0; out: sctp_release_sock(sk); return err; -cleanup: - crypto_free_hash(tfm); - goto out; } /* @@ -6700,6 +6642,48 @@ done: sctp_skb_set_owner_r(skb, sk); } +void sctp_copy_sock(struct sock *newsk, struct sock *sk, + struct sctp_association *asoc) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_sock *newinet = inet_sk(newsk); + + newsk->sk_type = sk->sk_type; + newsk->sk_bound_dev_if = sk->sk_bound_dev_if; + newsk->sk_flags = sk->sk_flags; + newsk->sk_no_check = sk->sk_no_check; + newsk->sk_reuse = sk->sk_reuse; + + newsk->sk_shutdown = sk->sk_shutdown; + newsk->sk_destruct = inet_sock_destruct; + newsk->sk_family = sk->sk_family; + newsk->sk_protocol = IPPROTO_SCTP; + newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + newsk->sk_sndbuf = sk->sk_sndbuf; + newsk->sk_rcvbuf = sk->sk_rcvbuf; + newsk->sk_lingertime = sk->sk_lingertime; + newsk->sk_rcvtimeo = sk->sk_rcvtimeo; + newsk->sk_sndtimeo = sk->sk_sndtimeo; + + newinet = inet_sk(newsk); + + /* Initialize sk's sport, dport, rcv_saddr and daddr for + * getsockname() and getpeername() + */ + newinet->sport = inet->sport; + newinet->saddr = inet->saddr; + newinet->rcv_saddr = inet->rcv_saddr; + newinet->dport = htons(asoc->peer.port); + newinet->pmtudisc = inet->pmtudisc; + newinet->id = asoc->next_tsn ^ jiffies; + + newinet->uc_ttl = inet->uc_ttl; + newinet->mc_loop = 1; + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; +} + /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index e745c118f239..e5dde45c79d3 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -79,6 +79,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->rttvar = 0; peer->srtt = 0; peer->rto_pending = 0; + peer->hb_sent = 0; peer->fast_recovery = 0; peer->last_time_heard = jiffies; @@ -542,8 +543,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * congestion indications more than once every window of * data (or more loosely more than once every round-trip time). */ - if ((jiffies - transport->last_time_ecne_reduced) > - transport->rtt) { + if (time_after(jiffies, transport->last_time_ecne_reduced + + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; @@ -560,7 +561,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * to be done every RTO interval, we do it every hearbeat * interval. */ - if ((jiffies - transport->last_time_used) > transport->rto) + if (time_after(jiffies, transport->last_time_used + + transport->rto)) transport->cwnd = max(transport->cwnd/2, 4*transport->asoc->pathmtu); break; @@ -608,6 +610,7 @@ void sctp_transport_reset(struct sctp_transport *t) t->flight_size = 0; t->error_count = 0; t->rto_pending = 0; + t->hb_sent = 0; t->fast_recovery = 0; /* Initialize the state information for SFR-CACC */ diff --git a/net/socket.c b/net/socket.c index 35dd7371752a..47a3dc074eb0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -545,6 +545,18 @@ void sock_release(struct socket *sock) sock->file = NULL; } +int sock_tx_timestamp(struct msghdr *msg, struct sock *sk, + union skb_shared_tx *shtx) +{ + shtx->flags = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + shtx->hardware = 1; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + shtx->software = 1; + return 0; +} +EXPORT_SYMBOL(sock_tx_timestamp); + static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { @@ -595,33 +607,65 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, return result; } +static int ktime2ts(ktime_t kt, struct timespec *ts) +{ + if (kt.tv64) { + *ts = ktime_to_timespec(kt); + return 1; + } else { + return 0; + } +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - ktime_t kt = skb->tstamp; - - if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { - struct timeval tv; - /* Race occurred between timestamp enabling and packet - receiving. Fill in the current time for now. */ - if (kt.tv64 == 0) - kt = ktime_get_real(); - skb->tstamp = kt; - tv = ktime_to_timeval(kt); - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); - } else { - struct timespec ts; - /* Race occurred between timestamp enabling and packet - receiving. Fill in the current time for now. */ - if (kt.tv64 == 0) - kt = ktime_get_real(); - skb->tstamp = kt; - ts = ktime_to_timespec(kt); - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); + int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); + struct timespec ts[3]; + int empty = 1; + struct skb_shared_hwtstamps *shhwtstamps = + skb_hwtstamps(skb); + + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. */ + if (need_software_tstamp && skb->tstamp.tv64 == 0) + __net_timestamp(skb); + + if (need_software_tstamp) { + if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { + struct timeval tv; + skb_get_timestamp(skb, &tv); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(tv), &tv); + } else { + struct timespec ts; + skb_get_timestampns(skb, &ts); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, + sizeof(ts), &ts); + } + } + + + memset(ts, 0, sizeof(ts)); + if (skb->tstamp.tv64 && + sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) { + skb_get_timestampns(skb, ts + 0); + empty = 0; + } + if (shhwtstamps) { + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) && + ktime2ts(shhwtstamps->syststamp, ts + 1)) + empty = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && + ktime2ts(shhwtstamps->hwtstamp, ts + 2)) + empty = 0; } + if (!empty) + put_cmsg(msg, SOL_SOCKET, + SCM_TIMESTAMPING, sizeof(ts), &ts); } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5cbb404c4cdf..b49e434c094f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1215,6 +1215,23 @@ out: read_unlock(&sk->sk_callback_lock); } +static void xs_write_space(struct sock *sk) +{ + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + return; + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (unlikely(!(xprt = xprt_from_sock(sk)))) + return; + if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) + return; + + xprt_write_space(xprt); +} + /** * xs_udp_write_space - callback invoked when socket buffer space * becomes available @@ -1230,23 +1247,9 @@ static void xs_udp_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); /* from net/core/sock.c:sock_def_write_space */ - if (sock_writeable(sk)) { - struct socket *sock; - struct rpc_xprt *xprt; - - if (unlikely(!(sock = sk->sk_socket))) - goto out; - clear_bit(SOCK_NOSPACE, &sock->flags); - - if (unlikely(!(xprt = xprt_from_sock(sk)))) - goto out; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) - goto out; - - xprt_write_space(xprt); - } + if (sock_writeable(sk)) + xs_write_space(sk); - out: read_unlock(&sk->sk_callback_lock); } @@ -1265,23 +1268,9 @@ static void xs_tcp_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - struct socket *sock; - struct rpc_xprt *xprt; - - if (unlikely(!(sock = sk->sk_socket))) - goto out; - clear_bit(SOCK_NOSPACE, &sock->flags); + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + xs_write_space(sk); - if (unlikely(!(xprt = xprt_from_sock(sk)))) - goto out; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) - goto out; - - xprt_write_space(xprt); - } - - out: read_unlock(&sk->sk_callback_lock); } diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 972201cd5fa7..0b15d7250c40 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -61,7 +61,7 @@ static struct ctl_table_root net_sysctl_root = { static int net_ctl_ro_header_perms(struct ctl_table_root *root, struct nsproxy *namespaces, struct ctl_table *table) { - if (namespaces->net_ns == &init_net) + if (net_eq(namespaces->net_ns, &init_net)) return table->mode; else return table->mode & ~0222; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 3ddaff42d1bb..a3bfd4064912 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -119,7 +119,7 @@ static struct bclink *bclink = NULL; static struct link *bcl = NULL; static DEFINE_SPINLOCK(bc_lock); -char tipc_bclink_name[] = "multicast-link"; +const char tipc_bclink_name[] = "multicast-link"; static u32 buf_seqno(struct sk_buff *buf) @@ -800,7 +800,7 @@ int tipc_bclink_init(void) tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->b_ptr = &bcbearer->bearer; bcl->state = WORKING_WORKING; - sprintf(bcl->name, tipc_bclink_name); + strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); if (BCLINK_LOG_BUF_SIZE) { char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 2f2d731bc1c2..4c1771e95c99 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -70,7 +70,7 @@ struct port_list { struct tipc_node; -extern char tipc_bclink_name[]; +extern const char tipc_bclink_name[]; /** diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c index 29ecae851668..1885a7edb0c8 100644 --- a/net/tipc/dbg.c +++ b/net/tipc/dbg.c @@ -258,7 +258,7 @@ void tipc_printf(struct print_buf *pb, const char *fmt, ...) } if (pb->echo) - printk(print_string); + printk("%s", print_string); spin_unlock_bh(&print_lock); } diff --git a/net/tipc/node.c b/net/tipc/node.c index 20d98c56e152..2c24e7d6d950 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -703,7 +703,7 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) link_info.dest = htonl(tipc_own_addr & 0xfffff00); link_info.up = htonl(1); - sprintf(link_info.str, tipc_bclink_name); + strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME); tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); /* Add TLVs for any other links in scope */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d1b89820ab4f..baac91049b0e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1178,8 +1178,7 @@ out_unlock: unix_state_unlock(other); out: - if (skb) - kfree_skb(skb); + kfree_skb(skb); if (newsk) unix_release_sock(newsk, 0); if (other) diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 39701dec1dba..466e2d22d256 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -86,8 +86,10 @@ static int wanrouter_device_del_if(struct wan_device *wandev, static struct wan_device *wanrouter_find_device(char *name); static int wanrouter_delete_interface(struct wan_device *wandev, char *name); -static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); -static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); +static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock); +static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock); @@ -763,12 +765,14 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name) } static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock) { spin_lock_irqsave(lock, *smp_flags); } static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock) { spin_unlock_irqrestore(lock, *smp_flags); } diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index 267f7ff49827..c44d96b3a437 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -80,6 +80,7 @@ static struct proc_dir_entry *proc_router; * Iterator */ static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(kernel_lock) { struct wan_device *wandev; loff_t l = *pos; @@ -101,6 +102,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) } static void r_stop(struct seq_file *m, void *v) + __releases(kernel_lock) { unlock_kernel(); } diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c index cb3b4ad53683..5d149c1b5f0d 100644 --- a/net/wimax/op-msg.c +++ b/net/wimax/op-msg.c @@ -258,7 +258,6 @@ EXPORT_SYMBOL_GPL(wimax_msg_len); */ int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) { - int result; struct device *dev = wimax_dev->net_dev->dev.parent; void *msg = skb->data; size_t size = skb->len; @@ -266,11 +265,9 @@ int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); d_dump(2, dev, msg, size); - result = genlmsg_multicast(skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); - d_printf(1, dev, "CTX: genl multicast result %d\n", result); - if (result == -ESRCH) /* Nobody connected, ignore it */ - result = 0; /* btw, the skb is freed already */ - return result; + genlmsg_multicast(skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); + d_printf(1, dev, "CTX: genl multicast done\n"); + return 0; } EXPORT_SYMBOL_GPL(wimax_msg_send); diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 3869c0327882..a0ee76b52510 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -163,16 +163,12 @@ int wimax_gnl_re_state_change_send( struct device *dev = wimax_dev_to_dev(wimax_dev); d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", wimax_dev, report_skb); - if (report_skb == NULL) + if (report_skb == NULL) { + result = -ENOMEM; goto out; - genlmsg_end(report_skb, header); - result = genlmsg_multicast(report_skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); - if (result == -ESRCH) /* Nobody connected, ignore it */ - result = 0; /* btw, the skb is freed already */ - if (result < 0) { - dev_err(dev, "RE_STCH: Error sending: %d\n", result); - nlmsg_free(report_skb); } + genlmsg_end(report_skb, header); + genlmsg_multicast(report_skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); out: d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", wimax_dev, report_skb, result); diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 938a334c8dbc..dad43c24f695 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o -cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o +cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o cfg80211-$(CONFIG_WIRELESS_EXT) += wext-compat.o cfg80211-$(CONFIG_NL80211) += nl80211.o diff --git a/net/wireless/core.c b/net/wireless/core.c index b96fc0c3f1c4..17fe39049740 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -7,7 +7,6 @@ #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/mutex.h> #include <linux/list.h> #include <linux/nl80211.h> #include <linux/debugfs.h> @@ -31,18 +30,29 @@ MODULE_DESCRIPTION("wireless configuration support"); * only read the list, and that can happen quite * often because we need to do it for each command */ LIST_HEAD(cfg80211_drv_list); -DEFINE_MUTEX(cfg80211_drv_mutex); + +/* + * This is used to protect the cfg80211_drv_list, cfg80211_regdomain, + * country_ie_regdomain, the reg_beacon_list and the the last regulatory + * request receipt (last_request). + */ +DEFINE_MUTEX(cfg80211_mutex); /* for debugfs */ static struct dentry *ieee80211_debugfs_dir; -/* requires cfg80211_drv_mutex to be held! */ -static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy) +/* requires cfg80211_mutex to be held! */ +struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *drv; + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + list_for_each_entry(drv, &cfg80211_drv_list, list) { - if (drv->idx == wiphy) { + if (drv->wiphy_idx == wiphy_idx) { result = drv; break; } @@ -51,17 +61,44 @@ static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy) return result; } +int get_wiphy_idx(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *drv; + if (!wiphy) + return WIPHY_IDX_STALE; + drv = wiphy_to_dev(wiphy); + return drv->wiphy_idx; +} + /* requires cfg80211_drv_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) +{ + struct cfg80211_registered_device *drv; + + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + + drv = cfg80211_drv_by_wiphy_idx(wiphy_idx); + if (!drv) + return NULL; + return &drv->wiphy; +} + +/* requires cfg80211_mutex to be held! */ static struct cfg80211_registered_device * __cfg80211_drv_from_info(struct genl_info *info) { int ifindex; - struct cfg80211_registered_device *bywiphy = NULL, *byifidx = NULL; + struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL; struct net_device *dev; int err = -EINVAL; + assert_cfg80211_lock(); + if (info->attrs[NL80211_ATTR_WIPHY]) { - bywiphy = cfg80211_drv_by_wiphy( + bywiphyidx = cfg80211_drv_by_wiphy_idx( nla_get_u32(info->attrs[NL80211_ATTR_WIPHY])); err = -ENODEV; } @@ -78,14 +115,14 @@ __cfg80211_drv_from_info(struct genl_info *info) err = -ENODEV; } - if (bywiphy && byifidx) { - if (bywiphy != byifidx) + if (bywiphyidx && byifidx) { + if (bywiphyidx != byifidx) return ERR_PTR(-EINVAL); else - return bywiphy; /* == byifidx */ + return bywiphyidx; /* == byifidx */ } - if (bywiphy) - return bywiphy; + if (bywiphyidx) + return bywiphyidx; if (byifidx) return byifidx; @@ -98,7 +135,7 @@ cfg80211_get_dev_from_info(struct genl_info *info) { struct cfg80211_registered_device *drv; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); drv = __cfg80211_drv_from_info(info); /* if it is not an error we grab the lock on @@ -107,7 +144,7 @@ cfg80211_get_dev_from_info(struct genl_info *info) if (!IS_ERR(drv)) mutex_lock(&drv->mtx); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return drv; } @@ -118,7 +155,7 @@ cfg80211_get_dev_from_ifindex(int ifindex) struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV); struct net_device *dev; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); dev = dev_get_by_index(&init_net, ifindex); if (!dev) goto out; @@ -129,7 +166,7 @@ cfg80211_get_dev_from_ifindex(int ifindex) drv = ERR_PTR(-ENODEV); dev_put(dev); out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return drv; } @@ -143,16 +180,16 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { struct cfg80211_registered_device *drv; - int idx, taken = -1, result, digits; + int wiphy_idx, taken = -1, result, digits; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); /* prohibit calling the thing phy%d when %d is not its number */ - sscanf(newname, PHY_NAME "%d%n", &idx, &taken); - if (taken == strlen(newname) && idx != rdev->idx) { - /* count number of places needed to print idx */ + sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); + if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { + /* count number of places needed to print wiphy_idx */ digits = 1; - while (idx /= 10) + while (wiphy_idx /= 10) digits++; /* * deny the name if it is phy<idx> where <idx> is printed @@ -193,7 +230,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, result = 0; out_unlock: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); if (result == 0) nl80211_notify_dev_rename(rdev); @@ -220,26 +257,28 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) drv->ops = ops; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); - drv->idx = wiphy_counter++; + drv->wiphy_idx = wiphy_counter++; - if (unlikely(drv->idx < 0)) { + if (unlikely(!wiphy_idx_valid(drv->wiphy_idx))) { wiphy_counter--; - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); /* ugh, wrapped! */ kfree(drv); return NULL; } - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); /* give it a proper name */ - dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->idx); + dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->wiphy_idx); mutex_init(&drv->mtx); mutex_init(&drv->devlist_mtx); INIT_LIST_HEAD(&drv->netdev_list); + spin_lock_init(&drv->bss_lock); + INIT_LIST_HEAD(&drv->bss_list); device_initialize(&drv->wiphy.dev); drv->wiphy.dev.class = &ieee80211_class; @@ -259,6 +298,9 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; + if (WARN_ON(wiphy->max_scan_ssids < 1)) + return -EINVAL; + /* sanity check ifmodes */ WARN_ON(!ifmodes); ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1; @@ -273,10 +315,16 @@ int wiphy_register(struct wiphy *wiphy) sband->band = band; - if (!sband->n_channels || !sband->n_bitrates) { - WARN_ON(1); + if (WARN_ON(!sband->n_channels || !sband->n_bitrates)) + return -EINVAL; + + /* + * Since we use a u32 for rate bitmaps in + * ieee80211_get_response_rate, we cannot + * have more than 32 legacy rates. + */ + if (WARN_ON(sband->n_bitrates > 32)) return -EINVAL; - } for (i = 0; i < sband->n_channels; i++) { sband->channels[i].orig_flags = @@ -299,10 +347,10 @@ int wiphy_register(struct wiphy *wiphy) /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); /* set up regulatory info */ - wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE); + wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); res = device_add(&drv->wiphy.dev); if (res) @@ -317,9 +365,20 @@ int wiphy_register(struct wiphy *wiphy) if (IS_ERR(drv->wiphy.debugfsdir)) drv->wiphy.debugfsdir = NULL; + if (wiphy->custom_regulatory) { + struct regulatory_request request; + + request.wiphy_idx = get_wiphy_idx(wiphy); + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + request.alpha2[0] = '9'; + request.alpha2[1] = '9'; + + nl80211_send_reg_change_event(&request); + } + res = 0; out_unlock: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return res; } EXPORT_SYMBOL(wiphy_register); @@ -329,7 +388,7 @@ void wiphy_unregister(struct wiphy *wiphy) struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy); /* protect the device list */ - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); BUG_ON(!list_empty(&drv->netdev_list)); @@ -355,14 +414,17 @@ void wiphy_unregister(struct wiphy *wiphy) device_del(&drv->wiphy.dev); debugfs_remove(drv->wiphy.debugfsdir); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); } EXPORT_SYMBOL(wiphy_unregister); void cfg80211_dev_free(struct cfg80211_registered_device *drv) { + struct cfg80211_internal_bss *scan, *tmp; mutex_destroy(&drv->mtx); mutex_destroy(&drv->devlist_mtx); + list_for_each_entry_safe(scan, tmp, &drv->bss_list, list) + cfg80211_put_bss(&scan->pub); kfree(drv); } diff --git a/net/wireless/core.h b/net/wireless/core.h index f7fb9f413028..6acd483a61f8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -8,6 +8,9 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/netdevice.h> +#include <linux/kref.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> #include <net/genetlink.h> #include <net/wireless.h> #include <net/cfg80211.h> @@ -35,12 +38,20 @@ struct cfg80211_registered_device { enum environment_cap env; /* wiphy index, internal only */ - int idx; + int wiphy_idx; /* associate netdev list */ struct mutex devlist_mtx; struct list_head netdev_list; + /* BSSes/scanning */ + spinlock_t bss_lock; + struct list_head bss_list; + struct rb_root bss_tree; + u32 bss_generation; + struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + unsigned long suspend_at; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN))); @@ -53,9 +64,39 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) return container_of(wiphy, struct cfg80211_registered_device, wiphy); } -extern struct mutex cfg80211_drv_mutex; +/* Note 0 is valid, hence phy0 */ +static inline +bool wiphy_idx_valid(int wiphy_idx) +{ + return (wiphy_idx >= 0); +} + +extern struct mutex cfg80211_mutex; extern struct list_head cfg80211_drv_list; +static inline void assert_cfg80211_lock(void) +{ + WARN_ON(!mutex_is_locked(&cfg80211_mutex)); +} + +/* + * You can use this to mark a wiphy_idx as not having an associated wiphy. + * It guarantees cfg80211_drv_by_wiphy_idx(wiphy_idx) will return NULL + */ +#define WIPHY_IDX_STALE -1 + +struct cfg80211_internal_bss { + struct list_head list; + struct rb_node rbn; + unsigned long ts; + struct kref ref; + /* must be last because of priv member */ + struct cfg80211_bss pub; +}; + +struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx); +int get_wiphy_idx(struct wiphy *wiphy); + /* * This function returns a pointer to the driver * that the genl_info item that is passed refers to. @@ -63,13 +104,13 @@ extern struct list_head cfg80211_drv_list; * the driver's mutex! * * This means that you need to call cfg80211_put_dev() - * before being allowed to acquire &cfg80211_drv_mutex! + * before being allowed to acquire &cfg80211_mutex! * * This is necessary because we need to lock the global * mutex to get an item off the list safely, and then * we lock the drv mutex so it doesn't go away under us. * - * We don't want to keep cfg80211_drv_mutex locked + * We don't want to keep cfg80211_mutex locked * for all the time in order to allow requests on * other interfaces to go through at the same time. * @@ -79,6 +120,9 @@ extern struct list_head cfg80211_drv_list; extern struct cfg80211_registered_device * cfg80211_get_dev_from_info(struct genl_info *info); +/* requires cfg80211_drv_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); + /* identical to cfg80211_get_dev_from_info but only operate on ifindex */ extern struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(int ifindex); @@ -92,6 +136,11 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator setby); + +void cfg80211_bss_expire(struct cfg80211_registered_device *dev); +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs); #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 31b807af3235..ab9d8f14e151 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7,13 +7,13 @@ #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/mutex.h> #include <linux/list.h> #include <linux/if_ether.h> #include <linux/ieee80211.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <linux/netlink.h> +#include <linux/etherdevice.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "core.h" @@ -105,6 +105,12 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_HT_CAPABILITY] = { .type = NLA_BINARY, .len = NL80211_HT_CAPABILITY_LEN }, + + [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, + [NL80211_ATTR_IE] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, + [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, }; /* message building helper */ @@ -135,8 +141,10 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, if (!hdr) return -1; - NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx); NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + dev->wiphy.max_scan_ssids); nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) @@ -247,7 +255,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[0]; struct cfg80211_registered_device *dev; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); list_for_each_entry(dev, &cfg80211_drv_list, list) { if (++idx <= start) continue; @@ -258,7 +266,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) break; } } - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); cb->args[0] = idx; @@ -461,7 +469,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * struct cfg80211_registered_device *dev; struct wireless_dev *wdev; - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); list_for_each_entry(dev, &cfg80211_drv_list, list) { if (wp_idx < wp_start) { wp_idx++; @@ -488,7 +496,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; } out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); cb->args[0] = wp_idx; cb->args[1] = if_idx; @@ -738,7 +746,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; if (info->attrs[NL80211_ATTR_MAC]) @@ -804,30 +812,41 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) int err; struct net_device *dev; u8 key_idx; + int (*func)(struct wiphy *wiphy, struct net_device *netdev, + u8 key_index); if (!info->attrs[NL80211_ATTR_KEY_IDX]) return -EINVAL; key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]) { + if (key_idx < 4 || key_idx > 5) + return -EINVAL; + } else if (key_idx > 3) return -EINVAL; /* currently only support setting default key */ - if (!info->attrs[NL80211_ATTR_KEY_DEFAULT]) + if (!info->attrs[NL80211_ATTR_KEY_DEFAULT] && + !info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]) return -EINVAL; err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); if (err) return err; - if (!drv->ops->set_default_key) { + if (info->attrs[NL80211_ATTR_KEY_DEFAULT]) + func = drv->ops->set_default_key; + else + func = drv->ops->set_default_mgmt_key; + + if (!func) { err = -EOPNOTSUPP; goto out; } rtnl_lock(); - err = drv->ops->set_default_key(&drv->wiphy, dev, key_idx); + err = func(&drv->wiphy, dev, key_idx); rtnl_unlock(); out: @@ -863,7 +882,7 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; /* @@ -894,6 +913,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (params.key_len != 13) return -EINVAL; break; + case WLAN_CIPHER_SUITE_AES_CMAC: + if (params.key_len != 16) + return -EINVAL; + break; default: return -EINVAL; } @@ -928,7 +951,7 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_KEY_IDX]) key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 3) + if (key_idx > 5) return -EINVAL; if (info->attrs[NL80211_ATTR_MAC]) @@ -1182,6 +1205,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, nla_nest_end(msg, txrate); } + if (sinfo->filled & STATION_INFO_RX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS, + sinfo->rx_packets); + if (sinfo->filled & STATION_INFO_TX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, + sinfo->tx_packets); nla_nest_end(msg, sinfoattr); return genlmsg_end(msg, hdr); @@ -1876,6 +1905,19 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) int r; char *data = NULL; + /* + * You should only get this when cfg80211 hasn't yet initialized + * completely when built-in to the kernel right between the time + * window between nl80211_init() and regulatory_init(), if that is + * even possible. + */ + mutex_lock(&cfg80211_mutex); + if (unlikely(!cfg80211_regdomain)) { + mutex_unlock(&cfg80211_mutex); + return -EINPROGRESS; + } + mutex_unlock(&cfg80211_mutex); + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) return -EINVAL; @@ -1886,9 +1928,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) if (is_world_regdom(data)) return -EINVAL; #endif - mutex_lock(&cfg80211_drv_mutex); - r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY); - mutex_unlock(&cfg80211_drv_mutex); + + r = regulatory_hint_user(data); + return r; } @@ -2080,6 +2122,81 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) #undef FILL_IN_MESH_PARAM_IF_SET +static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr = NULL; + struct nlattr *nl_reg_rules; + unsigned int i; + int err = -EINVAL; + + mutex_lock(&cfg80211_mutex); + + if (!cfg80211_regdomain) + goto out; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + err = -ENOBUFS; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_REG); + if (!hdr) + goto nla_put_failure; + + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, + cfg80211_regdomain->alpha2); + + nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES); + if (!nl_reg_rules) + goto nla_put_failure; + + for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + struct nlattr *nl_reg_rule; + const struct ieee80211_reg_rule *reg_rule; + const struct ieee80211_freq_range *freq_range; + const struct ieee80211_power_rule *power_rule; + + reg_rule = &cfg80211_regdomain->reg_rules[i]; + freq_range = ®_rule->freq_range; + power_rule = ®_rule->power_rule; + + nl_reg_rule = nla_nest_start(msg, i); + if (!nl_reg_rule) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_ATTR_REG_RULE_FLAGS, + reg_rule->flags); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_START, + freq_range->start_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_END, + freq_range->end_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW, + freq_range->max_bandwidth_khz); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, + power_rule->max_antenna_gain); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP, + power_rule->max_eirp); + + nla_nest_end(msg, nl_reg_rule); + } + + nla_nest_end(msg, nl_reg_rules); + + genlmsg_end(msg, hdr); + err = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + err = -EMSGSIZE; +out: + mutex_unlock(&cfg80211_mutex); + return err; +} + static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1]; @@ -2135,9 +2252,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) BUG_ON(rule_idx != num_rules); - mutex_lock(&cfg80211_drv_mutex); + mutex_lock(&cfg80211_mutex); r = set_regdom(rd); - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); return r; bad_reg: @@ -2145,6 +2262,302 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } +static int nl80211_set_mgmt_extra_ie(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct mgmt_extra_ie_params params; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) + return -EINVAL; + params.subtype = nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]); + if (params.subtype > 15) + return -EINVAL; /* FC Subtype field is 4 bits (0..15) */ + + if (info->attrs[NL80211_ATTR_IE]) { + params.ies = nla_data(info->attrs[NL80211_ATTR_IE]); + params.ies_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + return err; + + if (drv->ops->set_mgmt_extra_ie) { + rtnl_lock(); + err = drv->ops->set_mgmt_extra_ie(&drv->wiphy, dev, ¶ms); + rtnl_unlock(); + } else + err = -EOPNOTSUPP; + + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + struct net_device *dev; + struct cfg80211_scan_request *request; + struct cfg80211_ssid *ssid; + struct ieee80211_channel *channel; + struct nlattr *attr; + struct wiphy *wiphy; + int err, tmp, n_ssids = 0, n_channels = 0, i; + enum ieee80211_band band; + size_t ie_len; + + err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); + if (err) + return err; + + wiphy = &drv->wiphy; + + if (!drv->ops->scan) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + + if (drv->scan_req) { + err = -EBUSY; + goto out_unlock; + } + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) + n_channels++; + if (!n_channels) { + err = -EINVAL; + goto out_unlock; + } + } else { + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + } + + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) + n_ssids++; + + if (n_ssids > wiphy->max_scan_ssids) { + err = -EINVAL; + goto out_unlock; + } + + if (info->attrs[NL80211_ATTR_IE]) + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + else + ie_len = 0; + + request = kzalloc(sizeof(*request) + + sizeof(*ssid) * n_ssids + + sizeof(channel) * n_channels + + ie_len, GFP_KERNEL); + if (!request) { + err = -ENOMEM; + goto out_unlock; + } + + request->channels = (void *)((char *)request + sizeof(*request)); + request->n_channels = n_channels; + if (n_ssids) + request->ssids = (void *)(request->channels + n_channels); + request->n_ssids = n_ssids; + if (ie_len) { + if (request->ssids) + request->ie = (void *)(request->ssids + n_ssids); + else + request->ie = (void *)(request->channels + n_channels); + } + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + /* user specified, bail out if channel not found */ + request->n_channels = n_channels; + i = 0; + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + request->channels[i] = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!request->channels[i]) { + err = -EINVAL; + goto out_free; + } + i++; + } + } else { + /* all channels */ + i = 0; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + request->channels[i] = &wiphy->bands[band]->channels[j]; + i++; + } + } + } + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) { + if (request->ssids[i].ssid_len > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr)); + request->ssids[i].ssid_len = nla_len(attr); + i++; + } + } + + if (info->attrs[NL80211_ATTR_IE]) { + request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy(request->ie, nla_data(info->attrs[NL80211_ATTR_IE]), + request->ie_len); + } + + request->ifidx = dev->ifindex; + request->wiphy = &drv->wiphy; + + drv->scan_req = request; + err = drv->ops->scan(&drv->wiphy, dev, request); + + out_free: + if (err) { + drv->scan_req = NULL; + kfree(request); + } + out_unlock: + rtnl_unlock(); + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, + struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_bss *res) +{ + void *hdr; + struct nlattr *bss; + + hdr = nl80211hdr_put(msg, pid, seq, flags, + NL80211_CMD_NEW_SCAN_RESULTS); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_SCAN_GENERATION, + rdev->bss_generation); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + + bss = nla_nest_start(msg, NL80211_ATTR_BSS); + if (!bss) + goto nla_put_failure; + if (!is_zero_ether_addr(res->bssid)) + NLA_PUT(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid); + if (res->information_elements && res->len_information_elements) + NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS, + res->len_information_elements, + res->information_elements); + if (res->tsf) + NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf); + if (res->beacon_interval) + NLA_PUT_U16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval); + NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability); + NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq); + + switch (rdev->wiphy.signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal); + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + NLA_PUT_U8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal); + break; + default: + break; + } + + nla_nest_end(msg, bss); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_scan(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg80211_registered_device *dev; + struct net_device *netdev; + struct cfg80211_internal_bss *scan; + int ifidx = cb->args[0]; + int start = cb->args[1], idx = 0; + int err; + + if (!ifidx) { + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + nl80211_fam.attrbuf, nl80211_fam.maxattr, + nl80211_policy); + if (err) + return err; + + if (!nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]) + return -EINVAL; + + ifidx = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]); + if (!ifidx) + return -EINVAL; + cb->args[0] = ifidx; + } + + netdev = dev_get_by_index(&init_net, ifidx); + if (!netdev) + return -ENODEV; + + dev = cfg80211_get_dev_from_ifindex(ifidx); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_put_netdev; + } + + spin_lock_bh(&dev->bss_lock); + cfg80211_bss_expire(dev); + + list_for_each_entry(scan, &dev->bss_list, list) { + if (++idx <= start) + continue; + if (nl80211_send_bss(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + dev, netdev, &scan->pub) < 0) { + idx--; + goto out; + } + } + + out: + spin_unlock_bh(&dev->bss_lock); + + cb->args[1] = idx; + err = skb->len; + cfg80211_put_dev(dev); + out_put_netdev: + dev_put(netdev); + + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -2231,7 +2644,6 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_get_station, .dumpit = nl80211_dump_station, .policy = nl80211_policy, - .flags = GENL_ADMIN_PERM, }, { .cmd = NL80211_CMD_SET_STATION, @@ -2283,6 +2695,12 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, }, { + .cmd = NL80211_CMD_GET_REG, + .doit = nl80211_get_reg, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + }, + { .cmd = NL80211_CMD_SET_REG, .doit = nl80211_set_reg, .policy = nl80211_policy, @@ -2306,12 +2724,35 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_MGMT_EXTRA_IE, + .doit = nl80211_set_mgmt_extra_ie, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_TRIGGER_SCAN, + .doit = nl80211_trigger_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_GET_SCAN, + .policy = nl80211_policy, + .dumpit = nl80211_dump_scan, + }, }; /* multicast groups */ static struct genl_multicast_group nl80211_config_mcgrp = { .name = "config", }; +static struct genl_multicast_group nl80211_scan_mcgrp = { + .name = "scan", +}; +static struct genl_multicast_group nl80211_regulatory_mcgrp = { + .name = "regulatory", +}; /* notification functions */ @@ -2331,6 +2772,121 @@ void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev) genlmsg_multicast(msg, 0, nl80211_config_mcgrp.id, GFP_KERNEL); } +static int nl80211_send_scan_donemsg(struct sk_buff *msg, + struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u32 pid, u32 seq, int flags, + u32 cmd) +{ + void *hdr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, cmd); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + + /* XXX: we should probably bounce back the request? */ + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_donemsg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_donemsg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_SCAN_ABORTED) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +/* + * This can happen on global regulatory changes or device specific settings + * based on custom world regulatory domains. + */ +void nl80211_send_reg_change_event(struct regulatory_request *request) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* Userspace can always count this one always being set */ + NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator); + + if (request->alpha2[0] == '0' && request->alpha2[1] == '0') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_WORLD); + else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_CUSTOM_WORLD); + else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') || + request->intersect) + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_INTERSECTION); + else { + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_COUNTRY); + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2); + } + + if (wiphy_idx_valid(request->wiphy_idx)) + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + /* initialisation/exit functions */ int nl80211_init(void) @@ -2351,6 +2907,14 @@ int nl80211_init(void) if (err) goto err_out; + err = genl_register_mc_group(&nl80211_fam, &nl80211_scan_mcgrp); + if (err) + goto err_out; + + err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp); + if (err) + goto err_out; + return 0; err_out: genl_unregister_family(&nl80211_fam); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index f3ea5c029aee..e65a3c38c52f 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -7,6 +7,11 @@ extern int nl80211_init(void); extern void nl80211_exit(void); extern void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); +extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +extern void nl80211_send_reg_change_event(struct regulatory_request *request); #else static inline int nl80211_init(void) { @@ -19,6 +24,18 @@ static inline void nl80211_notify_dev_rename( struct cfg80211_registered_device *rdev) { } +static inline void +nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{} +static inline void nl80211_send_scan_aborted( + struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{} +static inline void +nl80211_send_reg_change_event(struct regulatory_request *request) +{ +} #endif /* CONFIG_NL80211 */ #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bd0a16c3de5e..eb8b8ed16155 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -41,38 +41,7 @@ #include <net/cfg80211.h> #include "core.h" #include "reg.h" - -/** - * struct regulatory_request - receipt of last regulatory request - * - * @wiphy: this is set if this request's initiator is - * %REGDOM_SET_BY_COUNTRY_IE or %REGDOM_SET_BY_DRIVER. This - * can be used by the wireless core to deal with conflicts - * and potentially inform users of which devices specifically - * cased the conflicts. - * @initiator: indicates who sent this request, could be any of - * of those set in reg_set_by, %REGDOM_SET_BY_* - * @alpha2: the ISO / IEC 3166 alpha2 country code of the requested - * regulatory domain. We have a few special codes: - * 00 - World regulatory domain - * 99 - built by driver but a specific alpha2 cannot be determined - * 98 - result of an intersection between two regulatory domains - * @intersect: indicates whether the wireless core should intersect - * the requested regulatory domain with the presently set regulatory - * domain. - * @country_ie_checksum: checksum of the last processed and accepted - * country IE - * @country_ie_env: lets us know if the AP is telling us we are outdoor, - * indoor, or if it doesn't matter - */ -struct regulatory_request { - struct wiphy *wiphy; - enum reg_set_by initiator; - char alpha2[2]; - bool intersect; - u32 country_ie_checksum; - enum environment_cap country_ie_env; -}; +#include "nl80211.h" /* Receipt of information from last regulatory request */ static struct regulatory_request *last_request; @@ -86,22 +55,63 @@ static u32 supported_bandwidths[] = { MHZ_TO_KHZ(20), }; -/* Central wireless core regulatory domains, we only need two, +/* + * Central wireless core regulatory domains, we only need two, * the current one and a world regulatory domain in case we have no - * information to give us an alpha2 */ -static const struct ieee80211_regdomain *cfg80211_regdomain; + * information to give us an alpha2 + */ +const struct ieee80211_regdomain *cfg80211_regdomain; -/* We use this as a place for the rd structure built from the +/* + * We use this as a place for the rd structure built from the * last parsed country IE to rest until CRDA gets back to us with - * what it thinks should apply for the same country */ + * what it thinks should apply for the same country + */ static const struct ieee80211_regdomain *country_ie_regdomain; +/* Used to queue up regulatory hints */ +static LIST_HEAD(reg_requests_list); +static spinlock_t reg_requests_lock; + +/* Used to queue up beacon hints for review */ +static LIST_HEAD(reg_pending_beacons); +static spinlock_t reg_pending_beacons_lock; + +/* Used to keep track of processed beacon hints */ +static LIST_HEAD(reg_beacon_list); + +struct reg_beacon { + struct list_head list; + struct ieee80211_channel chan; +}; + /* We keep a static world regulatory domain in case of the absence of CRDA */ static const struct ieee80211_regdomain world_regdom = { - .n_reg_rules = 1, + .n_reg_rules = 5, .alpha2 = "00", .reg_rules = { - REG_RULE(2412-10, 2462+10, 40, 6, 20, + /* IEEE 802.11b/g, channels 1..11 */ + REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), + /* IEEE 802.11b/g, channels 12..13. No HT40 + * channel fits here. */ + REG_RULE(2467-10, 2472+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + /* IEEE 802.11 channel 14 - Only JP enables + * this and for 802.11b only */ + REG_RULE(2484-10, 2484+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS | + NL80211_RRF_NO_OFDM), + /* IEEE 802.11a, channel 36..48 */ + REG_RULE(5180-10, 5240+10, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + + /* NB: 5260 MHz - 5700 MHz requies DFS */ + + /* IEEE 802.11a, channel 149..165 */ + REG_RULE(5745-10, 5825+10, 40, 6, 20, NL80211_RRF_PASSIVE_SCAN | NL80211_RRF_NO_IBSS), } @@ -115,9 +125,11 @@ static char *ieee80211_regdom = "US"; module_param(ieee80211_regdom, charp, 0444); MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); -/* We assume 40 MHz bandwidth for the old regulatory work. +/* + * We assume 40 MHz bandwidth for the old regulatory work. * We make emphasis we are using the exact same frequencies - * as before */ + * as before + */ static const struct ieee80211_regdomain us_regdom = { .n_reg_rules = 6, @@ -156,8 +168,10 @@ static const struct ieee80211_regdomain jp_regdom = { static const struct ieee80211_regdomain eu_regdom = { .n_reg_rules = 6, - /* This alpha2 is bogus, we leave it here just for stupid - * backward compatibility */ + /* + * This alpha2 is bogus, we leave it here just for stupid + * backward compatibility + */ .alpha2 = "EU", .reg_rules = { /* IEEE 802.11b/g, channels 1..13 */ @@ -226,8 +240,10 @@ static void reset_regdomains(void) cfg80211_regdomain = NULL; } -/* Dynamic world regulatory domain requested by the wireless - * core upon initialization */ +/* + * Dynamic world regulatory domain requested by the wireless + * core upon initialization + */ static void update_world_regdomain(const struct ieee80211_regdomain *rd) { BUG_ON(!last_request); @@ -268,8 +284,10 @@ static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) return false; - /* Special case where regulatory domain was built by driver - * but a specific alpha2 cannot be determined */ + /* + * Special case where regulatory domain was built by driver + * but a specific alpha2 cannot be determined + */ if (alpha2[0] == '9' && alpha2[1] == '9') return true; return false; @@ -279,9 +297,11 @@ static bool is_intersected_alpha2(const char *alpha2) { if (!alpha2) return false; - /* Special case where regulatory domain is the + /* + * Special case where regulatory domain is the * result of an intersection between two regulatory domain - * structures */ + * structures + */ if (alpha2[0] == '9' && alpha2[1] == '8') return true; return false; @@ -306,8 +326,10 @@ static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) return false; } -static bool regdom_changed(const char *alpha2) +static bool regdom_changes(const char *alpha2) { + assert_cfg80211_lock(); + if (!cfg80211_regdomain) return true; if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) @@ -334,8 +356,10 @@ static bool country_ie_integrity_changes(u32 checksum) return false; } -/* This lets us keep regulatory code which is updated on a regulatory - * basis in userspace. */ +/* + * This lets us keep regulatory code which is updated on a regulatory + * basis in userspace. + */ static int call_crda(const char *alpha2) { char country_env[9 + 2] = "COUNTRY="; @@ -447,10 +471,12 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, #undef ONE_GHZ_IN_KHZ } -/* Converts a country IE to a regulatory domain. A regulatory domain +/* + * Converts a country IE to a regulatory domain. A regulatory domain * structure has a lot of information which the IE doesn't yet have, * so for the other values we use upper max values as we will intersect - * with our userspace regulatory agent to get lower bounds. */ + * with our userspace regulatory agent to get lower bounds. + */ static struct ieee80211_regdomain *country_ie_2_rd( u8 *country_ie, u8 country_ie_len, @@ -495,9 +521,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( *checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8); - /* We need to build a reg rule for each triplet, but first we must + /* + * We need to build a reg rule for each triplet, but first we must * calculate the number of reg rules we will need. We will need one - * for each channel subband */ + * for each channel subband + */ while (country_ie_len >= 3) { int end_channel = 0; struct ieee80211_country_ie_triplet *triplet = @@ -535,9 +563,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( if (cur_sub_max_channel < cur_channel) return NULL; - /* Do not allow overlapping channels. Also channels + /* + * Do not allow overlapping channels. Also channels * passed in each subband must be monotonically - * increasing */ + * increasing + */ if (last_sub_max_channel) { if (cur_channel <= last_sub_max_channel) return NULL; @@ -545,10 +575,12 @@ static struct ieee80211_regdomain *country_ie_2_rd( return NULL; } - /* When dot11RegulatoryClassesRequired is supported + /* + * When dot11RegulatoryClassesRequired is supported * we can throw ext triplets as part of this soup, * for now we don't care when those change as we - * don't support them */ + * don't support them + */ *checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) | ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) | ((triplet->chans.max_power ^ cur_sub_max_channel) << 24); @@ -559,8 +591,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( country_ie_len -= 3; num_rules++; - /* Note: this is not a IEEE requirement but - * simply a memory requirement */ + /* + * Note: this is not a IEEE requirement but + * simply a memory requirement + */ if (num_rules > NL80211_MAX_SUPP_REG_RULES) return NULL; } @@ -588,8 +622,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( struct ieee80211_freq_range *freq_range = NULL; struct ieee80211_power_rule *power_rule = NULL; - /* Must parse if dot11RegulatoryClassesRequired is true, - * we don't support this yet */ + /* + * Must parse if dot11RegulatoryClassesRequired is true, + * we don't support this yet + */ if (triplet->ext.reg_extension_id >= IEEE80211_COUNTRY_EXTENSION_ID) { country_ie += 3; @@ -611,10 +647,12 @@ static struct ieee80211_regdomain *country_ie_2_rd( end_channel = triplet->chans.first_channel + (4 * (triplet->chans.num_channels - 1)); - /* The +10 is since the regulatory domain expects + /* + * The +10 is since the regulatory domain expects * the actual band edge, not the center of freq for * its start and end freqs, assuming 20 MHz bandwidth on - * the channels passed */ + * the channels passed + */ freq_range->start_freq_khz = MHZ_TO_KHZ(ieee80211_channel_to_frequency( triplet->chans.first_channel) - 10); @@ -622,9 +660,11 @@ static struct ieee80211_regdomain *country_ie_2_rd( MHZ_TO_KHZ(ieee80211_channel_to_frequency( end_channel) + 10); - /* Large arbitrary values, we intersect later */ - /* Increment this if we ever support >= 40 MHz channels - * in IEEE 802.11 */ + /* + * These are large arbitrary values we use to intersect later. + * Increment this if we ever support >= 40 MHz channels + * in IEEE 802.11 + */ freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40); power_rule->max_antenna_gain = DBI_TO_MBI(100); power_rule->max_eirp = DBM_TO_MBM(100); @@ -640,8 +680,10 @@ static struct ieee80211_regdomain *country_ie_2_rd( } -/* Helper for regdom_intersect(), this does the real - * mathematical intersection fun */ +/* + * Helper for regdom_intersect(), this does the real + * mathematical intersection fun + */ static int reg_rules_intersect( const struct ieee80211_reg_rule *rule1, const struct ieee80211_reg_rule *rule2, @@ -719,11 +761,13 @@ static struct ieee80211_regdomain *regdom_intersect( if (!rd1 || !rd2) return NULL; - /* First we get a count of the rules we'll need, then we actually + /* + * First we get a count of the rules we'll need, then we actually * build them. This is to so we can malloc() and free() a * regdomain once. The reason we use reg_rules_intersect() here * is it will return -EINVAL if the rule computed makes no sense. - * All rules that do check out OK are valid. */ + * All rules that do check out OK are valid. + */ for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; @@ -751,14 +795,18 @@ static struct ieee80211_regdomain *regdom_intersect( rule1 = &rd1->reg_rules[x]; for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - /* This time around instead of using the stack lets + /* + * This time around instead of using the stack lets * write to the target rule directly saving ourselves - * a memcpy() */ + * a memcpy() + */ intersected_rule = &rd->reg_rules[rule_idx]; r = reg_rules_intersect(rule1, rule2, intersected_rule); - /* No need to memset here the intersected rule here as - * we're not using the stack anymore */ + /* + * No need to memset here the intersected rule here as + * we're not using the stack anymore + */ if (r) continue; rule_idx++; @@ -777,8 +825,10 @@ static struct ieee80211_regdomain *regdom_intersect( return rd; } -/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may - * want to just have the channel structure use these */ +/* + * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may + * want to just have the channel structure use these + */ static u32 map_regdom_flags(u32 rd_flags) { u32 channel_flags = 0; @@ -791,48 +841,45 @@ static u32 map_regdom_flags(u32 rd_flags) return channel_flags; } -/** - * freq_reg_info - get regulatory information for the given frequency - * @center_freq: Frequency in KHz for which we want regulatory information for - * @bandwidth: the bandwidth requirement you have in KHz, if you do not have one - * you can set this to 0. If this frequency is allowed we then set - * this value to the maximum allowed bandwidth. - * @reg_rule: the regulatory rule which we have for this frequency - * - * Use this function to get the regulatory rule for a specific frequency on - * a given wireless device. If the device has a specific regulatory domain - * it wants to follow we respect that unless a country IE has been received - * and processed already. - * - * Returns 0 if it was able to find a valid regulatory rule which does - * apply to the given center_freq otherwise it returns non-zero. It will - * also return -ERANGE if we determine the given center_freq does not even have - * a regulatory rule for a frequency range in the center_freq's band. See - * freq_in_rule_band() for our current definition of a band -- this is purely - * subjective and right now its 802.11 specific. - */ -static int freq_reg_info(u32 center_freq, u32 *bandwidth, - const struct ieee80211_reg_rule **reg_rule) +static int freq_reg_info_regd(struct wiphy *wiphy, + u32 center_freq, + u32 *bandwidth, + const struct ieee80211_reg_rule **reg_rule, + const struct ieee80211_regdomain *custom_regd) { int i; bool band_rule_found = false; + const struct ieee80211_regdomain *regd; u32 max_bandwidth = 0; - if (!cfg80211_regdomain) + regd = custom_regd ? custom_regd : cfg80211_regdomain; + + /* + * Follow the driver's regulatory domain, if present, unless a country + * IE has been processed or a user wants to help complaince further + */ + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + last_request->initiator != NL80211_REGDOM_SET_BY_USER && + wiphy->regd) + regd = wiphy->regd; + + if (!regd) return -EINVAL; - for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + for (i = 0; i < regd->n_reg_rules; i++) { const struct ieee80211_reg_rule *rr; const struct ieee80211_freq_range *fr = NULL; const struct ieee80211_power_rule *pr = NULL; - rr = &cfg80211_regdomain->reg_rules[i]; + rr = ®d->reg_rules[i]; fr = &rr->freq_range; pr = &rr->power_rule; - /* We only need to know if one frequency rule was + /* + * We only need to know if one frequency rule was * was in center_freq's band, that's enough, so lets - * not overwrite it once found */ + * not overwrite it once found + */ if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); @@ -850,6 +897,14 @@ static int freq_reg_info(u32 center_freq, u32 *bandwidth, return !max_bandwidth; } +EXPORT_SYMBOL(freq_reg_info); + +int freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 *bandwidth, + const struct ieee80211_reg_rule **reg_rule) +{ + return freq_reg_info_regd(wiphy, center_freq, + bandwidth, reg_rule, NULL); +} static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, unsigned int chan_idx) @@ -861,6 +916,11 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, const struct ieee80211_power_rule *power_rule = NULL; struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; + struct wiphy *request_wiphy = NULL; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); sband = wiphy->bands[band]; BUG_ON(chan_idx >= sband->n_channels); @@ -868,11 +928,12 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, flags = chan->orig_flags; - r = freq_reg_info(MHZ_TO_KHZ(chan->center_freq), + r = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq), &max_bandwidth, ®_rule); if (r) { - /* This means no regulatory rule was found in the country IE + /* + * This means no regulatory rule was found in the country IE * with a frequency range on the center_freq's band, since * IEEE-802.11 allows for a country IE to have a subset of the * regulatory information provided in a country we ignore @@ -883,7 +944,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, * http://tinyurl.com/11d-clarification */ if (r == -ERANGE && - last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { + last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { #ifdef CONFIG_CFG80211_REG_DEBUG printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz " "intact on %s - no rule found in band on " @@ -891,10 +953,13 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, chan->center_freq, wiphy_name(wiphy)); #endif } else { - /* In this case we know the country IE has at least one reg rule - * for the band so we respect its band definitions */ + /* + * In this case we know the country IE has at least one reg rule + * for the band so we respect its band definitions + */ #ifdef CONFIG_CFG80211_REG_DEBUG - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) printk(KERN_DEBUG "cfg80211: Disabling " "channel %d MHz on %s due to " "Country IE\n", @@ -908,6 +973,24 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band, power_rule = ®_rule->power_rule; + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + request_wiphy && request_wiphy == wiphy && + request_wiphy->strict_regulatory) { + /* + * This gaurantees the driver's requested regulatory domain + * will always be used as a base for further regulatory + * settings + */ + chan->flags = chan->orig_flags = + map_regdom_flags(reg_rule->flags); + chan->max_antenna_gain = chan->orig_mag = + (int) MBI_TO_DBI(power_rule->max_antenna_gain); + chan->max_bandwidth = KHZ_TO_MHZ(max_bandwidth); + chan->max_power = chan->orig_mpwr = + (int) MBM_TO_DBM(power_rule->max_eirp); + return; + } + chan->flags = flags | map_regdom_flags(reg_rule->flags); chan->max_antenna_gain = min(chan->orig_mag, (int) MBI_TO_DBI(power_rule->max_antenna_gain)); @@ -931,132 +1014,407 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band) handle_channel(wiphy, band, i); } -static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby) +static bool ignore_reg_update(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { if (!last_request) return true; - if (setby == REGDOM_SET_BY_CORE && - wiphy->fw_handles_regulatory) + if (initiator == NL80211_REGDOM_SET_BY_CORE && + wiphy->custom_regulatory) + return true; + /* + * wiphy->regd will be set once the device has its own + * desired regulatory domain set + */ + if (wiphy->strict_regulatory && !wiphy->regd && + !is_world_regdom(last_request->alpha2)) return true; return false; } -static void update_all_wiphy_regulatory(enum reg_set_by setby) +static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) { struct cfg80211_registered_device *drv; list_for_each_entry(drv, &cfg80211_drv_list, list) - if (!ignore_reg_update(&drv->wiphy, setby)) - wiphy_update_regulatory(&drv->wiphy, setby); + wiphy_update_regulatory(&drv->wiphy, initiator); +} + +static void handle_reg_beacon(struct wiphy *wiphy, + unsigned int chan_idx, + struct reg_beacon *reg_beacon) +{ +#ifdef CONFIG_CFG80211_REG_DEBUG +#define REG_DEBUG_BEACON_FLAG(desc) \ + printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \ + "frequency: %d MHz (Ch %d) on %s\n", \ + reg_beacon->chan.center_freq, \ + ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \ + wiphy_name(wiphy)); +#else +#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0) +#endif + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + assert_cfg80211_lock(); + + sband = wiphy->bands[reg_beacon->chan.band]; + chan = &sband->channels[chan_idx]; + + if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + return; + + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) { + chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; + REG_DEBUG_BEACON_FLAG("active scanning"); + } + + if (chan->flags & IEEE80211_CHAN_NO_IBSS) { + chan->flags &= ~IEEE80211_CHAN_NO_IBSS; + REG_DEBUG_BEACON_FLAG("beaconing"); + } + + chan->beacon_found = true; +#undef REG_DEBUG_BEACON_FLAG } -void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) +/* + * Called when a scan on a wiphy finds a beacon on + * new channel + */ +static void wiphy_update_new_beacon(struct wiphy *wiphy, + struct reg_beacon *reg_beacon) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + assert_cfg80211_lock(); + + if (!wiphy->bands[reg_beacon->chan.band]) + return; + + sband = wiphy->bands[reg_beacon->chan.band]; + + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); +} + +/* + * Called upon reg changes or a new wiphy is added + */ +static void wiphy_update_beacon_reg(struct wiphy *wiphy) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + struct reg_beacon *reg_beacon; + + assert_cfg80211_lock(); + + if (list_empty(®_beacon_list)) + return; + + list_for_each_entry(reg_beacon, ®_beacon_list, list) { + if (!wiphy->bands[reg_beacon->chan.band]) + continue; + sband = wiphy->bands[reg_beacon->chan.band]; + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); + } +} + +static bool reg_is_world_roaming(struct wiphy *wiphy) +{ + if (is_world_regdom(cfg80211_regdomain->alpha2) || + (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))) + return true; + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->custom_regulatory) + return true; + return false; +} + +/* Reap the advantages of previously found beacons */ +static void reg_process_beacons(struct wiphy *wiphy) +{ + if (!reg_is_world_roaming(wiphy)) + return; + wiphy_update_beacon_reg(wiphy); +} + +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) { enum ieee80211_band band; + + if (ignore_reg_update(wiphy, initiator)) + goto out; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (wiphy->bands[band]) handle_band(wiphy, band); - if (wiphy->reg_notifier) - wiphy->reg_notifier(wiphy, setby); } +out: + reg_process_beacons(wiphy); + if (wiphy->reg_notifier) + wiphy->reg_notifier(wiphy, last_request); +} + +static void handle_channel_custom(struct wiphy *wiphy, + enum ieee80211_band band, + unsigned int chan_idx, + const struct ieee80211_regdomain *regd) +{ + int r; + u32 max_bandwidth = 0; + const struct ieee80211_reg_rule *reg_rule = NULL; + const struct ieee80211_power_rule *power_rule = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + sband = wiphy->bands[band]; + BUG_ON(chan_idx >= sband->n_channels); + chan = &sband->channels[chan_idx]; + + r = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq), + &max_bandwidth, ®_rule, regd); + + if (r) { + chan->flags = IEEE80211_CHAN_DISABLED; + return; + } + + power_rule = ®_rule->power_rule; + + chan->flags |= map_regdom_flags(reg_rule->flags); + chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain); + chan->max_bandwidth = KHZ_TO_MHZ(max_bandwidth); + chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); } -/* Return value which can be used by ignore_request() to indicate - * it has been determined we should intersect two regulatory domains */ +static void handle_band_custom(struct wiphy *wiphy, enum ieee80211_band band, + const struct ieee80211_regdomain *regd) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + BUG_ON(!wiphy->bands[band]); + sband = wiphy->bands[band]; + + for (i = 0; i < sband->n_channels; i++) + handle_channel_custom(wiphy, band, i, regd); +} + +/* Used by drivers prior to wiphy registration */ +void wiphy_apply_custom_regulatory(struct wiphy *wiphy, + const struct ieee80211_regdomain *regd) +{ + enum ieee80211_band band; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (wiphy->bands[band]) + handle_band_custom(wiphy, band, regd); + } +} +EXPORT_SYMBOL(wiphy_apply_custom_regulatory); + +static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd, + const struct ieee80211_regdomain *src_regd) +{ + struct ieee80211_regdomain *regd; + int size_of_regd = 0; + unsigned int i; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + ((src_regd->n_reg_rules + 1) * sizeof(struct ieee80211_reg_rule)); + + regd = kzalloc(size_of_regd, GFP_KERNEL); + if (!regd) + return -ENOMEM; + + memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); + + for (i = 0; i < src_regd->n_reg_rules; i++) + memcpy(®d->reg_rules[i], &src_regd->reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + + *dst_regd = regd; + return 0; +} + +/* + * Return value which can be used by ignore_request() to indicate + * it has been determined we should intersect two regulatory domains + */ #define REG_INTERSECT 1 /* This has the logic which determines when a new request * should be ignored. */ -static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2) +static int ignore_request(struct wiphy *wiphy, + struct regulatory_request *pending_request) { + struct wiphy *last_wiphy = NULL; + + assert_cfg80211_lock(); + /* All initial requests are respected */ if (!last_request) return 0; - switch (set_by) { - case REGDOM_SET_BY_INIT: + switch (pending_request->initiator) { + case NL80211_REGDOM_SET_BY_CORE: return -EINVAL; - case REGDOM_SET_BY_CORE: - /* - * Always respect new wireless core hints, should only happen - * when updating the world regulatory domain at init. - */ - return 0; - case REGDOM_SET_BY_COUNTRY_IE: - if (unlikely(!is_an_alpha2(alpha2))) + case NL80211_REGDOM_SET_BY_COUNTRY_IE: + + last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (unlikely(!is_an_alpha2(pending_request->alpha2))) return -EINVAL; - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { - if (last_request->wiphy != wiphy) { + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (last_wiphy != wiphy) { /* * Two cards with two APs claiming different * different Country IE alpha2s. We could * intersect them, but that seems unlikely * to be correct. Reject second one for now. */ - if (!alpha2_equal(alpha2, - cfg80211_regdomain->alpha2)) + if (regdom_changes(pending_request->alpha2)) return -EOPNOTSUPP; return -EALREADY; } - /* Two consecutive Country IE hints on the same wiphy. - * This should be picked up early by the driver/stack */ - if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2, - alpha2))) + /* + * Two consecutive Country IE hints on the same wiphy. + * This should be picked up early by the driver/stack + */ + if (WARN_ON(regdom_changes(pending_request->alpha2))) return 0; return -EALREADY; } return REG_INTERSECT; - case REGDOM_SET_BY_DRIVER: - if (last_request->initiator == REGDOM_SET_BY_DRIVER) + case NL80211_REGDOM_SET_BY_DRIVER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { + if (is_old_static_regdom(cfg80211_regdomain)) + return 0; + if (regdom_changes(pending_request->alpha2)) + return 0; return -EALREADY; - return 0; - case REGDOM_SET_BY_USER: - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + } + + /* + * This would happen if you unplug and plug your card + * back in or if you add a new device for which the previously + * loaded card also agrees on the regulatory domain. + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !regdom_changes(pending_request->alpha2)) + return -EALREADY; + + return REG_INTERSECT; + case NL80211_REGDOM_SET_BY_USER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) return REG_INTERSECT; - /* If the user knows better the user should set the regdom - * to their country before the IE is picked up */ - if (last_request->initiator == REGDOM_SET_BY_USER && + /* + * If the user knows better the user should set the regdom + * to their country before the IE is picked up + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER && last_request->intersect) return -EOPNOTSUPP; + /* + * Process user requests only after previous user/driver/core + * requests have been processed + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE || + last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || + last_request->initiator == NL80211_REGDOM_SET_BY_USER) { + if (regdom_changes(last_request->alpha2)) + return -EAGAIN; + } + + if (!is_old_static_regdom(cfg80211_regdomain) && + !regdom_changes(pending_request->alpha2)) + return -EALREADY; + return 0; } return -EINVAL; } -/* Caller must hold &cfg80211_drv_mutex */ -int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, - u32 country_ie_checksum, - enum environment_cap env) +/** + * __regulatory_hint - hint to the wireless core a regulatory domain + * @wiphy: if the hint comes from country information from an AP, this + * is required to be set to the wiphy that received the information + * @pending_request: the regulatory request currently being processed + * + * The Wireless subsystem can use this function to hint to the wireless core + * what it believes should be the current regulatory domain. + * + * Returns zero if all went fine, %-EALREADY if a regulatory domain had + * already been set or other standard error codes. + * + * Caller must hold &cfg80211_mutex + */ +static int __regulatory_hint(struct wiphy *wiphy, + struct regulatory_request *pending_request) { - struct regulatory_request *request; bool intersect = false; int r = 0; - r = ignore_request(wiphy, set_by, alpha2); + assert_cfg80211_lock(); + + r = ignore_request(wiphy, pending_request); - if (r == REG_INTERSECT) + if (r == REG_INTERSECT) { + if (pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { + r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); + if (r) { + kfree(pending_request); + return r; + } + } intersect = true; - else if (r) + } else if (r) { + /* + * If the regulatory domain being requested by the + * driver has already been set just copy it to the + * wiphy + */ + if (r == -EALREADY && + pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { + r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); + if (r) { + kfree(pending_request); + return r; + } + r = -EALREADY; + goto new_request; + } + kfree(pending_request); return r; + } - request = kzalloc(sizeof(struct regulatory_request), - GFP_KERNEL); - if (!request) - return -ENOMEM; +new_request: + kfree(last_request); - request->alpha2[0] = alpha2[0]; - request->alpha2[1] = alpha2[1]; - request->initiator = set_by; - request->wiphy = wiphy; - request->intersect = intersect; - request->country_ie_checksum = country_ie_checksum; - request->country_ie_env = env; + last_request = pending_request; + last_request->intersect = intersect; + + pending_request = NULL; + + /* When r == REG_INTERSECT we do need to call CRDA */ + if (r < 0) { + /* + * Since CRDA will not be called in this case as we already + * have applied the requested regulatory domain before we just + * inform userspace we have processed the request + */ + if (r == -EALREADY) + nl80211_send_reg_change_event(last_request); + return r; + } - kfree(last_request); - last_request = request; /* * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled * AND if CRDA is NOT present nothing will happen, if someone @@ -1067,29 +1425,194 @@ int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, * * to intersect with the static rd */ - return call_crda(alpha2); + return call_crda(last_request->alpha2); +} + +/* This currently only processes user and driver regulatory hints */ +static void reg_process_hint(struct regulatory_request *reg_request) +{ + int r = 0; + struct wiphy *wiphy = NULL; + + BUG_ON(!reg_request->alpha2); + + mutex_lock(&cfg80211_mutex); + + if (wiphy_idx_valid(reg_request->wiphy_idx)) + wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); + + if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !wiphy) { + kfree(reg_request); + goto out; + } + + r = __regulatory_hint(wiphy, reg_request); + /* This is required so that the orig_* parameters are saved */ + if (r == -EALREADY && wiphy && wiphy->strict_regulatory) + wiphy_update_regulatory(wiphy, reg_request->initiator); +out: + mutex_unlock(&cfg80211_mutex); } -void regulatory_hint(struct wiphy *wiphy, const char *alpha2) +/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */ +static void reg_process_pending_hints(void) + { + struct regulatory_request *reg_request; + + spin_lock(®_requests_lock); + while (!list_empty(®_requests_list)) { + reg_request = list_first_entry(®_requests_list, + struct regulatory_request, + list); + list_del_init(®_request->list); + + spin_unlock(®_requests_lock); + reg_process_hint(reg_request); + spin_lock(®_requests_lock); + } + spin_unlock(®_requests_lock); +} + +/* Processes beacon hints -- this has nothing to do with country IEs */ +static void reg_process_pending_beacon_hints(void) { + struct cfg80211_registered_device *drv; + struct reg_beacon *pending_beacon, *tmp; + + mutex_lock(&cfg80211_mutex); + + /* This goes through the _pending_ beacon list */ + spin_lock_bh(®_pending_beacons_lock); + + if (list_empty(®_pending_beacons)) { + spin_unlock_bh(®_pending_beacons_lock); + goto out; + } + + list_for_each_entry_safe(pending_beacon, tmp, + ®_pending_beacons, list) { + + list_del_init(&pending_beacon->list); + + /* Applies the beacon hint to current wiphys */ + list_for_each_entry(drv, &cfg80211_drv_list, list) + wiphy_update_new_beacon(&drv->wiphy, pending_beacon); + + /* Remembers the beacon hint for new wiphys or reg changes */ + list_add_tail(&pending_beacon->list, ®_beacon_list); + } + + spin_unlock_bh(®_pending_beacons_lock); +out: + mutex_unlock(&cfg80211_mutex); +} + +static void reg_todo(struct work_struct *work) +{ + reg_process_pending_hints(); + reg_process_pending_beacon_hints(); +} + +static DECLARE_WORK(reg_work, reg_todo); + +static void queue_regulatory_request(struct regulatory_request *request) +{ + spin_lock(®_requests_lock); + list_add_tail(&request->list, ®_requests_list); + spin_unlock(®_requests_lock); + + schedule_work(®_work); +} + +/* Core regulatory hint -- happens once during cfg80211_init() */ +static int regulatory_hint_core(const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(last_request); + + request = kzalloc(sizeof(struct regulatory_request), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_CORE; + + queue_regulatory_request(request); + + return 0; +} + +/* User hints */ +int regulatory_hint_user(const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(!alpha2); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = WIPHY_IDX_STALE; + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_USER, + + queue_regulatory_request(request); + + return 0; +} + +/* Driver hints */ +int regulatory_hint(struct wiphy *wiphy, const char *alpha2) +{ + struct regulatory_request *request; + BUG_ON(!alpha2); + BUG_ON(!wiphy); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; - mutex_lock(&cfg80211_drv_mutex); - __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2, 0, ENVIRON_ANY); - mutex_unlock(&cfg80211_drv_mutex); + request->wiphy_idx = get_wiphy_idx(wiphy); + + /* Must have registered wiphy first */ + BUG_ON(!wiphy_idx_valid(request->wiphy_idx)); + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_DRIVER; + + queue_regulatory_request(request); + + return 0; } EXPORT_SYMBOL(regulatory_hint); static bool reg_same_country_ie_hint(struct wiphy *wiphy, u32 country_ie_checksum) { - if (!last_request->wiphy) + struct wiphy *request_wiphy; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (!request_wiphy) return false; - if (likely(last_request->wiphy != wiphy)) + + if (likely(request_wiphy != wiphy)) return !country_ie_integrity_changes(country_ie_checksum); - /* We should not have let these through at this point, they + /* + * We should not have let these through at this point, they * should have been picked up earlier by the first alpha2 check - * on the device */ + * on the device + */ if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum))) return true; return false; @@ -1103,11 +1626,14 @@ void regulatory_hint_11d(struct wiphy *wiphy, char alpha2[2]; u32 checksum = 0; enum environment_cap env = ENVIRON_ANY; + struct regulatory_request *request; - if (!last_request) - return; + mutex_lock(&cfg80211_mutex); - mutex_lock(&cfg80211_drv_mutex); + if (unlikely(!last_request)) { + mutex_unlock(&cfg80211_mutex); + return; + } /* IE len must be evenly divisible by 2 */ if (country_ie_len & 0x01) @@ -1116,9 +1642,11 @@ void regulatory_hint_11d(struct wiphy *wiphy, if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) goto out; - /* Pending country IE processing, this can happen after we + /* + * Pending country IE processing, this can happen after we * call CRDA and wait for a response if a beacon was received before - * we were able to process the last regulatory_hint_11d() call */ + * we were able to process the last regulatory_hint_11d() call + */ if (country_ie_regdomain) goto out; @@ -1130,33 +1658,44 @@ void regulatory_hint_11d(struct wiphy *wiphy, else if (country_ie[2] == 'O') env = ENVIRON_OUTDOOR; - /* We will run this for *every* beacon processed for the BSSID, so + /* + * We will run this for *every* beacon processed for the BSSID, so * we optimize an early check to exit out early if we don't have to - * do anything */ - if (likely(last_request->wiphy)) { + * do anything + */ + if (likely(wiphy_idx_valid(last_request->wiphy_idx))) { struct cfg80211_registered_device *drv_last_ie; - drv_last_ie = wiphy_to_dev(last_request->wiphy); + drv_last_ie = + cfg80211_drv_by_wiphy_idx(last_request->wiphy_idx); - /* Lets keep this simple -- we trust the first AP - * after we intersect with CRDA */ - if (likely(last_request->wiphy == wiphy)) { - /* Ignore IEs coming in on this wiphy with - * the same alpha2 and environment cap */ + /* + * Lets keep this simple -- we trust the first AP + * after we intersect with CRDA + */ + if (likely(&drv_last_ie->wiphy == wiphy)) { + /* + * Ignore IEs coming in on this wiphy with + * the same alpha2 and environment cap + */ if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, alpha2) && env == drv_last_ie->env)) { goto out; } - /* the wiphy moved on to another BSSID or the AP + /* + * the wiphy moved on to another BSSID or the AP * was reconfigured. XXX: We need to deal with the * case where the user suspends and goes to goes * to another country, and then gets IEs from an - * AP with different settings */ + * AP with different settings + */ goto out; } else { - /* Ignore IEs coming in on two separate wiphys with - * the same alpha2 and environment cap */ + /* + * Ignore IEs coming in on two separate wiphys with + * the same alpha2 and environment cap + */ if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, alpha2) && env == drv_last_ie->env)) { @@ -1171,28 +1710,97 @@ void regulatory_hint_11d(struct wiphy *wiphy, if (!rd) goto out; - /* This will not happen right now but we leave it here for the + /* + * This will not happen right now but we leave it here for the * the future when we want to add suspend/resume support and having * the user move to another country after doing so, or having the user - * move to another AP. Right now we just trust the first AP. This is why - * this is marked as likley(). If we hit this before we add this support - * we want to be informed of it as it would indicate a mistake in the - * current design */ - if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))) - goto out; + * move to another AP. Right now we just trust the first AP. + * + * If we hit this before we add this support we want to be informed of + * it as it would indicate a mistake in the current design + */ + if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum))) + goto free_rd_out; + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + goto free_rd_out; - /* We keep this around for when CRDA comes back with a response so - * we can intersect with that */ + /* + * We keep this around for when CRDA comes back with a response so + * we can intersect with that + */ country_ie_regdomain = rd; - __regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE, - country_ie_regdomain->alpha2, checksum, env); + request->wiphy_idx = get_wiphy_idx(wiphy); + request->alpha2[0] = rd->alpha2[0]; + request->alpha2[1] = rd->alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; + request->country_ie_checksum = checksum; + request->country_ie_env = env; + + mutex_unlock(&cfg80211_mutex); + queue_regulatory_request(request); + + return; + +free_rd_out: + kfree(rd); out: - mutex_unlock(&cfg80211_drv_mutex); + mutex_unlock(&cfg80211_mutex); } EXPORT_SYMBOL(regulatory_hint_11d); +static bool freq_is_chan_12_13_14(u16 freq) +{ + if (freq == ieee80211_channel_to_frequency(12) || + freq == ieee80211_channel_to_frequency(13) || + freq == ieee80211_channel_to_frequency(14)) + return true; + return false; +} + +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp) +{ + struct reg_beacon *reg_beacon; + + if (likely((beacon_chan->beacon_found || + (beacon_chan->flags & IEEE80211_CHAN_RADAR) || + (beacon_chan->band == IEEE80211_BAND_2GHZ && + !freq_is_chan_12_13_14(beacon_chan->center_freq))))) + return 0; + + reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); + if (!reg_beacon) + return -ENOMEM; + +#ifdef CONFIG_CFG80211_REG_DEBUG + printk(KERN_DEBUG "cfg80211: Found new beacon on " + "frequency: %d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, + ieee80211_frequency_to_channel(beacon_chan->center_freq), + wiphy_name(wiphy)); +#endif + memcpy(®_beacon->chan, beacon_chan, + sizeof(struct ieee80211_channel)); + + + /* + * Since we can be called from BH or and non-BH context + * we must use spin_lock_bh() + */ + spin_lock_bh(®_pending_beacons_lock); + list_add_tail(®_beacon->list, ®_pending_beacons); + spin_unlock_bh(®_pending_beacons_lock); + + schedule_work(®_work); + + return 0; +} + static void print_rd_rules(const struct ieee80211_regdomain *rd) { unsigned int i; @@ -1208,8 +1816,10 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) freq_range = ®_rule->freq_range; power_rule = ®_rule->power_rule; - /* There may not be documentation for max antenna gain - * in certain regions */ + /* + * There may not be documentation for max antenna gain + * in certain regions + */ if (power_rule->max_antenna_gain) printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), " "(%d mBi, %d mBm)\n", @@ -1232,13 +1842,13 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) { if (is_intersected_alpha2(rd->alpha2)) { - struct wiphy *wiphy = NULL; - struct cfg80211_registered_device *drv; - if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { - if (last_request->wiphy) { - wiphy = last_request->wiphy; - drv = wiphy_to_dev(wiphy); + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + struct cfg80211_registered_device *drv; + drv = cfg80211_drv_by_wiphy_idx( + last_request->wiphy_idx); + if (drv) { printk(KERN_INFO "cfg80211: Current regulatory " "domain updated by AP to: %c%c\n", drv->country_ie_alpha2[0], @@ -1248,7 +1858,7 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) "domain intersected: \n"); } else printk(KERN_INFO "cfg80211: Current regulatory " - "intersected: \n"); + "domain intersected: \n"); } else if (is_world_regdom(rd->alpha2)) printk(KERN_INFO "cfg80211: World regulatory " "domain updated:\n"); @@ -1304,7 +1914,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) { const struct ieee80211_regdomain *intersected_rd = NULL; struct cfg80211_registered_device *drv = NULL; - struct wiphy *wiphy = NULL; + struct wiphy *request_wiphy; /* Some basic sanity checks first */ if (is_world_regdom(rd->alpha2)) { @@ -1321,23 +1931,27 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) if (!last_request) return -EINVAL; - /* Lets only bother proceeding on the same alpha2 if the current + /* + * Lets only bother proceeding on the same alpha2 if the current * rd is non static (it means CRDA was present and was used last) - * and the pending request came in from a country IE */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { - /* If someone else asked us to change the rd lets only bother - * checking if the alpha2 changes if CRDA was already called */ + * and the pending request came in from a country IE + */ + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { + /* + * If someone else asked us to change the rd lets only bother + * checking if the alpha2 changes if CRDA was already called + */ if (!is_old_static_regdom(cfg80211_regdomain) && - !regdom_changed(rd->alpha2)) + !regdom_changes(rd->alpha2)) return -EINVAL; } - wiphy = last_request->wiphy; - - /* Now lets set the regulatory domain, update all driver channels + /* + * Now lets set the regulatory domain, update all driver channels * and finally inform them of what we have done, in case they want * to review or adjust their own settings based on their own - * internal EEPROM data */ + * internal EEPROM data + */ if (WARN_ON(!reg_is_valid_request(rd->alpha2))) return -EINVAL; @@ -1349,7 +1963,28 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) return -EINVAL; } + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + if (!last_request->intersect) { + int r; + + if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) { + reset_regdomains(); + cfg80211_regdomain = rd; + return 0; + } + + /* + * For a driver hint, lets copy the regulatory domain the + * driver wanted to the wiphy to deal with conflicts + */ + + BUG_ON(request_wiphy->regd); + + r = reg_copy_regd(&request_wiphy->regd, rd); + if (r) + return r; + reset_regdomains(); cfg80211_regdomain = rd; return 0; @@ -1357,14 +1992,22 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) /* Intersection requires a bit more work */ - if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { intersected_rd = regdom_intersect(rd, cfg80211_regdomain); if (!intersected_rd) return -EINVAL; - /* We can trash what CRDA provided now */ - kfree(rd); + /* + * We can trash what CRDA provided now. + * However if a driver requested this specific regulatory + * domain we keep it for its private use + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) + request_wiphy->regd = rd; + else + kfree(rd); + rd = NULL; reset_regdomains(); @@ -1381,8 +2024,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) BUG_ON(!country_ie_regdomain); if (rd != country_ie_regdomain) { - /* Intersect what CRDA returned and our what we - * had built from the Country IE received */ + /* + * Intersect what CRDA returned and our what we + * had built from the Country IE received + */ intersected_rd = regdom_intersect(rd, country_ie_regdomain); @@ -1392,16 +2037,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) kfree(country_ie_regdomain); country_ie_regdomain = NULL; } else { - /* This would happen when CRDA was not present and + /* + * This would happen when CRDA was not present and * OLD_REGULATORY was enabled. We intersect our Country - * IE rd and what was set on cfg80211 originally */ + * IE rd and what was set on cfg80211 originally + */ intersected_rd = regdom_intersect(rd, cfg80211_regdomain); } if (!intersected_rd) return -EINVAL; - drv = wiphy_to_dev(wiphy); + drv = wiphy_to_dev(request_wiphy); drv->country_ie_alpha2[0] = rd->alpha2[0]; drv->country_ie_alpha2[1] = rd->alpha2[1]; @@ -1419,13 +2066,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) } -/* Use this call to set the current regulatory domain. Conflicts with +/* + * Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already - * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */ + * kmalloc'd the rd structure. Caller must hold cfg80211_mutex + */ int set_regdom(const struct ieee80211_regdomain *rd) { int r; + assert_cfg80211_lock(); + /* Note that this doesn't update the wiphys, this is done below */ r = __set_regdom(rd); if (r) { @@ -1442,56 +2093,87 @@ int set_regdom(const struct ieee80211_regdomain *rd) print_regdomain(cfg80211_regdomain); + nl80211_send_reg_change_event(last_request); + return r; } -/* Caller must hold cfg80211_drv_mutex */ +/* Caller must hold cfg80211_mutex */ void reg_device_remove(struct wiphy *wiphy) { - if (!last_request || !last_request->wiphy) + struct wiphy *request_wiphy; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + kfree(wiphy->regd); + if (!last_request || !request_wiphy) return; - if (last_request->wiphy != wiphy) + if (request_wiphy != wiphy) return; - last_request->wiphy = NULL; + last_request->wiphy_idx = WIPHY_IDX_STALE; last_request->country_ie_env = ENVIRON_ANY; } int regulatory_init(void) { - int err; + int err = 0; reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); + spin_lock_init(®_requests_lock); + spin_lock_init(®_pending_beacons_lock); + #ifdef CONFIG_WIRELESS_OLD_REGULATORY cfg80211_regdomain = static_regdom(ieee80211_regdom); printk(KERN_INFO "cfg80211: Using static regulatory domain info\n"); print_regdomain_info(cfg80211_regdomain); - /* The old code still requests for a new regdomain and if + /* + * The old code still requests for a new regdomain and if * you have CRDA you get it updated, otherwise you get * stuck with the static values. We ignore "EU" code as - * that is not a valid ISO / IEC 3166 alpha2 */ + * that is not a valid ISO / IEC 3166 alpha2 + */ if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U') - err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, - ieee80211_regdom, 0, ENVIRON_ANY); + err = regulatory_hint_core(ieee80211_regdom); #else cfg80211_regdomain = cfg80211_world_regdom; - err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY); - if (err) - printk(KERN_ERR "cfg80211: calling CRDA failed - " - "unable to update world regulatory domain, " - "using static definition\n"); + err = regulatory_hint_core("00"); +#endif + if (err) { + if (err == -ENOMEM) + return err; + /* + * N.B. kobject_uevent_env() can fail mainly for when we're out + * memory which is handled and propagated appropriately above + * but it can also fail during a netlink_broadcast() or during + * early boot for call_usermodehelper(). For now treat these + * errors as non-fatal. + */ + printk(KERN_ERR "cfg80211: kobject_uevent_env() was unable " + "to call CRDA during init"); +#ifdef CONFIG_CFG80211_REG_DEBUG + /* We want to find out exactly why when debugging */ + WARN_ON(err); #endif + } return 0; } void regulatory_exit(void) { - mutex_lock(&cfg80211_drv_mutex); + struct regulatory_request *reg_request, *tmp; + struct reg_beacon *reg_beacon, *btmp; + + cancel_work_sync(®_work); + + mutex_lock(&cfg80211_mutex); reset_regdomains(); @@ -1502,5 +2184,33 @@ void regulatory_exit(void) platform_device_unregister(reg_pdev); - mutex_unlock(&cfg80211_drv_mutex); + spin_lock_bh(®_pending_beacons_lock); + if (!list_empty(®_pending_beacons)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + spin_unlock_bh(®_pending_beacons_lock); + + if (!list_empty(®_beacon_list)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + + spin_lock(®_requests_lock); + if (!list_empty(®_requests_list)) { + list_for_each_entry_safe(reg_request, tmp, + ®_requests_list, list) { + list_del(®_request->list); + kfree(reg_request); + } + } + spin_unlock(®_requests_lock); + + mutex_unlock(&cfg80211_mutex); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index a76ea3ff7cd6..e37829a49dc4 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -1,9 +1,13 @@ #ifndef __NET_WIRELESS_REG_H #define __NET_WIRELESS_REG_H +extern const struct ieee80211_regdomain *cfg80211_regdomain; + bool is_world_regdom(const char *alpha2); bool reg_is_valid_request(const char *alpha2); +int regulatory_hint_user(const char *alpha2); + void reg_device_remove(struct wiphy *wiphy); int regulatory_init(void); @@ -11,34 +15,25 @@ void regulatory_exit(void); int set_regdom(const struct ieee80211_regdomain *rd); -enum environment_cap { - ENVIRON_ANY, - ENVIRON_INDOOR, - ENVIRON_OUTDOOR, -}; - - /** - * __regulatory_hint - hint to the wireless core a regulatory domain - * @wiphy: if the hint comes from country information from an AP, this - * is required to be set to the wiphy that received the information - * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain - * should be in. - * @country_ie_checksum: checksum of processed country IE, set this to 0 - * if the hint did not come from a country IE - * @country_ie_env: the environment the IE told us we are in, %ENVIRON_* - * - * The Wireless subsystem can use this function to hint to the wireless core - * what it believes should be the current regulatory domain by giving it an - * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be - * in. + * regulatory_hint_found_beacon - hints a beacon was found on a channel + * @wiphy: the wireless device where the beacon was found on + * @beacon_chan: the channel on which the beacon was found on + * @gfp: context flags * - * Returns zero if all went fine, %-EALREADY if a regulatory domain had - * already been set or other standard error codes. + * This informs the wireless core that a beacon from an AP was found on + * the channel provided. This allows the wireless core to make educated + * guesses on regulatory to help with world roaming. This is only used for + * world roaming -- when we do not know our current location. This is + * only useful on channels 12, 13 and 14 on the 2 GHz band as channels + * 1-11 are already enabled by the world regulatory domain; and on + * non-radar 5 GHz channels. * + * Drivers do not need to call this, cfg80211 will do it for after a scan + * on a newly found BSS. */ -extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, u32 country_ie_checksum, - enum environment_cap country_ie_env); +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp); #endif /* __NET_WIRELESS_REG_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c new file mode 100644 index 000000000000..280dbcd02c15 --- /dev/null +++ b/net/wireless/scan.c @@ -0,0 +1,866 @@ +/* + * cfg80211 scan result handling + * + * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/wireless.h> +#include <linux/nl80211.h> +#include <linux/etherdevice.h> +#include <net/arp.h> +#include <net/cfg80211.h> +#include <net/iw_handler.h> +#include "core.h" +#include "nl80211.h" + +#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ) + +void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +{ + struct net_device *dev; +#ifdef CONFIG_WIRELESS_EXT + union iwreq_data wrqu; +#endif + + dev = dev_get_by_index(&init_net, request->ifidx); + if (!dev) + goto out; + + WARN_ON(request != wiphy_to_dev(request->wiphy)->scan_req); + wiphy_to_dev(request->wiphy)->scan_req = NULL; + + if (aborted) + nl80211_send_scan_aborted(wiphy_to_dev(request->wiphy), dev); + else + nl80211_send_scan_done(wiphy_to_dev(request->wiphy), dev); + +#ifdef CONFIG_WIRELESS_EXT + if (!aborted) { + memset(&wrqu, 0, sizeof(wrqu)); + + wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL); + } +#endif + + dev_put(dev); + + out: + kfree(request); +} +EXPORT_SYMBOL(cfg80211_scan_done); + +static void bss_release(struct kref *ref) +{ + struct cfg80211_internal_bss *bss; + + bss = container_of(ref, struct cfg80211_internal_bss, ref); + if (bss->pub.free_priv) + bss->pub.free_priv(&bss->pub); + kfree(bss); +} + +/* must hold dev->bss_lock! */ +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs) +{ + struct cfg80211_internal_bss *bss; + unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); + + list_for_each_entry(bss, &dev->bss_list, list) { + bss->ts -= age_jiffies; + } +} + +/* must hold dev->bss_lock! */ +void cfg80211_bss_expire(struct cfg80211_registered_device *dev) +{ + struct cfg80211_internal_bss *bss, *tmp; + bool expired = false; + + list_for_each_entry_safe(bss, tmp, &dev->bss_list, list) { + if (!time_after(jiffies, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE)) + continue; + list_del(&bss->list); + rb_erase(&bss->rbn, &dev->bss_tree); + kref_put(&bss->ref, bss_release); + expired = true; + } + + if (expired) + dev->bss_generation++; +} + +static u8 *find_ie(u8 num, u8 *ies, size_t len) +{ + while (len > 2 && ies[0] != num) { + len -= ies[1] + 2; + ies += ies[1] + 2; + } + if (len < 2) + return NULL; + if (len < 2 + ies[1]) + return NULL; + return ies; +} + +static int cmp_ies(u8 num, u8 *ies1, size_t len1, u8 *ies2, size_t len2) +{ + const u8 *ie1 = find_ie(num, ies1, len1); + const u8 *ie2 = find_ie(num, ies2, len2); + int r; + + if (!ie1 && !ie2) + return 0; + if (!ie1) + return -1; + + r = memcmp(ie1 + 2, ie2 + 2, min(ie1[1], ie2[1])); + if (r == 0 && ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + return r; +} + +static bool is_bss(struct cfg80211_bss *a, + const u8 *bssid, + const u8 *ssid, size_t ssid_len) +{ + const u8 *ssidie; + + if (bssid && compare_ether_addr(a->bssid, bssid)) + return false; + + if (!ssid) + return true; + + ssidie = find_ie(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements); + if (!ssidie) + return false; + if (ssidie[1] != ssid_len) + return false; + return memcmp(ssidie + 2, ssid, ssid_len) == 0; +} + +static bool is_mesh(struct cfg80211_bss *a, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + const u8 *ie; + + if (!is_zero_ether_addr(a->bssid)) + return false; + + ie = find_ie(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + if (ie[1] != meshidlen) + return false; + if (memcmp(ie + 2, meshid, meshidlen)) + return false; + + ie = find_ie(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements); + if (ie[1] != IEEE80211_MESH_CONFIG_LEN) + return false; + + /* + * Ignore mesh capability (last two bytes of the IE) when + * comparing since that may differ between stations taking + * part in the same mesh. + */ + return memcmp(ie + 2, meshcfg, IEEE80211_MESH_CONFIG_LEN - 2) == 0; +} + +static int cmp_bss(struct cfg80211_bss *a, + struct cfg80211_bss *b) +{ + int r; + + if (a->channel != b->channel) + return b->channel->center_freq - a->channel->center_freq; + + r = memcmp(a->bssid, b->bssid, ETH_ALEN); + if (r) + return r; + + if (is_zero_ether_addr(a->bssid)) { + r = cmp_ies(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + if (r) + return r; + return cmp_ies(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + } + + return cmp_ies(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); +} + +struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + u16 capa_mask, u16 capa_val) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if ((bss->pub.capability & capa_mask) != capa_val) + continue; + if (channel && bss->pub.channel != channel) + continue; + if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_bss); + +struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (channel && bss->pub.channel != channel) + continue; + if (is_mesh(&bss->pub, meshid, meshidlen, meshcfg)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_mesh); + + +static void rb_insert_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + struct rb_node **p = &dev->bss_tree.rb_node; + struct rb_node *parent = NULL; + struct cfg80211_internal_bss *tbss; + int cmp; + + while (*p) { + parent = *p; + tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); + + cmp = cmp_bss(&bss->pub, &tbss->pub); + + if (WARN_ON(!cmp)) { + /* will sort of leak this BSS */ + return; + } + + if (cmp < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bss->rbn, parent, p); + rb_insert_color(&bss->rbn, &dev->bss_tree); +} + +static struct cfg80211_internal_bss * +rb_find_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res) +{ + struct rb_node *n = dev->bss_tree.rb_node; + struct cfg80211_internal_bss *bss; + int r; + + while (n) { + bss = rb_entry(n, struct cfg80211_internal_bss, rbn); + r = cmp_bss(&res->pub, &bss->pub); + + if (r == 0) + return bss; + else if (r < 0) + n = n->rb_left; + else + n = n->rb_right; + } + + return NULL; +} + +static struct cfg80211_internal_bss * +cfg80211_bss_update(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res, + bool overwrite) +{ + struct cfg80211_internal_bss *found = NULL; + const u8 *meshid, *meshcfg; + + /* + * The reference to "res" is donated to this function. + */ + + if (WARN_ON(!res->pub.channel)) { + kref_put(&res->ref, bss_release); + return NULL; + } + + res->ts = jiffies; + + if (is_zero_ether_addr(res->pub.bssid)) { + /* must be mesh, verify */ + meshid = find_ie(WLAN_EID_MESH_ID, res->pub.information_elements, + res->pub.len_information_elements); + meshcfg = find_ie(WLAN_EID_MESH_CONFIG, + res->pub.information_elements, + res->pub.len_information_elements); + if (!meshid || !meshcfg || + meshcfg[1] != IEEE80211_MESH_CONFIG_LEN) { + /* bogus mesh */ + kref_put(&res->ref, bss_release); + return NULL; + } + } + + spin_lock_bh(&dev->bss_lock); + + found = rb_find_bss(dev, res); + + if (found && overwrite) { + list_replace(&found->list, &res->list); + rb_replace_node(&found->rbn, &res->rbn, + &dev->bss_tree); + kref_put(&found->ref, bss_release); + found = res; + } else if (found) { + kref_get(&found->ref); + found->pub.beacon_interval = res->pub.beacon_interval; + found->pub.tsf = res->pub.tsf; + found->pub.signal = res->pub.signal; + found->pub.capability = res->pub.capability; + found->ts = res->ts; + kref_put(&res->ref, bss_release); + } else { + /* this "consumes" the reference */ + list_add_tail(&res->list, &dev->bss_list); + rb_insert_bss(dev, res); + found = res; + } + + dev->bss_generation++; + spin_unlock_bh(&dev->bss_lock); + + kref_get(&found->ref); + return found; +} + +struct cfg80211_bss * +cfg80211_inform_bss_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp) +{ + struct cfg80211_internal_bss *res; + size_t ielen = len - offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); + bool overwrite; + size_t privsz = wiphy->bss_priv_size; + + if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC && + (signal < 0 || signal > 100))) + return NULL; + + if (WARN_ON(!mgmt || !wiphy || + len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) + return NULL; + + res = kzalloc(sizeof(*res) + privsz + ielen, gfp); + if (!res) + return NULL; + + memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN); + res->pub.channel = channel; + res->pub.signal = signal; + res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); + res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); + /* point to after the private area */ + res->pub.information_elements = (u8 *)res + sizeof(*res) + privsz; + memcpy(res->pub.information_elements, mgmt->u.probe_resp.variable, ielen); + res->pub.len_information_elements = ielen; + + kref_init(&res->ref); + + overwrite = ieee80211_is_probe_resp(mgmt->frame_control); + + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res, overwrite); + if (!res) + return NULL; + + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + + /* cfg80211_bss_update gives us a referenced result */ + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_inform_bss_frame); + +void cfg80211_put_bss(struct cfg80211_bss *pub) +{ + struct cfg80211_internal_bss *bss; + + if (!pub) + return; + + bss = container_of(pub, struct cfg80211_internal_bss, pub); + kref_put(&bss->ref, bss_release); +} +EXPORT_SYMBOL(cfg80211_put_bss); + +void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss; + + if (WARN_ON(!pub)) + return; + + bss = container_of(pub, struct cfg80211_internal_bss, pub); + + spin_lock_bh(&dev->bss_lock); + + list_del(&bss->list); + rb_erase(&bss->rbn, &dev->bss_tree); + + spin_unlock_bh(&dev->bss_lock); + + kref_put(&bss->ref, bss_release); +} +EXPORT_SYMBOL(cfg80211_unlink_bss); + +#ifdef CONFIG_WIRELESS_EXT +int cfg80211_wext_siwscan(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + struct cfg80211_registered_device *rdev; + struct wiphy *wiphy; + struct iw_scan_req *wreq = NULL; + struct cfg80211_scan_request *creq; + int i, err, n_channels = 0; + enum ieee80211_band band; + + if (!netif_running(dev)) + return -ENETDOWN; + + rdev = cfg80211_get_dev_from_ifindex(dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + err = -EBUSY; + goto out; + } + + wiphy = &rdev->wiphy; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + + creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + + n_channels * sizeof(void *), + GFP_ATOMIC); + if (!creq) { + err = -ENOMEM; + goto out; + } + + creq->wiphy = wiphy; + creq->ifidx = dev->ifindex; + creq->ssids = (void *)(creq + 1); + creq->channels = (void *)(creq->ssids + 1); + creq->n_channels = n_channels; + creq->n_ssids = 1; + + /* all channels */ + i = 0; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + creq->channels[i] = &wiphy->bands[band]->channels[j]; + i++; + } + } + + /* translate scan request */ + if (wrqu->data.length == sizeof(struct iw_scan_req)) { + wreq = (struct iw_scan_req *)extra; + + if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { + if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); + creq->ssids[0].ssid_len = wreq->essid_len; + } + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + creq->n_ssids = 0; + } + + rdev->scan_req = creq; + err = rdev->ops->scan(wiphy, dev, creq); + if (err) { + rdev->scan_req = NULL; + kfree(creq); + } + out: + cfg80211_put_dev(rdev); + return err; +} +EXPORT_SYMBOL(cfg80211_wext_siwscan); + +static void ieee80211_scan_add_ies(struct iw_request_info *info, + struct cfg80211_bss *bss, + char **current_ev, char *end_buf) +{ + u8 *pos, *end, *next; + struct iw_event iwe; + + if (!bss->information_elements || + !bss->len_information_elements) + return; + + /* + * If needed, fragment the IEs buffer (at IE boundaries) into short + * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. + */ + pos = bss->information_elements; + end = pos + bss->len_information_elements; + + while (end - pos > IW_GENERIC_IE_MAX) { + next = pos + 2 + pos[1]; + while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) + next = next + 2 + next[1]; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = next - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + + pos = next; + } + + if (end > pos) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = end - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + } +} + +static inline unsigned int elapsed_jiffies_msecs(unsigned long start) +{ + unsigned long end = jiffies; + + if (end >= start) + return jiffies_to_msecs(end - start); + + return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); +} + +static char * +ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, + struct cfg80211_internal_bss *bss, char *current_ev, + char *end_buf) +{ + struct iw_event iwe; + u8 *buf, *cfg, *p; + u8 *ie = bss->pub.information_elements; + int rem = bss->pub.len_information_elements, i, sig; + bool ismesh = false; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWAP; + iwe.u.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_ADDR_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); + iwe.u.freq.e = 0; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = bss->pub.channel->center_freq; + iwe.u.freq.e = 6; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVQUAL; + iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | + IW_QUAL_NOISE_INVALID | + IW_QUAL_QUAL_UPDATED; + switch (wiphy->signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + sig = bss->pub.signal / 100; + iwe.u.qual.level = sig; + iwe.u.qual.updated |= IW_QUAL_DBM; + if (sig < -110) /* rather bad */ + sig = -110; + else if (sig > -40) /* perfect */ + sig = -40; + /* will give a range of 0 .. 70 */ + iwe.u.qual.qual = sig + 110; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + iwe.u.qual.level = bss->pub.signal; + /* will give range 0 .. 100 */ + iwe.u.qual.qual = bss->pub.signal; + break; + default: + /* not reached */ + break; + } + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_QUAL_LEN); + } + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWENCODE; + if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) + iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; + else + iwe.u.data.flags = IW_ENCODE_DISABLED; + iwe.u.data.length = 0; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ""); + + while (rem >= 2) { + /* invalid data */ + if (ie[1] > rem - 2) + break; + + switch (ie[0]) { + case WLAN_EID_SSID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_ID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_CONFIG: + ismesh = true; + if (ie[1] != IEEE80211_MESH_CONFIG_LEN) + break; + buf = kmalloc(50, GFP_ATOMIC); + if (!buf) + break; + cfg = ie + 2; + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "Mesh network (version %d)", cfg[0]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Path Selection Protocol ID: " + "0x%02X%02X%02X%02X", cfg[1], cfg[2], cfg[3], + cfg[4]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Path Selection Metric ID: " + "0x%02X%02X%02X%02X", cfg[5], cfg[6], cfg[7], + cfg[8]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Congestion Control Mode ID: " + "0x%02X%02X%02X%02X", cfg[9], cfg[10], + cfg[11], cfg[12]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Channel Precedence: " + "0x%02X%02X%02X%02X", cfg[13], cfg[14], + cfg[15], cfg[16]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + kfree(buf); + break; + case WLAN_EID_SUPP_RATES: + case WLAN_EID_EXT_SUPP_RATES: + /* display all supported rates in readable format */ + p = current_ev + iwe_stream_lcp_len(info); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWRATE; + /* Those two flags are ignored... */ + iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; + + for (i = 0; i < ie[1]; i++) { + iwe.u.bitrate.value = + ((ie[i + 2] & 0x7f) * 500000); + p = iwe_stream_add_value(info, current_ev, p, + end_buf, &iwe, IW_EV_PARAM_LEN); + } + current_ev = p; + break; + } + rem -= ie[1] + 2; + ie += ie[1] + 2; + } + + if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) + || ismesh) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWMODE; + if (ismesh) + iwe.u.mode = IW_MODE_MESH; + else if (bss->pub.capability & WLAN_CAPABILITY_ESS) + iwe.u.mode = IW_MODE_MASTER; + else + iwe.u.mode = IW_MODE_ADHOC; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_UINT_LEN); + } + + buf = kmalloc(30, GFP_ATOMIC); + if (buf) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->pub.tsf)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, buf); + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, " Last beacon: %ums ago", + elapsed_jiffies_msecs(bss->ts)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, &iwe, buf); + kfree(buf); + } + + ieee80211_scan_add_ies(info, &bss->pub, ¤t_ev, end_buf); + + return current_ev; +} + + +static int ieee80211_scan_results(struct cfg80211_registered_device *dev, + struct iw_request_info *info, + char *buf, size_t len) +{ + char *current_ev = buf; + char *end_buf = buf + len; + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&dev->bss_lock); + cfg80211_bss_expire(dev); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (buf + len - current_ev <= IW_EV_ADDR_LEN) { + spin_unlock_bh(&dev->bss_lock); + return -E2BIG; + } + current_ev = ieee80211_bss(&dev->wiphy, info, bss, + current_ev, end_buf); + } + spin_unlock_bh(&dev->bss_lock); + return current_ev - buf; +} + + +int cfg80211_wext_giwscan(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct cfg80211_registered_device *rdev; + int res; + + if (!netif_running(dev)) + return -ENETDOWN; + + rdev = cfg80211_get_dev_from_ifindex(dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + res = -EAGAIN; + goto out; + } + + res = ieee80211_scan_results(rdev, info, extra, data->length); + data->length = 0; + if (res >= 0) { + data->length = res; + res = 0; + } + + out: + cfg80211_put_dev(rdev); + return res; +} +EXPORT_SYMBOL(cfg80211_wext_giwscan); +#endif diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 79a382877641..efe3c5c92b2d 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -31,7 +31,7 @@ static ssize_t name ## _show(struct device *dev, \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } -SHOW_FMT(index, "%d", idx); +SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); static struct device_attribute ieee80211_dev_attrs[] = { @@ -55,6 +55,41 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) } #endif +static int wiphy_suspend(struct device *dev, pm_message_t state) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + rdev->suspend_at = get_seconds(); + + if (rdev->ops->suspend) { + rtnl_lock(); + ret = rdev->ops->suspend(&rdev->wiphy); + rtnl_unlock(); + } + + return ret; +} + +static int wiphy_resume(struct device *dev) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + /* Age scan results with time spent in suspend */ + spin_lock_bh(&rdev->bss_lock); + cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + spin_unlock_bh(&rdev->bss_lock); + + if (rdev->ops->resume) { + rtnl_lock(); + ret = rdev->ops->resume(&rdev->wiphy); + rtnl_unlock(); + } + + return ret; +} + struct class ieee80211_class = { .name = "ieee80211", .owner = THIS_MODULE, @@ -63,6 +98,8 @@ struct class ieee80211_class = { #ifdef CONFIG_HOTPLUG .dev_uevent = wiphy_uevent, #endif + .suspend = wiphy_suspend, + .resume = wiphy_resume, }; int wiphy_sysfs_init(void) diff --git a/net/wireless/util.c b/net/wireless/util.c index e76cc28b0345..487cdd9bcffc 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -9,7 +9,7 @@ struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, - u64 basic_rates, int bitrate) + u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 58e489fd4aed..b84a9b4fe96a 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -137,3 +137,100 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, return 0; } EXPORT_SYMBOL(cfg80211_wext_giwmode); + + +int cfg80211_wext_giwrange(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct iw_range *range = (struct iw_range *) extra; + enum ieee80211_band band; + int c = 0; + + if (!wdev) + return -EOPNOTSUPP; + + data->length = sizeof(struct iw_range); + memset(range, 0, sizeof(struct iw_range)); + + range->we_version_compiled = WIRELESS_EXT; + range->we_version_source = 21; + range->retry_capa = IW_RETRY_LIMIT; + range->retry_flags = IW_RETRY_LIMIT; + range->min_retry = 0; + range->max_retry = 255; + range->min_rts = 0; + range->max_rts = 2347; + range->min_frag = 256; + range->max_frag = 2346; + + range->encoding_size[0] = 5; + range->encoding_size[1] = 13; + range->num_encoding_sizes = 2; + range->max_encoding_tokens = 4; + + range->max_qual.updated = IW_QUAL_NOISE_INVALID; + + switch (wdev->wiphy->signal_type) { + case CFG80211_SIGNAL_TYPE_NONE: + break; + case CFG80211_SIGNAL_TYPE_MBM: + range->max_qual.level = -110; + range->max_qual.qual = 70; + range->avg_qual.qual = 35; + range->max_qual.updated |= IW_QUAL_DBM; + range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + range->max_qual.level = 100; + range->max_qual.qual = 100; + range->avg_qual.qual = 50; + range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + } + + range->avg_qual.level = range->max_qual.level / 2; + range->avg_qual.noise = range->max_qual.noise / 2; + range->avg_qual.updated = range->max_qual.updated; + + range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 | + IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP; + + + for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { + int i; + struct ieee80211_supported_band *sband; + + sband = wdev->wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { + struct ieee80211_channel *chan = &sband->channels[i]; + + if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { + range->freq[c].i = + ieee80211_frequency_to_channel( + chan->center_freq); + range->freq[c].m = chan->center_freq; + range->freq[c].e = 6; + c++; + } + } + } + range->num_channels = c; + range->num_frequency = c; + + IW_EVENT_CAPA_SET_KERNEL(range->event_capa); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); + + range->scan_capa |= IW_SCAN_CAPA_ESSID; + + return 0; +} +EXPORT_SYMBOL(cfg80211_wext_giwrange); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 9fc5b023d111..9ca17b1ce52e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -951,10 +951,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, /* * Incoming Call User Data. */ - if (skb->len >= 0) { - skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); - makex25->calluserdata.cudlength = skb->len; - } + skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); + makex25->calluserdata.cudlength = skb->len; sk->sk_ack_backlog++; @@ -1122,8 +1120,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { - len = x25_output(sk, skb); - if (len < 0) + rc = x25_output(sk, skb); + len = rc; + if (rc < 0) kfree_skb(skb); else if (x25->qbitincl) len++; @@ -1608,8 +1607,8 @@ static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { SOCKOPS_WRAP(x25_proto, AF_X25); -static struct packet_type x25_packet_type = { - .type = __constant_htons(ETH_P_X25), +static struct packet_type x25_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; |