diff options
Diffstat (limited to 'net/netfilter')
40 files changed, 1214 insertions, 495 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8593a77cfea9..43288259f4a1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -40,27 +40,6 @@ config NF_CONNTRACK if NF_CONNTRACK -config NF_CT_ACCT - bool "Connection tracking flow accounting" - depends on NETFILTER_ADVANCED - help - If this option is enabled, the connection tracking code will - keep per-flow packet and byte counters. - - Those counters can be used for flow-based accounting or the - `connbytes' match. - - Please note that currently this option only sets a default state. - You may change it at boot time with nf_conntrack.acct=0/1 kernel - parameter or by loading the nf_conntrack module with acct=0/1. - - You may also disable/enable it on a running system with: - sysctl net.netfilter.nf_conntrack_acct=0/1 - - This option will be removed in 2.6.29. - - If unsure, say `N'. - config NF_CONNTRACK_MARK bool 'Connection mark tracking support' depends on NETFILTER_ADVANCED @@ -347,6 +326,22 @@ config NETFILTER_XT_CONNMARK comment "Xtables targets" +config NETFILTER_XT_TARGET_CHECKSUM + tristate "CHECKSUM target support" + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds a `CHECKSUM' target, which can be used in the iptables mangle + table. + + You can use this target to compute and fill in the checksum in + a packet that lacks a checksum. This is particularly useful, + if you need to work around old applications such as dhcp clients, + that do not work well with checksum offloads, but don't want to disable + checksum offload in your device. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_CLASSIFY tristate '"CLASSIFY" target support' depends on NETFILTER_ADVANCED @@ -424,6 +419,18 @@ config NETFILTER_XT_TARGET_HL since you can easily create immortal packets that loop forever on the network. +config NETFILTER_XT_TARGET_IDLETIMER + tristate "IDLETIMER target support" + depends on NETFILTER_ADVANCED + help + + This option adds the `IDLETIMER' target. Each matching packet + resets the timer associated with label specified when the rule is + added. When the timer expires, it triggers a sysfs notification. + The remaining time for expiration can be read via sysfs. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_LED tristate '"LED" target support' depends on LEDS_CLASS && LEDS_TRIGGERS @@ -503,7 +510,7 @@ config NETFILTER_XT_TARGET_RATEEST To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_TEE - tristate '"TEE" - packet cloning to alternate destiantion' + tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED depends on (IPV6 || IPV6=n) depends on !NF_CONNTRACK || NF_CONNTRACK @@ -618,7 +625,6 @@ config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' depends on NF_CONNTRACK depends on NETFILTER_ADVANCED - select NF_CT_ACCT help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -657,6 +663,15 @@ config NETFILTER_XT_MATCH_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_CPU + tristate '"cpu" match support' + depends on NETFILTER_ADVANCED + help + CPU matching allows you to match packets based on the CPU + currently handling the packet. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_DCCP tristate '"dccp" protocol match support' depends on NETFILTER_ADVANCED @@ -736,6 +751,16 @@ config NETFILTER_XT_MATCH_IPRANGE If unsure, say M. +config NETFILTER_XT_MATCH_IPVS + tristate '"ipvs" match support' + depends on IP_VS + depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK + help + This option allows you to match against IPVS properties of a packet. + + If unsure, say N. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 14e3a8fd8180..441050f31111 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o @@ -61,6 +62,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o @@ -68,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o @@ -75,6 +78,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 712ccad13344..46a77d5c3887 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -3,7 +3,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER + depends on NET && INET && NETFILTER && NF_CONNTRACK ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -26,7 +26,7 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + depends on IPV6 = y || IP_VS = IPV6 ---help--- Add IPv6 support to IPVS. This is incomplete and might be dangerous. @@ -87,19 +87,16 @@ config IP_VS_PROTO_UDP protocol. Say Y if unsure. config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED + def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH config IP_VS_PROTO_ESP bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. @@ -238,7 +235,7 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP + depends on IP_VS_PROTO_TCP && NF_NAT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 1cb0e834f8ff..e76f87f4aca8 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -569,49 +569,6 @@ static const struct file_operations ip_vs_app_fops = { }; #endif - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - int __init ip_vs_app_init(void) { /* we will replace it with proc_net_ipvs_create() soon */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index ff04e9edbed6..b71c69a2db13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -158,6 +158,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) unsigned hash; int ret; + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + /* Hash by protocol, client address and port */ hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); @@ -268,6 +271,29 @@ struct ip_vs_conn *ip_vs_conn_in_get return cp; } +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + else + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, @@ -353,14 +379,37 @@ struct ip_vs_conn *ip_vs_conn_out_get return ret; } +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + else + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); /* * Put back the conn and restart its timer with its timeout */ void ip_vs_conn_put(struct ip_vs_conn *cp) { - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); __ip_vs_conn_put(cp); } @@ -653,7 +702,7 @@ static void ip_vs_conn_expire(unsigned long data) /* * unhash it if it is hashed in the conn table */ - if (!ip_vs_conn_unhash(cp)) + if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) goto expire_later; /* diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 1cd6e3fd058b..4f8ddba48011 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -54,7 +54,6 @@ EXPORT_SYMBOL(register_ip_vs_scheduler); EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); EXPORT_SYMBOL(ip_vs_proto_name); EXPORT_SYMBOL(ip_vs_conn_new); EXPORT_SYMBOL(ip_vs_conn_in_get); @@ -194,6 +193,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ + __be16 flags; union nf_inet_addr snet; /* source network of the client, after masking */ @@ -340,6 +340,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dport = ports[1]; } + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + /* * Create a new connection according to the template */ @@ -347,7 +351,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, &iph.saddr, ports[0], &iph.daddr, ports[1], &dest->addr, dport, - 0, + flags, dest); if (cp == NULL) { ip_vs_conn_put(ct); @@ -377,7 +381,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; + __be16 _ports[2], *pptr, flags; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); @@ -407,6 +411,10 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; } + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + /* * Create a connection entry. */ @@ -414,7 +422,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) &iph.saddr, pptr[0], &iph.daddr, pptr[1], &dest->addr, dest->port ? dest->port : pptr[1], - 0, + flags, dest); if (cp == NULL) return NULL; @@ -464,6 +472,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; + __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && + iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; ip_vs_service_put(svc); @@ -474,7 +485,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, &iph.saddr, pptr[0], &iph.daddr, pptr[1], &daddr, 0, - IP_VS_CONN_F_BYPASS, + IP_VS_CONN_F_BYPASS | flags, NULL); if (cp == NULL) return NF_DROP; @@ -524,26 +535,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -1487,14 +1478,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1523,14 +1506,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 36dc1d88c2fa..0f0c079c422a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1864,14 +1864,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) svc->scheduler->name); else #endif - seq_printf(seq, "%s %08X:%04X %s ", + seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - svc->scheduler->name); + svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); + seq_printf(seq, "FWM %08X %s %s", + svc->fwmark, svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 2ae747a376a5..f228a17ec649 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -20,6 +20,17 @@ * * Author: Wouter Gadeyne * + * + * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from + * http://www.ssi.bg/~ja/nfct/: + * + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2008 + * Julian Anastasov */ #define KMSG_COMPONENT "IPVS" @@ -32,6 +43,9 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> @@ -43,6 +57,16 @@ #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -123,6 +147,119 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, return 1; } +/* + * Called from init_conntrack() as expectfn handler. + */ +static void +ip_vs_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + */ +static void +ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 *port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + BUG_ON(!ct || ct == &nf_conntrack_untracked); + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + if (from_rs) + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), &cp->daddr, &cp->caddr, + proto, port, &cp->cport); + else + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), &cp->caddr, &cp->vaddr, + proto, port, &cp->vport); + + exp->expectfn = ip_vs_expect_callback; + + IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} /* * Look at outgoing ftp packets to catch the response to a PASV command @@ -149,7 +286,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ unsigned buf_len; - int ret; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -219,19 +358,26 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, buf_len = strlen(buf); + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) { + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + start-data, end-start, + buf, buf_len); + if (ret) + ip_vs_expect_related(skb, ct, n_cp, + IPPROTO_TCP, NULL, 0); + } + /* - * Calculate required delta-offset to keep TCP happy + * Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } cp->app_data = NULL; ip_vs_tcp_conn_listen(n_cp); @@ -263,6 +409,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; + struct nf_conn *ct; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -349,6 +496,11 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_control_add(n_cp, cp); } + ct = (struct nf_conn *)skb->nfct; + if (ct && ct != &nf_conntrack_untracked) + ip_vs_expect_related(skb, ct, n_cp, + IPPROTO_TCP, &n_cp->dport, 1); + /* * Move tunnel to listen state */ diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 2d3d5e4b35f8..027f654799fe 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) return NULL; } +EXPORT_SYMBOL(ip_vs_proto_get); /* diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index c9a3f7a21d53..4c0855cb006e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -8,55 +8,6 @@ #include <net/sctp/checksum.h> #include <net/ip_vs.h> - -static struct ip_vs_conn * -sctp_conn_in_get(int af, - const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); -} - -static struct ip_vs_conn * -sctp_conn_out_get(int af, - const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); -} - static int sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -173,7 +124,7 @@ sctp_dnat_handler(struct sk_buff *skb, return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!ip_vs_app_pkt_in(cp, skb)) return 0; } @@ -1169,8 +1120,8 @@ struct ip_vs_protocol ip_vs_protocol_sctp = { .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, .conn_schedule = sctp_conn_schedule, - .conn_in_get = sctp_conn_in_get, - .conn_out_get = sctp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, .csum_check = sctp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 91d28e073742..282d24de8592 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -27,52 +27,6 @@ #include <net/ip_vs.h> - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - static int tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -721,8 +675,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .csum_check = tcp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e7a6885e0167..8553231b5d41 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -27,58 +27,6 @@ #include <net/ip.h> #include <net/ip6_checksum.h> -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - static int udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -520,8 +468,8 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .init = udp_init, .exit = udp_exit, .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, .csum_check = udp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 93c15a107b2c..21e1a5e9b9d3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -28,6 +28,7 @@ #include <net/ip6_route.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> @@ -90,10 +91,10 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) &dest->addr.ip); return NULL; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", &dest->addr.ip, - atomic_read(&rt->u.dst.__refcnt), rtos); + atomic_read(&rt->dst.__refcnt), rtos); } spin_unlock(&dest->dst_lock); } else { @@ -148,10 +149,10 @@ __ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) &dest->addr.in6); return NULL; } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", &dest->addr.in6, - atomic_read(&rt->u.dst.__refcnt)); + atomic_read(&rt->dst.__refcnt)); } spin_unlock(&dest->dst_lock); } else { @@ -198,7 +199,7 @@ do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ + (rt)->dst.dev, dst_output); \ } while (0) @@ -245,7 +246,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -265,7 +266,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -309,9 +310,9 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; @@ -323,13 +324,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); return NF_STOLEN; } /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -348,6 +349,30 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif +static void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct nf_conn *ct = (struct nf_conn *)skb->nfct; + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + new_tuple.src.u3 = cp->daddr; + /* + * This will also take care of UDP and other protocols. + */ + new_tuple.src.u.tcp.port = cp->dport; + nf_conntrack_alter_reply(ct, &new_tuple); +} + /* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP @@ -376,7 +401,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -388,12 +413,12 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, sizeof(struct iphdr))) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -403,6 +428,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + ip_vs_update_conntrack(skb, cp); + /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -452,9 +479,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); @@ -465,12 +492,12 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) @@ -479,6 +506,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + ip_vs_update_conntrack(skb, cp); + /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -498,7 +527,7 @@ tx_error: kfree_skb(skb); return NF_STOLEN; tx_error_put: - dst_release(&rt->u.dst); + dst_release(&rt->dst); goto tx_error; } #endif @@ -549,9 +578,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) goto tx_error_icmp; - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { ip_rt_put(rt); IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); @@ -601,7 +630,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -615,7 +644,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -660,12 +689,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!rt) goto tx_error_icmp; - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); /* TODO IPv6: do we need this check in IPv6? */ if (mtu < 1280) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); goto tx_error; } @@ -674,7 +703,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -689,7 +718,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); kfree_skb(skb); IP_VS_ERR_RL("%s(): no memory\n", __func__); return NF_STOLEN; @@ -707,7 +736,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -760,7 +789,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -780,7 +809,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -813,10 +842,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->u.dst); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -827,13 +856,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); return NF_STOLEN; } /* drop old route */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; @@ -888,7 +917,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -900,12 +929,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, offset)) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop the old route when skb is not shared */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_vs_nat_icmp(skb, pp, cp, 0); @@ -963,9 +992,9 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_icmp; /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); + mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->u.dst); + dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; @@ -975,12 +1004,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (!skb_make_writable(skb, offset)) goto tx_error_put; - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* drop the old route when skb is not shared */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_vs_nat_icmp_v6(skb, pp, cp, 0); @@ -1001,7 +1030,7 @@ out: LeaveFunction(10); return rc; tx_error_put: - dst_release(&rt->u.dst); + dst_release(&rt->dst); goto tx_error; } #endif diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index ab81b380eae6..5178c691ecbf 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -17,13 +17,7 @@ #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> -#ifdef CONFIG_NF_CT_ACCT -#define NF_CT_ACCT_DEFAULT 1 -#else -#define NF_CT_ACCT_DEFAULT 0 -#endif - -static int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT; +static int nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); @@ -114,12 +108,6 @@ int nf_conntrack_acct_init(struct net *net) net->ct.sysctl_acct = nf_ct_acct; if (net_eq(net, &init_net)) { -#ifdef CONFIG_NF_CT_ACCT - printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Please use\n"); - printk(KERN_WARNING "nf_conntrack.acct=1 kernel parameter, acct=1 nf_conntrack module option or\n"); - printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n"); -#endif - ret = nf_ct_extend_register(&acct_extend); if (ret < 0) { printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index eeeb8bc73982..df3eedb142ff 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -62,8 +62,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -struct nf_conn nf_conntrack_untracked __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_untracked); +DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -619,9 +619,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); -#ifdef CONFIG_NET_NS - ct->ct_net = net; -#endif + write_pnet(&ct->ct_net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES if (zone) { struct nf_conntrack_zone *nf_ct_zone; @@ -968,8 +966,7 @@ acct: if (acct) { spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; - acct[CTINFO2DIR(ctinfo)].bytes += - skb->len - skb_network_offset(skb); + acct[CTINFO2DIR(ctinfo)].bytes += skb->len; spin_unlock_bh(&ct->lock); } } @@ -1183,10 +1180,21 @@ static void nf_ct_release_dying_list(struct net *net) spin_unlock_bh(&nf_conntrack_lock); } +static int untrack_refs(void) +{ + int cnt = 0, cpu; + + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + + cnt += atomic_read(&ct->ct_general.use) - 1; + } + return cnt; +} + static void nf_conntrack_cleanup_init_net(void) { - /* wait until all references to nf_conntrack_untracked are dropped */ - while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + while (untrack_refs() > 0) schedule(); nf_conntrack_helper_fini(); @@ -1321,10 +1329,19 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); +void nf_ct_untracked_status_or(unsigned long bits) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(nf_conntrack_untracked, cpu).status |= bits; +} +EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + static int nf_conntrack_init_init_net(void) { int max_factor = 8; - int ret; + int ret, cpu; /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1363,13 +1380,13 @@ static int nf_conntrack_init_init_net(void) goto err_extend; #endif /* Set up fake conntrack: to never be deleted, not in any hashes */ -#ifdef CONFIG_NET_NS - nf_conntrack_untracked.ct_net = &init_net; -#endif - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + write_pnet(&ct->ct_net, &init_net); + atomic_set(&ct->ct_general.use, 1); + } /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); - + nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); return 0; #ifdef CONFIG_NF_CONNTRACK_ZONES diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index fdc8fb4ae10f..7dcf7a404190 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -23,9 +23,10 @@ void __nf_ct_ext_destroy(struct nf_conn *ct) { unsigned int i; struct nf_ct_ext_type *t; + struct nf_ct_ext *ext = ct->ext; for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!nf_ct_ext_exist(ct, i)) + if (!__nf_ct_ext_exist(ext, i)) continue; rcu_read_lock(); @@ -73,44 +74,45 @@ static void __nf_ct_ext_free_rcu(struct rcu_head *head) void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { - struct nf_ct_ext *new; + struct nf_ct_ext *old, *new; int i, newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); - if (!ct->ext) + old = ct->ext; + if (!old) return nf_ct_ext_create(&ct->ext, id, gfp); - if (nf_ct_ext_exist(ct, id)) + if (__nf_ct_ext_exist(old, id)) return NULL; rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[id]); BUG_ON(t == NULL); - newoff = ALIGN(ct->ext->len, t->align); + newoff = ALIGN(old->len, t->align); newlen = newoff + t->len; rcu_read_unlock(); - new = __krealloc(ct->ext, newlen, gfp); + new = __krealloc(old, newlen, gfp); if (!new) return NULL; - if (new != ct->ext) { + if (new != old) { for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!nf_ct_ext_exist(ct, i)) + if (!__nf_ct_ext_exist(old, i)) continue; rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[i]); if (t && t->move) t->move((void *)new + new->offset[i], - (void *)ct->ext + ct->ext->offset[i]); + (void *)old + old->offset[i]); rcu_read_unlock(); } - call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu); + call_rcu(&old->rcu, __nf_ct_ext_free_rcu); ct->ext = new; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 6eaee7c8a337..b969025cf82f 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -734,11 +734,11 @@ static int callforward_do_filter(const union nf_inet_addr *src, if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (rt1->rt_gateway == rt2->rt_gateway && - rt1->u.dst.dev == rt2->u.dst.dev) + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } @@ -753,11 +753,11 @@ static int callforward_do_filter(const union nf_inet_addr *src, if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && - rt1->u.dst.dev == rt2->u.dst.dev) + rt1->dst.dev == rt2->dst.dev) ret = 1; - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); } - dst_release(&rt1->u.dst); + dst_release(&rt1->dst); } break; } diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 497b2224536f..aadde018a072 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -61,7 +61,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, goto out; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { for_primary_ifa(in_dev) { if (ifa->ifa_broadcast == iph->daddr) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c42ff6aa441d..5bae1cd15eea 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -480,7 +480,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) int err; /* ignore our fake conntrack entry */ - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return 0; if (events & (1 << IPCT_DESTROY)) { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 9dd8cd4fb6e6..c4c885dca3bd 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -585,8 +585,16 @@ static bool tcp_in_window(const struct nf_conn *ct, * Let's try to use the data from the packet. */ sender->td_end = end; + win <<= sender->td_scale; sender->td_maxwin = (win == 0 ? 1 : win); sender->td_maxend = end + sender->td_maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->td_maxwin == 0) + receiver->td_end = receiver->td_maxend = sack; } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) @@ -680,7 +688,7 @@ static bool tcp_in_window(const struct nf_conn *ct, /* * Update receiver data. */ - if (after(end, sender->td_maxend)) + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; @@ -736,27 +744,19 @@ static bool tcp_in_window(const struct nf_conn *ct, return res; } -#define TH_FIN 0x01 -#define TH_SYN 0x02 -#define TH_RST 0x04 -#define TH_PUSH 0x08 -#define TH_ACK 0x10 -#define TH_URG 0x20 -#define TH_ECE 0x40 -#define TH_CWR 0x80 - /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ -static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| + TCPHDR_URG) + 1] = { - [TH_SYN] = 1, - [TH_SYN|TH_URG] = 1, - [TH_SYN|TH_ACK] = 1, - [TH_RST] = 1, - [TH_RST|TH_ACK] = 1, - [TH_FIN|TH_ACK] = 1, - [TH_FIN|TH_ACK|TH_URG] = 1, - [TH_ACK] = 1, - [TH_ACK|TH_URG] = 1, + [TCPHDR_SYN] = 1, + [TCPHDR_SYN|TCPHDR_URG] = 1, + [TCPHDR_SYN|TCPHDR_ACK] = 1, + [TCPHDR_RST] = 1, + [TCPHDR_RST|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, + [TCPHDR_ACK] = 1, + [TCPHDR_ACK|TCPHDR_URG] = 1, }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ @@ -803,7 +803,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, } /* Check TCP flags. */ - tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); + tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fc9a211e629e..6a1572b0ab41 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -66,9 +66,10 @@ struct nfulnl_instance { u_int16_t group_num; /* number of this queue */ u_int16_t flags; u_int8_t copy_mode; + struct rcu_head rcu; }; -static DEFINE_RWLOCK(instances_lock); +static DEFINE_SPINLOCK(instances_lock); static atomic_t global_seq; #define INSTANCE_BUCKETS 16 @@ -88,7 +89,7 @@ __instance_lookup(u_int16_t group_num) struct nfulnl_instance *inst; head = &instance_table[instance_hashfn(group_num)]; - hlist_for_each_entry(inst, pos, head, hlist) { + hlist_for_each_entry_rcu(inst, pos, head, hlist) { if (inst->group_num == group_num) return inst; } @@ -106,22 +107,26 @@ instance_lookup_get(u_int16_t group_num) { struct nfulnl_instance *inst; - read_lock_bh(&instances_lock); + rcu_read_lock_bh(); inst = __instance_lookup(group_num); - if (inst) - instance_get(inst); - read_unlock_bh(&instances_lock); + if (inst && !atomic_inc_not_zero(&inst->use)) + inst = NULL; + rcu_read_unlock_bh(); return inst; } +static void nfulnl_instance_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct nfulnl_instance, rcu)); + module_put(THIS_MODULE); +} + static void instance_put(struct nfulnl_instance *inst) { - if (inst && atomic_dec_and_test(&inst->use)) { - kfree(inst); - module_put(THIS_MODULE); - } + if (inst && atomic_dec_and_test(&inst->use)) + call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); } static void nfulnl_timer(unsigned long data); @@ -132,7 +137,7 @@ instance_create(u_int16_t group_num, int pid) struct nfulnl_instance *inst; int err; - write_lock_bh(&instances_lock); + spin_lock_bh(&instances_lock); if (__instance_lookup(group_num)) { err = -EEXIST; goto out_unlock; @@ -166,32 +171,37 @@ instance_create(u_int16_t group_num, int pid) inst->copy_mode = NFULNL_COPY_PACKET; inst->copy_range = NFULNL_COPY_RANGE_MAX; - hlist_add_head(&inst->hlist, + hlist_add_head_rcu(&inst->hlist, &instance_table[instance_hashfn(group_num)]); - write_unlock_bh(&instances_lock); + spin_unlock_bh(&instances_lock); return inst; out_unlock: - write_unlock_bh(&instances_lock); + spin_unlock_bh(&instances_lock); return ERR_PTR(err); } static void __nfulnl_flush(struct nfulnl_instance *inst); +/* called with BH disabled */ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ - hlist_del(&inst->hlist); + hlist_del_rcu(&inst->hlist); /* then flush all pending packets from skb */ - spin_lock_bh(&inst->lock); + spin_lock(&inst->lock); + + /* lockless readers wont be able to use us */ + inst->copy_mode = NFULNL_COPY_DISABLED; + if (inst->skb) __nfulnl_flush(inst); - spin_unlock_bh(&inst->lock); + spin_unlock(&inst->lock); /* and finally put the refcount */ instance_put(inst); @@ -200,9 +210,9 @@ __instance_destroy(struct nfulnl_instance *inst) static inline void instance_destroy(struct nfulnl_instance *inst) { - write_lock_bh(&instances_lock); + spin_lock_bh(&instances_lock); __instance_destroy(inst); - write_unlock_bh(&instances_lock); + spin_unlock_bh(&instances_lock); } static int @@ -403,8 +413,9 @@ __build_packet_message(struct nfulnl_instance *inst, NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)); /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -430,8 +441,9 @@ __build_packet_message(struct nfulnl_instance *inst, NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ @@ -619,6 +631,7 @@ nfulnl_log_packet(u_int8_t pf, size += nla_total_size(data_len); break; + case NFULNL_COPY_DISABLED: default: goto unlock_and_release; } @@ -672,7 +685,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this, int i; /* destroy all instances for this pid */ - write_lock_bh(&instances_lock); + spin_lock_bh(&instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *tmp, *t2; struct nfulnl_instance *inst; @@ -684,7 +697,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this, __instance_destroy(inst); } } - write_unlock_bh(&instances_lock); + spin_unlock_bh(&instances_lock); } return NOTIFY_DONE; } @@ -861,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st) for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&instance_table[st->bucket])) - return instance_table[st->bucket].first; + return rcu_dereference_bh(instance_table[st->bucket].first); } return NULL; } static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) { - h = h->next; + h = rcu_dereference_bh(h->next); while (!h) { if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = instance_table[st->bucket].first; + h = rcu_dereference_bh(instance_table[st->bucket].first); } return h; } @@ -890,9 +903,9 @@ static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) } static void *seq_start(struct seq_file *seq, loff_t *pos) - __acquires(instances_lock) + __acquires(rcu_bh) { - read_lock_bh(&instances_lock); + rcu_read_lock_bh(); return get_idx(seq->private, *pos); } @@ -903,9 +916,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) } static void seq_stop(struct seq_file *s, void *v) - __releases(instances_lock) + __releases(rcu_bh) { - read_unlock_bh(&instances_lock); + rcu_read_unlock_bh(); } static int seq_show(struct seq_file *s, void *v) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 12e1ab37fcd8..68e67d19724d 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -46,17 +46,19 @@ struct nfqnl_instance { int peer_pid; unsigned int queue_maxlen; unsigned int copy_range; - unsigned int queue_total; unsigned int queue_dropped; unsigned int queue_user_dropped; - unsigned int id_sequence; /* 'sequence' of pkt ids */ u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; - - spinlock_t lock; - +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. + */ + spinlock_t lock; + unsigned int queue_total; + atomic_t id_sequence; /* 'sequence' of pkt ids */ struct list_head queue_list; /* packets in queue */ }; @@ -238,32 +240,24 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, outdev = entry->outdev; - spin_lock_bh(&queue->lock); - - switch ((enum nfqnl_config_mode)queue->copy_mode) { + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; case NFQNL_COPY_PACKET: if (entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) { - spin_unlock_bh(&queue->lock); + skb_checksum_help(entskb)) return NULL; - } - if (queue->copy_range == 0 - || queue->copy_range > entskb->len) + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len == 0 || data_len > entskb->len) data_len = entskb->len; - else - data_len = queue->copy_range; size += nla_total_size(data_len); break; } - entry->id = queue->id_sequence++; - - spin_unlock_bh(&queue->lock); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) @@ -278,6 +272,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(queue->queue_num); + entry->id = atomic_inc_return(&queue->id_sequence); pmsg.packet_id = htonl(entry->id); pmsg.hw_protocol = entskb->protocol; pmsg.hook = entry->hook; @@ -296,8 +291,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)); /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, - htonl(indev->br_port->br->dev->ifindex)); + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -321,8 +317,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)); /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, - htonl(outdev->br_port->br->dev->ifindex)); + htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ @@ -866,7 +863,7 @@ static int seq_show(struct seq_file *s, void *v) inst->peer_pid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, - inst->id_sequence, 1); + atomic_read(&inst->id_sequence), 1); } static const struct seq_operations nfqnl_seq_ops = { diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c new file mode 100644 index 000000000000..0f642ef8cd26 --- /dev/null +++ b/net/netfilter/xt_CHECKSUM.c @@ -0,0 +1,70 @@ +/* iptables module for the packet checksum mangling + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * (C) 2010 Red Hat, Inc. + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CHECKSUM.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>"); +MODULE_DESCRIPTION("Xtables: checksum modification"); +MODULE_ALIAS("ipt_CHECKSUM"); +MODULE_ALIAS("ip6t_CHECKSUM"); + +static unsigned int +checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb_checksum_help(skb); + + return XT_CONTINUE; +} + +static int checksum_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_CHECKSUM_info *einfo = par->targinfo; + + if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { + pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + return -EINVAL; + } + if (!einfo->operation) { + pr_info("no CHECKSUM operation enabled\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target checksum_tg_reg __read_mostly = { + .name = "CHECKSUM", + .family = NFPROTO_UNSPEC, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, +}; + +static int __init checksum_tg_init(void) +{ + return xt_register_target(&checksum_tg_reg); +} + +static void __exit checksum_tg_exit(void) +{ + xt_unregister_target(&checksum_tg_reg); +} + +module_init(checksum_tg_init); +module_exit(checksum_tg_exit); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 562bf3266e04..0cb6053f02fd 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -67,7 +67,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->flags & XT_CT_NOTRACK) { - ct = &nf_conntrack_untracked; + ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); goto out; } @@ -132,7 +132,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (ct != &nf_conntrack_untracked) { + if (!nf_ct_is_untracked(ct)) { help = nfct_help(ct); if (help) module_put(help->helper->me); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c new file mode 100644 index 000000000000..be1f22e13545 --- /dev/null +++ b/net/netfilter/xt_IDLETIMER.c @@ -0,0 +1,315 @@ +/* + * linux/net/netfilter/xt_IDLETIMER.c + * + * Netfilter module to trigger a timer when packet matches. + * After timer expires a kevent will be sent. + * + * Copyright (C) 2004, 2010 Nokia Corporation + * Written by Timo Teras <ext-timo.teras@nokia.com> + * + * Converted to x_tables and reworked for upstream inclusion + * by Luciano Coelho <luciano.coelho@nokia.com> + * + * Contact: Luciano Coelho <luciano.coelho@nokia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_IDLETIMER.h> +#include <linux/kdev_t.h> +#include <linux/kobject.h> +#include <linux/workqueue.h> +#include <linux/sysfs.h> + +struct idletimer_tg_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); +}; + +struct idletimer_tg { + struct list_head entry; + struct timer_list timer; + struct work_struct work; + + struct kobject *kobj; + struct idletimer_tg_attr attr; + + unsigned int refcnt; +}; + +static LIST_HEAD(idletimer_tg_list); +static DEFINE_MUTEX(list_mutex); + +static struct kobject *idletimer_tg_kobj; + +static +struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) +{ + struct idletimer_tg *entry; + + BUG_ON(!label); + + list_for_each_entry(entry, &idletimer_tg_list, entry) { + if (!strcmp(label, entry->attr.attr.name)) + return entry; + } + + return NULL; +} + +static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct idletimer_tg *timer; + unsigned long expires = 0; + + mutex_lock(&list_mutex); + + timer = __idletimer_tg_find_by_label(attr->name); + if (timer) + expires = timer->timer.expires; + + mutex_unlock(&list_mutex); + + if (time_after(expires, jiffies)) + return sprintf(buf, "%u\n", + jiffies_to_msecs(expires - jiffies) / 1000); + + return sprintf(buf, "0\n"); +} + +static void idletimer_tg_work(struct work_struct *work) +{ + struct idletimer_tg *timer = container_of(work, struct idletimer_tg, + work); + + sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); +} + +static void idletimer_tg_expired(unsigned long data) +{ + struct idletimer_tg *timer = (struct idletimer_tg *) data; + + pr_debug("timer %s expired\n", timer->attr.attr.name); + + schedule_work(&timer->work); +} + +static int idletimer_tg_create(struct idletimer_tg_info *info) +{ + int ret; + + info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + if (!info->timer) { + pr_debug("couldn't alloc timer\n"); + ret = -ENOMEM; + goto out; + } + + info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); + if (!info->timer->attr.attr.name) { + pr_debug("couldn't alloc attribute name\n"); + ret = -ENOMEM; + goto out_free_timer; + } + info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.show = idletimer_tg_show; + + ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); + if (ret < 0) { + pr_debug("couldn't add file to sysfs"); + goto out_free_attr; + } + + list_add(&info->timer->entry, &idletimer_tg_list); + + setup_timer(&info->timer->timer, idletimer_tg_expired, + (unsigned long) info->timer); + info->timer->refcnt = 1; + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + + INIT_WORK(&info->timer->work, idletimer_tg_work); + + return 0; + +out_free_attr: + kfree(info->timer->attr.attr.name); +out_free_timer: + kfree(info->timer); +out: + return ret; +} + +/* + * The actual xt_tables plugin. + */ +static unsigned int idletimer_tg_target(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + + pr_debug("resetting timer %s, timeout period %u\n", + info->label, info->timeout); + + BUG_ON(!info->timer); + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + + return XT_CONTINUE; +} + +static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct idletimer_tg_info *info = par->targinfo; + int ret; + + pr_debug("checkentry targinfo%s\n", info->label); + + if (info->timeout == 0) { + pr_debug("timeout value is zero\n"); + return -EINVAL; + } + + if (info->label[0] == '\0' || + strnlen(info->label, + MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { + pr_debug("label is empty or not nul-terminated\n"); + return -EINVAL; + } + + mutex_lock(&list_mutex); + + info->timer = __idletimer_tg_find_by_label(info->label); + if (info->timer) { + info->timer->refcnt++; + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + + pr_debug("increased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } else { + ret = idletimer_tg_create(info); + if (ret < 0) { + pr_debug("failed to create timer\n"); + mutex_unlock(&list_mutex); + return ret; + } + } + + mutex_unlock(&list_mutex); + return 0; +} + +static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + + pr_debug("destroy targinfo %s\n", info->label); + + mutex_lock(&list_mutex); + + if (--info->timer->refcnt == 0) { + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); + del_timer_sync(&info->timer->timer); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); + } else { + pr_debug("decreased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } + + mutex_unlock(&list_mutex); +} + +static struct xt_target idletimer_tg __read_mostly = { + .name = "IDLETIMER", + .family = NFPROTO_UNSPEC, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, +}; + +static struct class *idletimer_tg_class; + +static struct device *idletimer_tg_device; + +static int __init idletimer_tg_init(void) +{ + int err; + + idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer"); + err = PTR_ERR(idletimer_tg_class); + if (IS_ERR(idletimer_tg_class)) { + pr_debug("couldn't register device class\n"); + goto out; + } + + idletimer_tg_device = device_create(idletimer_tg_class, NULL, + MKDEV(0, 0), NULL, "timers"); + err = PTR_ERR(idletimer_tg_device); + if (IS_ERR(idletimer_tg_device)) { + pr_debug("couldn't register system device\n"); + goto out_class; + } + + idletimer_tg_kobj = &idletimer_tg_device->kobj; + + err = xt_register_target(&idletimer_tg); + if (err < 0) { + pr_debug("couldn't register xt target\n"); + goto out_dev; + } + + return 0; +out_dev: + device_destroy(idletimer_tg_class, MKDEV(0, 0)); +out_class: + class_destroy(idletimer_tg_class); +out: + return err; +} + +static void __exit idletimer_tg_exit(void) +{ + xt_unregister_target(&idletimer_tg); + + device_destroy(idletimer_tg_class, MKDEV(0, 0)); + class_destroy(idletimer_tg_class); +} + +module_init(idletimer_tg_init); +module_exit(idletimer_tg_exit); + +MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); +MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); +MODULE_DESCRIPTION("Xtables: idle time monitor"); +MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index 512b9123252f..9d782181b6c8 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -23,7 +23,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 69c01e10f8af..de079abd5bc8 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name) } EXPORT_SYMBOL_GPL(xt_rateest_lookup); +static void xt_rateest_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct xt_rateest, rcu)); +} + void xt_rateest_put(struct xt_rateest *est) { mutex_lock(&xt_rateest_mutex); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->bstats, &est->rstats); - kfree(est); + /* + * gen_estimator est_timer() might access est->lock or bstats, + * wait a RCU grace period before freeing 'est' + */ + call_rcu(&est->rcu, xt_rateest_free_rcu); } mutex_unlock(&xt_rateest_mutex); } @@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void) static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); + rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */ } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 62ec021fbd50..eb81c380da1b 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -165,8 +165,8 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_unlock(); if (rt != NULL) { - mtu = dst_mtu(&rt->u.dst); - dst_release(&rt->u.dst); + mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); } return mtu; } @@ -220,15 +220,13 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif -#define TH_SYN 0x02 - /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && - tcpinfo->flg_cmp & TH_SYN && + tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 859d9fd429c8..22a2d421e7eb 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -77,8 +77,8 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) return false; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); - skb->dev = rt->u.dst.dev; + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; skb->protocol = htons(ETH_P_IP); return true; } @@ -104,7 +104,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) #ifdef WITH_CONNTRACK /* Avoid counting cloned packets towards the original connection. */ nf_conntrack_put(skb->nfct); - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); #endif @@ -177,7 +177,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) #ifdef WITH_CONNTRACK nf_conntrack_put(skb->nfct); - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); #endif diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e1a0dedac258..c61294d85fda 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -37,8 +37,10 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, tgi->lport ? tgi->lport : hp->dest, + iph->saddr, + tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, + tgi->lport ? tgi->lport : hp->dest, par->in, true); /* NOTE: assign_sock consumes our sk reference */ diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 30b95a1c1c89..f4af1bfafb1c 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return false; if (ct->master) diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 73517835303d..5b138506690e 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -112,6 +112,16 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); + + /* + * This filter cannot function correctly unless connection tracking + * accounting is enabled, so complain in the hope that someone notices. + */ + if (!nf_ct_acct_enabled(par->net)) { + pr_warning("Forcing CT accounting to be enabled\n"); + nf_ct_set_acct(par->net, true); + } + return ret; } diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 39681f10291c..e536710ad916 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -123,11 +123,12 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, ct = nf_ct_get(skb, &ctinfo); - if (ct == &nf_conntrack_untracked) - statebit = XT_CONNTRACK_STATE_UNTRACKED; - else if (ct != NULL) - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - else + if (ct) { + if (nf_ct_is_untracked(ct)) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + } else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c new file mode 100644 index 000000000000..b39db8a5cbae --- /dev/null +++ b/net/netfilter/xt_cpu.c @@ -0,0 +1,63 @@ +/* Kernel module to match running CPU */ + +/* + * Might be used to distribute connections on several daemons, if + * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable, + * each RX queue IRQ affined to one CPU (1:1 mapping) + * + */ + +/* (C) 2010 Eric Dumazet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_cpu.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); +MODULE_DESCRIPTION("Xtables: CPU match"); + +static int cpu_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + return 0; +} + +static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + return (info->cpu == smp_processor_id()) ^ info->invert; +} + +static struct xt_match cpu_mt_reg __read_mostly = { + .name = "cpu", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cpu_mt_check, + .match = cpu_mt, + .matchsize = sizeof(struct xt_cpu_info), + .me = THIS_MODULE, +}; + +static int __init cpu_mt_init(void) +{ + return xt_register_match(&cpu_mt_reg); +} + +static void __exit cpu_mt_exit(void) +{ + xt_unregister_match(&cpu_mt_reg); +} + +module_init(cpu_mt_init); +module_exit(cpu_mt_exit); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c new file mode 100644 index 000000000000..7a4d66db95ae --- /dev/null +++ b/net/netfilter/xt_ipvs.c @@ -0,0 +1,189 @@ +/* + * xt_ipvs - kernel module to match IPVS connection properties + * + * Author: Hannes Eder <heder@google.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#endif +#include <linux/ip_vs.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ipvs.h> +#include <net/netfilter/nf_conntrack.h> + +#include <net/ip_vs.h> + +MODULE_AUTHOR("Hannes Eder <heder@google.com>"); +MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ipvs"); +MODULE_ALIAS("ip6t_ipvs"); + +/* borrowed from xt_conntrack */ +static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, + unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; +#ifdef CONFIG_IP_VS_IPV6 + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; +#endif + else + return false; +} + +static bool +ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ipvs_mtinfo *data = par->matchinfo; + /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ + const u_int8_t family = par->family; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + bool match = true; + + if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { + match = skb->ipvs_property ^ + !!(data->invert & XT_IPVS_IPVS_PROPERTY); + goto out; + } + + /* other flags than XT_IPVS_IPVS_PROPERTY are set */ + if (!skb->ipvs_property) { + match = false; + goto out; + } + + ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); + + if (data->bitmask & XT_IPVS_PROTO) + if ((iph.protocol == data->l4proto) ^ + !(data->invert & XT_IPVS_PROTO)) { + match = false; + goto out; + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) { + match = false; + goto out; + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); + if (unlikely(cp == NULL)) { + match = false; + goto out; + } + + /* + * We found a connection, i.e. ct != 0, make sure to call + * __ip_vs_conn_put before returning. In our case jump to out_put_con. + */ + + if (data->bitmask & XT_IPVS_VPORT) + if ((cp->vport == data->vport) ^ + !(data->invert & XT_IPVS_VPORT)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VPORTCTL) + if ((cp->control != NULL && + cp->control->vport == data->vportctl) ^ + !(data->invert & XT_IPVS_VPORTCTL)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_DIR) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct == NULL || nf_ct_is_untracked(ct)) { + match = false; + goto out_put_cp; + } + + if ((ctinfo >= IP_CT_IS_REPLY) ^ + !!(data->invert & XT_IPVS_DIR)) { + match = false; + goto out_put_cp; + } + } + + if (data->bitmask & XT_IPVS_METHOD) + if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ + !(data->invert & XT_IPVS_METHOD)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VADDR) { + if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, + &data->vmask, family) ^ + !(data->invert & XT_IPVS_VADDR)) { + match = false; + goto out_put_cp; + } + } + +out_put_cp: + __ip_vs_conn_put(cp); +out: + pr_debug("match=%d\n", match); + return match; +} + +static int ipvs_mt_check(const struct xt_mtchk_param *par) +{ + if (par->family != NFPROTO_IPV4 +#ifdef CONFIG_IP_VS_IPV6 + && par->family != NFPROTO_IPV6 +#endif + ) { + pr_info("protocol family %u not supported\n", par->family); + return -EINVAL; + } + + return 0; +} + +static struct xt_match xt_ipvs_mt_reg __read_mostly = { + .name = "ipvs", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = ipvs_mt, + .checkentry = ipvs_mt_check, + .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), + .me = THIS_MODULE, +}; + +static int __init ipvs_mt_init(void) +{ + return xt_register_match(&xt_ipvs_mt_reg); +} + +static void __exit ipvs_mt_exit(void) +{ + xt_unregister_match(&xt_ipvs_mt_reg); +} + +module_init(ipvs_mt_init); +module_exit(ipvs_mt_exit); diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index b4f7dfea5980..70eb2b4984dd 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,7 +11,8 @@ #include <linux/netfilter/xt_quota.h> struct xt_quota_priv { - uint64_t quota; + spinlock_t lock; + uint64_t quota; }; MODULE_LICENSE("GPL"); @@ -20,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: countdown quota match"); MODULE_ALIAS("ipt_quota"); MODULE_ALIAS("ip6t_quota"); -static DEFINE_SPINLOCK(quota_lock); - static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -29,7 +28,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par) struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; - spin_lock_bh("a_lock); + spin_lock_bh(&priv->lock); if (priv->quota >= skb->len) { priv->quota -= skb->len; ret = !ret; @@ -37,9 +36,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par) /* we do not allow even small packets from now on */ priv->quota = 0; } - /* Copy quota back to matchinfo so that iptables can display it */ - q->quota = priv->quota; - spin_unlock_bh("a_lock); + spin_unlock_bh(&priv->lock); return ret; } @@ -55,6 +52,7 @@ static int quota_mt_check(const struct xt_mtchk_param *par) if (q->master == NULL) return -ENOMEM; + spin_lock_init(&q->master->lock); q->master->quota = q->quota; return 0; } diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index c04fcf385c59..ef36a56a02c6 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -3,6 +3,7 @@ #include <linux/skbuff.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/sctp/sctp.h> #include <linux/sctp.h> #include <linux/netfilter/x_tables.h> @@ -67,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += (ntohs(sch->length) + 3) & ~3; + offset += WORD_ROUND(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 3d54c236a1ba..1ca89908cbad 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -127,7 +127,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, * reply packet of an established SNAT-ted connection. */ ct = nf_ct_get(skb, &ctinfo); - if (ct && (ct != &nf_conntrack_untracked) && + if (ct && !nf_ct_is_untracked(ct) && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) || (iph->protocol == IPPROTO_ICMP && diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index e12e053d3782..a507922d80cd 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -26,14 +26,16 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (nf_ct_is_untracked(skb)) - statebit = XT_STATE_UNTRACKED; - else if (!nf_ct_get(skb, &ctinfo)) + if (!ct) statebit = XT_STATE_INVALID; - else - statebit = XT_STATE_BIT(ctinfo); - + else { + if (nf_ct_is_untracked(ct)) + statebit = XT_STATE_UNTRACKED; + else + statebit = XT_STATE_BIT(ctinfo); + } return (sinfo->statemask & statebit); } diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 96e62b8fd6b1..42ecb71d445f 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -18,8 +18,8 @@ #include <linux/netfilter/x_tables.h> struct xt_statistic_priv { - uint32_t count; -}; + atomic_t count; +} ____cacheline_aligned_in_smp; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); @@ -27,13 +27,12 @@ MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); -static DEFINE_SPINLOCK(nth_lock); - static bool statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_statistic_info *info = par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; + int nval, oval; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: @@ -41,12 +40,12 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: - spin_lock_bh(&nth_lock); - if (info->master->count++ == info->u.nth.every) { - info->master->count = 0; + do { + oval = atomic_read(&info->master->count); + nval = (oval == info->u.nth.every) ? 0 : oval + 1; + } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); + if (nval == 0) ret = !ret; - } - spin_unlock_bh(&nth_lock); break; } @@ -64,7 +63,7 @@ static int statistic_mt_check(const struct xt_mtchk_param *par) info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); if (info->master == NULL) return -ENOMEM; - info->master->count = info->u.nth.count; + atomic_set(&info->master->count, info->u.nth.count); return 0; } |