1/*
2 * sfe-cm.c
3 *	Shortcut forwarding engine connection manager.
4 *
5 * Copyright (c) 2013-2015 The Linux Foundation. All rights reserved.
6 * Permission to use, copy, modify, and/or distribute this software for
7 * any purpose with or without fee is hereby granted, provided that the
8 * above copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <linux/module.h>
19#include <linux/sysfs.h>
20#include <linux/skbuff.h>
21#include <net/route.h>
22#include <net/ip6_route.h>
23#include <net/addrconf.h>
24#include <net/dsfield.h>
25#include <linux/inetdevice.h>
26#include <linux/netfilter_bridge.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_acct.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <linux/netfilter/xt_dscp.h>
33#include <linux/if_bridge.h>
34
35#include "sfe.h"
36#include "sfe_cm.h"
37#include "sfe_backport.h"
38
/*
 * Reasons why a packet seen by the post-routing hook did not lead to a
 * fast-path rule being created.  Each value is an index into the
 * exceptions[] counters of struct sfe_cm and into the name table below.
 * SFE_CM_EXCEPTION_MAX is the number of reasons and must remain last.
 */
typedef enum sfe_cm_exception {
	SFE_CM_EXCEPTION_PACKET_BROADCAST,
	SFE_CM_EXCEPTION_PACKET_MULTICAST,
	SFE_CM_EXCEPTION_NO_IIF,
	SFE_CM_EXCEPTION_NO_CT,
	SFE_CM_EXCEPTION_CT_NO_TRACK,
	SFE_CM_EXCEPTION_CT_NO_CONFIRM,
	SFE_CM_EXCEPTION_CT_IS_ALG,
	SFE_CM_EXCEPTION_IS_IPV4_MCAST,
	SFE_CM_EXCEPTION_IS_IPV6_MCAST,
	SFE_CM_EXCEPTION_TCP_NOT_ASSURED,
	SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,
	SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,
	SFE_CM_EXCEPTION_NO_SRC_DEV,
	SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,
	SFE_CM_EXCEPTION_NO_DEST_DEV,
	SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,
	SFE_CM_EXCEPTION_NO_BRIDGE,
	SFE_CM_EXCEPTION_LOCAL_OUT,
	SFE_CM_EXCEPTION_MAX
} sfe_cm_exception_t;

/*
 * Human-readable name for every exception reason.
 *
 * Designated initializers bind each string to its enum value, so the table
 * stays correct even if the enum is reordered.  The table is never written,
 * hence const on both the strings and the pointers.
 */
static const char *const sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
	[SFE_CM_EXCEPTION_PACKET_BROADCAST]	= "PACKET_BROADCAST",
	[SFE_CM_EXCEPTION_PACKET_MULTICAST]	= "PACKET_MULTICAST",
	[SFE_CM_EXCEPTION_NO_IIF]		= "NO_IIF",
	[SFE_CM_EXCEPTION_NO_CT]		= "NO_CT",
	[SFE_CM_EXCEPTION_CT_NO_TRACK]		= "CT_NO_TRACK",
	[SFE_CM_EXCEPTION_CT_NO_CONFIRM]	= "CT_NO_CONFIRM",
	[SFE_CM_EXCEPTION_CT_IS_ALG]		= "CT_IS_ALG",
	[SFE_CM_EXCEPTION_IS_IPV4_MCAST]	= "IS_IPV4_MCAST",
	[SFE_CM_EXCEPTION_IS_IPV6_MCAST]	= "IS_IPV6_MCAST",
	[SFE_CM_EXCEPTION_TCP_NOT_ASSURED]	= "TCP_NOT_ASSURED",
	[SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED]	= "TCP_NOT_ESTABLISHED",
	[SFE_CM_EXCEPTION_UNKNOW_PROTOCOL]	= "UNKNOW_PROTOCOL",
	[SFE_CM_EXCEPTION_NO_SRC_DEV]		= "NO_SRC_DEV",
	[SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV]	= "NO_SRC_XLATE_DEV",
	[SFE_CM_EXCEPTION_NO_DEST_DEV]		= "NO_DEST_DEV",
	[SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV]	= "NO_DEST_XLATE_DEV",
	[SFE_CM_EXCEPTION_NO_BRIDGE]		= "NO_BRIDGE",
	[SFE_CM_EXCEPTION_LOCAL_OUT]		= "LOCAL_OUT",
};
81
82/*
83 * Per-module structure.
84 */
85struct sfe_cm {
86	spinlock_t lock;		/* Lock for SMP correctness */
87
88	/*
89	 * Control state.
90	 */
91	struct kobject *sys_sfe_cm;	/* sysfs linkage */
92
93	/*
94	 * Callback notifiers.
95	 */
96	struct notifier_block dev_notifier;
97					/* Device notifier */
98	struct notifier_block inet_notifier;
99					/* IPv4 notifier */
100	struct notifier_block inet6_notifier;
101					/* IPv6 notifier */
102	uint32_t exceptions[SFE_CM_EXCEPTION_MAX];
103};
104
105struct sfe_cm __sc;
106
/*
 * Flag defined outside this file (the original note says it "should be a
 * static flag in the TCP connection tracker").  When it is non-zero, TCP
 * rules are created with SFE_CREATE_FLAG_NO_SEQ_CHECK.
 */
extern int nf_ct_tcp_no_window_check;

/*
 * Receive hook defined outside this file.  sfe_cm_init() points it at
 * sfe_cm_recv() and sfe_cm_exit() resets it to NULL.
 */
extern int (*athrs_fast_nat_recv)(struct sk_buff *skb);
116
117/*
118 * sfe_cm_incr_exceptions()
119 *	increase an exception counter.
120 */
121static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
122{
123	struct sfe_cm *sc = &__sc;
124
125	spin_lock_bh(&sc->lock);
126	sc->exceptions[except]++;
127	spin_unlock_bh(&sc->lock);
128}
129
130/*
131 * sfe_cm_recv()
132 *	Handle packet receives.
133 *
134 * Returns 1 if the packet is forwarded or 0 if it isn't.
135 */
136int sfe_cm_recv(struct sk_buff *skb)
137{
138	struct net_device *dev;
139
140	/*
141	 * We know that for the vast majority of packets we need the transport
142	 * layer header so we may as well start to fetch it now!
143	 */
144	prefetch(skb->data + 32);
145	barrier();
146
147	dev = skb->dev;
148
149	/*
150	 * We're only interested in IPv4 and IPv6 packets.
151	 */
152	if (likely(htons(ETH_P_IP) == skb->protocol)) {
153#if (SFE_HOOK_ABOVE_BRIDGE)
154		struct in_device *in_dev;
155
156		/*
157		 * Does our input device support IP processing?
158		 */
159		in_dev = (struct in_device *)dev->ip_ptr;
160		if (unlikely(!in_dev)) {
161			DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
162			return 0;
163		}
164
165		/*
166		 * Does it have an IP address?  If it doesn't then we can't do anything
167		 * interesting here!
168		 */
169		if (unlikely(!in_dev->ifa_list)) {
170			DEBUG_TRACE("no IP address for device: %s\n", dev->name);
171			return 0;
172		}
173#endif
174
175		return sfe_ipv4_recv(dev, skb);
176	}
177
178	if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
179#if (SFE_HOOK_ABOVE_BRIDGE)
180		struct inet6_dev *in_dev;
181
182		/*
183		 * Does our input device support IPv6 processing?
184		 */
185		in_dev = (struct inet6_dev *)dev->ip6_ptr;
186		if (unlikely(!in_dev)) {
187			DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
188			return 0;
189		}
190
191		/*
192		 * Does it have an IPv6 address?  If it doesn't then we can't do anything
193		 * interesting here!
194		 */
195		if (unlikely(list_empty(&in_dev->addr_list))) {
196			DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
197			return 0;
198		}
199#endif
200
201		return sfe_ipv6_recv(dev, skb);
202	}
203
204	DEBUG_TRACE("not IP packet\n");
205	return 0;
206}
207
208/*
209 * sfe_cm_find_dev_and_mac_addr()
210 *	Find the device and MAC address for a given IPv4/IPv6 address.
211 *
212 * Returns true if we find the device and MAC address, otherwise false.
213 *
214 * We look up the rtable entry for the address and, from its neighbour
215 * structure, obtain the hardware address.  This means this function also
216 * works if the neighbours are routers too.
217 */
218static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, uint8_t *mac_addr, int is_v4)
219{
220	struct neighbour *neigh;
221	struct rtable *rt;
222	struct rt6_info *rt6;
223	struct dst_entry *dst;
224	struct net_device *mac_dev;
225
226	/*
227	 * Look up the rtable entry for the IP address then get the hardware
228	 * address from its neighbour structure.  This means this work when the
229	 * neighbours are routers too.
230	 */
231	if (likely(is_v4)) {
232		rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
233		if (unlikely(IS_ERR(rt))) {
234			goto ret_fail;
235		}
236
237		dst = (struct dst_entry *)rt;
238	} else {
239		rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
240		if (!rt6) {
241			goto ret_fail;
242		}
243
244		dst = (struct dst_entry *)rt6;
245	}
246
247	rcu_read_lock();
248	neigh = dst_neigh_lookup(dst, addr);
249	if (unlikely(!neigh)) {
250		rcu_read_unlock();
251		dst_release(dst);
252		goto ret_fail;
253	}
254
255	if (unlikely(!(neigh->nud_state & NUD_VALID))) {
256		rcu_read_unlock();
257		neigh_release(neigh);
258		dst_release(dst);
259		goto ret_fail;
260	}
261
262	mac_dev = neigh->dev;
263	if (!mac_dev) {
264		rcu_read_unlock();
265		neigh_release(neigh);
266		dst_release(dst);
267		goto ret_fail;
268	}
269
270	memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
271
272	dev_hold(mac_dev);
273	*dev = mac_dev;
274	rcu_read_unlock();
275	neigh_release(neigh);
276	dst_release(dst);
277
278	return true;
279
280ret_fail:
281	if (is_v4) {
282		DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);
283
284	} else {
285		DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
286	}
287
288	return false;
289}
290
291/*
292 * sfe_cm_post_routing()
293 *	Called for packets about to leave the box - either locally generated or forwarded from another interface
294 */
295static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
296{
297	struct sfe_connection_create sic;
298	struct net_device *in;
299	struct nf_conn *ct;
300	enum ip_conntrack_info ctinfo;
301	struct net_device *dev;
302	struct net_device *src_dev;
303	struct net_device *dest_dev;
304	struct net_device *src_br_dev = NULL;
305	struct net_device *dest_br_dev = NULL;
306	struct nf_conntrack_tuple orig_tuple;
307	struct nf_conntrack_tuple reply_tuple;
308
309	/*
310	 * Don't process broadcast or multicast packets.
311	 */
312	if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
313		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
314		DEBUG_TRACE("broadcast, ignoring\n");
315		return NF_ACCEPT;
316	}
317	if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
318		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
319		DEBUG_TRACE("multicast, ignoring\n");
320		return NF_ACCEPT;
321	}
322
323#ifdef CONFIG_XFRM
324	/*
325	 * Packet to xfrm for encapsulation, we can't process it
326	 */
327	if (unlikely(skb_dst(skb)->xfrm)) {
328		DEBUG_TRACE("packet to xfrm, ignoring\n");
329		return NF_ACCEPT;
330	}
331#endif
332
333	/*
334	 * Don't process locally generated packets.
335	 */
336	if (skb->sk) {
337		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
338		DEBUG_TRACE("skip local out packet\n");
339		return NF_ACCEPT;
340	}
341
342	/*
343	 * Don't process packets that are not being forwarded.
344	 */
345	in = dev_get_by_index(&init_net, skb->skb_iif);
346	if (!in) {
347		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
348		DEBUG_TRACE("packet not forwarding\n");
349		return NF_ACCEPT;
350	}
351
352	dev_put(in);
353
354	/*
355	 * Don't process packets that aren't being tracked by conntrack.
356	 */
357	ct = nf_ct_get(skb, &ctinfo);
358	if (unlikely(!ct)) {
359		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
360		DEBUG_TRACE("no conntrack connection, ignoring\n");
361		return NF_ACCEPT;
362	}
363
364	/*
365	 * Don't process untracked connections.
366	 */
367	if (unlikely(ct == &nf_conntrack_untracked)) {
368		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
369		DEBUG_TRACE("untracked connection\n");
370		return NF_ACCEPT;
371	}
372
373	/*
374	 * Unconfirmed connection may be dropped by Linux at the final step,
375	 * So we don't process unconfirmed connections.
376	 */
377	if (!nf_ct_is_confirmed(ct)) {
378		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
379		DEBUG_TRACE("unconfirmed connection\n");
380		return NF_ACCEPT;
381	}
382
383	/*
384	 * Don't process connections that require support from a 'helper' (typically a NAT ALG).
385	 */
386	if (unlikely(nfct_help(ct))) {
387		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
388		DEBUG_TRACE("connection has helper\n");
389		return NF_ACCEPT;
390	}
391
392	/*
393	 * Look up the details of our connection in conntrack.
394	 *
395	 * Note that the data we get from conntrack is for the "ORIGINAL" direction
396	 * but our packet may actually be in the "REPLY" direction.
397	 */
398	orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
399	reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
400	sic.protocol = (int32_t)orig_tuple.dst.protonum;
401
402	sic.flags = 0;
403
404	/*
405	 * Get addressing information, non-NAT first
406	 */
407	if (likely(is_v4)) {
408		uint32_t dscp;
409
410		sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
411		sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
412
413		if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
414			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
415			DEBUG_TRACE("multicast address\n");
416			return NF_ACCEPT;
417		}
418
419		/*
420		 * NAT'ed addresses - note these are as seen from the 'reply' direction
421		 * When NAT does not apply to this connection these will be identical to the above.
422		 */
423		sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
424		sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;
425
426		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
427		if (dscp) {
428			sic.src_dscp = sic.dest_dscp = dscp;
429			sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
430		}
431	} else {
432		uint32_t dscp;
433
434		sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
435		sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
436
437		if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
438		    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
439			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
440			DEBUG_TRACE("multicast address\n");
441			return NF_ACCEPT;
442		}
443
444		/*
445		 * NAT'ed addresses - note these are as seen from the 'reply' direction
446		 * When NAT does not apply to this connection these will be identical to the above.
447		 */
448		sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
449		sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);
450
451		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
452		if (dscp) {
453			sic.src_dscp = sic.dest_dscp = dscp;
454			sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
455		}
456	}
457
458	switch (sic.protocol) {
459	case IPPROTO_TCP:
460		sic.src_port = orig_tuple.src.u.tcp.port;
461		sic.dest_port = orig_tuple.dst.u.tcp.port;
462		sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
463		sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
464		sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
465		sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
466		sic.src_td_end = ct->proto.tcp.seen[0].td_end;
467		sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
468		sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
469		sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
470		sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
471		sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
472		if (nf_ct_tcp_no_window_check
473		    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
474		    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
475			sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
476		}
477
478		/*
479		 * Don't try to manage a non-established connection.
480		 */
481		if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
482			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
483			DEBUG_TRACE("non-established connection\n");
484			return NF_ACCEPT;
485		}
486
487		/*
488		 * If the connection is shutting down do not manage it.
489		 * state can not be SYN_SENT, SYN_RECV because connection is assured
490		 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
491		 */
492		spin_lock_bh(&ct->lock);
493		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
494			spin_unlock_bh(&ct->lock);
495			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
496			DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
497				    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
498				    &sic.dest_ip, ntohs(sic.dest_port));
499			return NF_ACCEPT;
500		}
501		spin_unlock_bh(&ct->lock);
502		break;
503
504	case IPPROTO_UDP:
505		sic.src_port = orig_tuple.src.u.udp.port;
506		sic.dest_port = orig_tuple.dst.u.udp.port;
507		sic.src_port_xlate = reply_tuple.dst.u.udp.port;
508		sic.dest_port_xlate = reply_tuple.src.u.udp.port;
509		break;
510
511	default:
512		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
513		DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
514		return NF_ACCEPT;
515	}
516
517#ifdef CONFIG_XFRM
518	sic.original_accel = 1;
519	sic.reply_accel = 1;
520
521	/*
522	 * For packets de-capsulated from xfrm, we still can accelerate it
523	 * on the direction we just received the packet.
524	 */
525	if (unlikely(skb->sp)) {
526		if (sic.protocol == IPPROTO_TCP &&
527			!(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
528			return NF_ACCEPT;
529		}
530
531		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
532			sic.reply_accel = 0;
533		} else {
534			sic.original_accel = 0;
535		}
536	}
537#endif
538
539	/*
540	 * Get QoS information
541	 */
542	if (skb->priority) {
543		sic.src_priority = sic.dest_priority = skb->priority;
544		sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
545	}
546
547	/*
548	 * Get the net device and MAC addresses that correspond to the various source and
549	 * destination host addresses.
550	 */
551	if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) {
552		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
553		return NF_ACCEPT;
554	}
555
556	if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
557		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
558		goto done1;
559	}
560
561	dev_put(dev);
562
563	if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
564		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
565		goto done1;
566	}
567
568	dev_put(dev);
569
570	if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) {
571		sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
572		goto done1;
573	}
574
575#if (!SFE_HOOK_ABOVE_BRIDGE)
576	/*
577	 * Now our devices may actually be a bridge interface.  If that's
578	 * the case then we need to hunt down the underlying interface.
579	 */
580	if (src_dev->priv_flags & IFF_EBRIDGE) {
581		src_br_dev = br_port_dev_get(src_dev, sic.src_mac);
582		if (!src_br_dev) {
583			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
584			DEBUG_TRACE("no port found on bridge\n");
585			goto done2;
586		}
587
588		src_dev = src_br_dev;
589	}
590
591	if (dest_dev->priv_flags & IFF_EBRIDGE) {
592		dest_br_dev = br_port_dev_get(dest_dev, sic.dest_mac_xlate);
593		if (!dest_br_dev) {
594			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
595			DEBUG_TRACE("no port found on bridge\n");
596			goto done3;
597		}
598
599		dest_dev = dest_br_dev;
600	}
601#else
602	/*
603	 * Our devices may actually be part of a bridge interface.  If that's
604	 * the case then find the bridge interface instead.
605	 */
606	if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
607		src_br_dev = sfe_dev_get_master(src_dev);
608		if (!src_br_dev) {
609			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
610			DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
611			goto done2;
612		}
613
614		src_dev = src_br_dev;
615	}
616
617	if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
618		dest_br_dev = sfe_dev_get_master(dest_dev);
619		if (!dest_br_dev) {
620			sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
621			DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
622			goto done3;
623		}
624
625		dest_dev = dest_br_dev;
626	}
627#endif
628
629	sic.src_dev = src_dev;
630	sic.dest_dev = dest_dev;
631
632	sic.src_mtu = src_dev->mtu;
633	sic.dest_mtu = dest_dev->mtu;
634
635	if (likely(is_v4)) {
636		sfe_ipv4_create_rule(&sic);
637	} else {
638		sfe_ipv6_create_rule(&sic);
639	}
640
641	/*
642	 * If we had bridge ports then release them too.
643	 */
644	if (dest_br_dev) {
645		dev_put(dest_br_dev);
646	}
647
648done3:
649	if (src_br_dev) {
650		dev_put(src_br_dev);
651	}
652
653done2:
654	dev_put(dest_dev);
655
656done1:
657	dev_put(src_dev);
658
659	return NF_ACCEPT;
660}
661
662/*
663 * sfe_cm_ipv4_post_routing_hook()
664 *	Called for packets about to leave the box - either locally generated or forwarded from another interface
665 */
666sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
667{
668	return sfe_cm_post_routing(skb, true);
669}
670
671/*
672 * sfe_cm_ipv6_post_routing_hook()
673 *	Called for packets about to leave the box - either locally generated or forwarded from another interface
674 */
675sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
676{
677	return sfe_cm_post_routing(skb, false);
678}
679
680
#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * sfe_cm_conntrack_event()
 *	Conntrack event callback.  On a destroy event for a TCP or UDP
 *	connection, asks the IPv4 or IPv6 engine to destroy the matching rule.
 *
 * The prototype depends on CONFIG_NF_CONNTRACK_CHAIN_EVENTS: with it the
 * callback is a notifier-chain handler and the event item arrives in ptr,
 * without it the item is passed directly.
 *
 * Always returns NOTIFY_DONE.
 */
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
static int sfe_cm_conntrack_event(struct notifier_block *this,
			unsigned long events, void *ptr)
#else
static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item)
#endif
{
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
	struct nf_ct_event *item = ptr;
#endif
	struct nf_conn *ct = item->ct;
	const struct nf_conntrack_tuple *orig;
	struct sfe_connection_destroy sid;

	/*
	 * Nothing to do without a conntrack entry.
	 */
	if (unlikely(!ct)) {
		DEBUG_WARN("no ct in conntrack event callback\n");
		return NOTIFY_DONE;
	}

	/*
	 * The untracked entry never has any state on our side.
	 */
	if (unlikely(ct == &nf_conntrack_untracked)) {
		DEBUG_TRACE("ignoring untracked conn\n");
		return NOTIFY_DONE;
	}

	/*
	 * Only destroy events matter here.
	 */
	if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
		DEBUG_TRACE("ignoring non-destroy event\n");
		return NOTIFY_DONE;
	}

	/*
	 * Rules are identified by the nominal (original direction) tuple;
	 * NAT information is deliberately ignored.
	 */
	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	sid.protocol = (int32_t)orig->dst.protonum;

	if (sid.protocol == IPPROTO_TCP) {
		sid.src_port = orig->src.u.tcp.port;
		sid.dest_port = orig->dst.u.tcp.port;
	} else if (sid.protocol == IPPROTO_UDP) {
		sid.src_port = orig->src.u.udp.port;
		sid.dest_port = orig->dst.u.udp.port;
	} else {
		DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
		return NOTIFY_DONE;
	}

	switch (nf_ct_l3num(ct)) {
	case AF_INET:
		sid.src_ip.ip = (__be32)orig->src.u3.ip;
		sid.dest_ip.ip = (__be32)orig->dst.u3.ip;

		sfe_ipv4_destroy_rule(&sid);
		break;

	case AF_INET6:
		sid.src_ip.ip6[0] = *((const struct sfe_ipv6_addr *)&orig->src.u3.in6);
		sid.dest_ip.ip6[0] = *((const struct sfe_ipv6_addr *)&orig->dst.u3.in6);

		sfe_ipv6_destroy_rule(&sid);
		break;

	default:
		DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
		break;
	}

	return NOTIFY_DONE;
}

/*
 * Registration record for the conntrack event callback above.  Its type
 * follows the same CONFIG_NF_CONNTRACK_CHAIN_EVENTS split as the callback.
 */
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
static struct notifier_block sfe_cm_conntrack_notifier = {
	.notifier_call = sfe_cm_conntrack_event,
};
#else
static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = {
	.fcn = sfe_cm_conntrack_event,
};
#endif
#endif
777
778/*
779 * Structure to establish a hook into the post routing netfilter point - this
780 * will pick up local outbound and packets going from one interface to another.
781 *
782 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
783 * We want to examine packets after NAT translation and any ALG processing.
784 */
785static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
786	{
787		.hook = __sfe_cm_ipv4_post_routing_hook,
788		.owner = THIS_MODULE,
789		.pf = NFPROTO_IPV4,
790		.hooknum = NF_INET_POST_ROUTING,
791		.priority = NF_IP_PRI_NAT_SRC + 1,
792	},
793#ifdef SFE_SUPPORT_IPV6
794	{
795		.hook = __sfe_cm_ipv6_post_routing_hook,
796		.owner = THIS_MODULE,
797		.pf = NFPROTO_IPV6,
798		.hooknum = NF_INET_POST_ROUTING,
799		.priority = NF_IP6_PRI_NAT_SRC + 1,
800	},
801#endif
802};
803
804/*
805 * sfe_cm_sync_rule()
806 *	Synchronize a connection's state.
807 */
808static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
809{
810	struct nf_conntrack_tuple_hash *h;
811	struct nf_conntrack_tuple tuple;
812	struct nf_conn *ct;
813	SFE_NF_CONN_ACCT(acct);
814
815	/*
816	 * Create a tuple so as to be able to look up a connection
817	 */
818	memset(&tuple, 0, sizeof(tuple));
819	tuple.src.u.all = (__be16)sis->src_port;
820	tuple.dst.dir = IP_CT_DIR_ORIGINAL;
821	tuple.dst.protonum = (uint8_t)sis->protocol;
822	tuple.dst.u.all = (__be16)sis->dest_port;
823
824	if (sis->is_v6) {
825		tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
826		tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
827		tuple.src.l3num = AF_INET6;
828
829		DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
830			    (int)tuple.dst.protonum,
831			    &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
832			    &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
833	} else {
834		tuple.src.u3.ip = sis->src_ip.ip;
835		tuple.dst.u3.ip = sis->dest_ip.ip;
836		tuple.src.l3num = AF_INET;
837
838		DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
839			    (int)tuple.dst.protonum,
840			    &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
841			    &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
842	}
843
844	/*
845	 * Look up conntrack connection
846	 */
847	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
848	if (unlikely(!h)) {
849		DEBUG_TRACE("no connection found\n");
850		return;
851	}
852
853	ct = nf_ct_tuplehash_to_ctrack(h);
854	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
855
856	/*
857	 * Only update if this is not a fixed timeout
858	 */
859	if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
860		spin_lock_bh(&ct->lock);
861		ct->timeout.expires += sis->delta_jiffies;
862		spin_unlock_bh(&ct->lock);
863	}
864
865	acct = nf_conn_acct_find(ct);
866	if (acct) {
867		spin_lock_bh(&ct->lock);
868		atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets, sis->src_packet_count);
869		atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes, sis->src_byte_count);
870		atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets, sis->dest_packet_count);
871		atomic64_set(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes, sis->dest_byte_count);
872		spin_unlock_bh(&ct->lock);
873	}
874
875	switch (sis->protocol) {
876	case IPPROTO_TCP:
877		spin_lock_bh(&ct->lock);
878		if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
879			ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
880		}
881		if ((int32_t)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
882			ct->proto.tcp.seen[0].td_end = sis->src_td_end;
883		}
884		if ((int32_t)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
885			ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
886		}
887		if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
888			ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
889		}
890		if ((int32_t)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
891			ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
892		}
893		if ((int32_t)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
894			ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
895		}
896		spin_unlock_bh(&ct->lock);
897		break;
898	}
899
900	/*
901	 * Release connection
902	 */
903	nf_ct_put(ct);
904}
905
906/*
907 * sfe_cm_device_event()
908 */
909int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
910{
911	struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
912
913	switch (event) {
914	case NETDEV_DOWN:
915		if (dev) {
916			sfe_ipv4_destroy_all_rules_for_dev(dev);
917			sfe_ipv6_destroy_all_rules_for_dev(dev);
918		}
919		break;
920	}
921
922	return NOTIFY_DONE;
923}
924
925/*
926 * sfe_cm_inet_event()
927 */
928static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
929{
930	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
931	return sfe_cm_propagate_event(this, event, dev);
932}
933
934/*
935 * sfe_cm_inet6_event()
936 */
937static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
938{
939	struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;
940	return sfe_cm_propagate_event(this, event, dev);
941}
942
943/*
944 * sfe_cm_get_exceptions
945 * 	dump exception counters
946 */
947static ssize_t sfe_cm_get_exceptions(struct device *dev,
948				     struct device_attribute *attr,
949				     char *buf)
950{
951	int idx, len;
952	struct sfe_cm *sc = &__sc;
953
954	spin_lock_bh(&sc->lock);
955	for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
956		if (sc->exceptions[idx]) {
957			len += sprintf(buf + len, "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
958		}
959	}
960	spin_unlock_bh(&sc->lock);
961
962	return len;
963}
964
965/*
966 * sysfs attributes.
967 */
968static const struct device_attribute sfe_cm_exceptions_attr =
969	__ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL);
970
971/*
972 * sfe_cm_init()
973 */
974static int __init sfe_cm_init(void)
975{
976	struct sfe_cm *sc = &__sc;
977	int result = -1;
978
979	DEBUG_INFO("SFE CM init\n");
980
981	/*
982	 * Create sys/sfe_cm
983	 */
984	sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
985	if (!sc->sys_sfe_cm) {
986		DEBUG_ERROR("failed to register sfe_cm\n");
987		goto exit1;
988	}
989
990	/*
991	 * Create sys/sfe_cm/exceptions
992	 */
993	result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr);
994	if (result) {
995		DEBUG_ERROR("failed to register exceptions file: %d\n", result);
996		goto exit2;
997	}
998
999	sc->dev_notifier.notifier_call = sfe_cm_device_event;
1000	sc->dev_notifier.priority = 1;
1001	register_netdevice_notifier(&sc->dev_notifier);
1002
1003	sc->inet_notifier.notifier_call = sfe_cm_inet_event;
1004	sc->inet_notifier.priority = 1;
1005	register_inetaddr_notifier(&sc->inet_notifier);
1006
1007	sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
1008	sc->inet6_notifier.priority = 1;
1009	register_inet6addr_notifier(&sc->inet6_notifier);
1010	/*
1011	 * Register our netfilter hooks.
1012	 */
1013	result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1014	if (result < 0) {
1015		DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
1016		goto exit3;
1017	}
1018
1019#ifdef CONFIG_NF_CONNTRACK_EVENTS
1020	/*
1021	 * Register a notifier hook to get fast notifications of expired connections.
1022	 */
1023	result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
1024	if (result < 0) {
1025		DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
1026		goto exit4;
1027	}
1028#endif
1029
1030	spin_lock_init(&sc->lock);
1031
1032	/*
1033	 * Hook the receive path in the network stack.
1034	 */
1035	BUG_ON(athrs_fast_nat_recv != NULL);
1036	RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv);
1037
1038	/*
1039	 * Hook the shortcut sync callback.
1040	 */
1041	sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
1042	sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
1043	return 0;
1044
1045#ifdef CONFIG_NF_CONNTRACK_EVENTS
1046exit4:
1047#endif
1048	nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1049
1050exit3:
1051	unregister_inet6addr_notifier(&sc->inet6_notifier);
1052	unregister_inetaddr_notifier(&sc->inet_notifier);
1053	unregister_netdevice_notifier(&sc->dev_notifier);
1054exit2:
1055	kobject_put(sc->sys_sfe_cm);
1056
1057exit1:
1058	return result;
1059}
1060
1061/*
1062 * sfe_cm_exit()
1063 */
1064static void __exit sfe_cm_exit(void)
1065{
1066	struct sfe_cm *sc = &__sc;
1067
1068	DEBUG_INFO("SFE CM exit\n");
1069
1070	/*
1071	 * Unregister our sync callback.
1072	 */
1073	sfe_ipv4_register_sync_rule_callback(NULL);
1074	sfe_ipv6_register_sync_rule_callback(NULL);
1075
1076	/*
1077	 * Unregister our receive callback.
1078	 */
1079	RCU_INIT_POINTER(athrs_fast_nat_recv, NULL);
1080
1081	/*
1082	 * Wait for all callbacks to complete.
1083	 */
1084	rcu_barrier();
1085
1086	/*
1087	 * Destroy all connections.
1088	 */
1089	sfe_ipv4_destroy_all_rules_for_dev(NULL);
1090	sfe_ipv6_destroy_all_rules_for_dev(NULL);
1091
1092#ifdef CONFIG_NF_CONNTRACK_EVENTS
1093	nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
1094
1095#endif
1096	nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1097
1098	unregister_inet6addr_notifier(&sc->inet6_notifier);
1099	unregister_inetaddr_notifier(&sc->inet_notifier);
1100	unregister_netdevice_notifier(&sc->dev_notifier);
1101
1102	kobject_put(sc->sys_sfe_cm);
1103}
1104
1105module_init(sfe_cm_init)
1106module_exit(sfe_cm_exit)
1107
1108MODULE_AUTHOR("Qualcomm Atheros Inc.");
1109MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
1110MODULE_LICENSE("Dual BSD/GPL");
1111
1112