1262743Sglebius/*-
2262743Sglebius * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
3262743Sglebius * Copyright (c) 2008-2010, BitGravity Inc.
4262743Sglebius * All rights reserved.
5262743Sglebius *
6262743Sglebius * Redistribution and use in source and binary forms, with or without
7262743Sglebius * modification, are permitted provided that the following conditions are met:
8262743Sglebius *
9262743Sglebius *  1. Redistributions of source code must retain the above copyright notice,
10262743Sglebius *     this list of conditions and the following disclaimer.
11262743Sglebius *
12262743Sglebius *  2. Neither the name of the BitGravity Corporation nor the names of its
13262743Sglebius *     contributors may be used to endorse or promote products derived from
14262743Sglebius *     this software without specific prior written permission.
15262743Sglebius *
16262743Sglebius * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17262743Sglebius * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18262743Sglebius * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19262743Sglebius * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20262743Sglebius * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21262743Sglebius * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22262743Sglebius * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23262743Sglebius * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24262743Sglebius * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25262743Sglebius * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26262743Sglebius * POSSIBILITY OF SUCH DAMAGE.
27262743Sglebius */
28191255Skmacy
29191255Skmacy#include "opt_route.h"
30191255Skmacy#include "opt_mpath.h"
31196368Skmacy#include "opt_ddb.h"
32205066Skmacy#include "opt_inet.h"
33205066Skmacy#include "opt_inet6.h"
34191255Skmacy
35191255Skmacy#include <sys/cdefs.h>
36191255Skmacy__FBSDID("$FreeBSD: releng/10.3/sys/net/flowtable.c 281955 2015-04-24 23:26:44Z hiren $");
37191255Skmacy
38262743Sglebius#include <sys/param.h>
39191255Skmacy#include <sys/types.h>
40191255Skmacy#include <sys/bitstring.h>
41196368Skmacy#include <sys/condvar.h>
42191255Skmacy#include <sys/callout.h>
43240086Sglebius#include <sys/hash.h>
44262743Sglebius#include <sys/kernel.h>
45191255Skmacy#include <sys/kthread.h>
46191255Skmacy#include <sys/limits.h>
47191255Skmacy#include <sys/malloc.h>
48191255Skmacy#include <sys/mbuf.h>
49262743Sglebius#include <sys/pcpu.h>
50191255Skmacy#include <sys/proc.h>
51262743Sglebius#include <sys/queue.h>
52205066Skmacy#include <sys/sbuf.h>
53191255Skmacy#include <sys/sched.h>
54191255Skmacy#include <sys/smp.h>
55191255Skmacy#include <sys/socket.h>
56191255Skmacy#include <sys/syslog.h>
57191255Skmacy#include <sys/sysctl.h>
58262743Sglebius#include <vm/uma.h>
59191255Skmacy
60191255Skmacy#include <net/if.h>
61191255Skmacy#include <net/if_llatbl.h>
62191255Skmacy#include <net/if_var.h>
63262743Sglebius#include <net/route.h>
64191255Skmacy#include <net/flowtable.h>
65195837Srwatson#include <net/vnet.h>
66191255Skmacy
67191255Skmacy#include <netinet/in.h>
68191255Skmacy#include <netinet/in_systm.h>
69191255Skmacy#include <netinet/in_var.h>
70191255Skmacy#include <netinet/if_ether.h>
71191255Skmacy#include <netinet/ip.h>
72205066Skmacy#ifdef INET6
73205066Skmacy#include <netinet/ip6.h>
74205066Skmacy#endif
75262743Sglebius#ifdef FLOWTABLE_HASH_ALL
76191255Skmacy#include <netinet/tcp.h>
77191255Skmacy#include <netinet/udp.h>
78191255Skmacy#include <netinet/sctp.h>
79262743Sglebius#endif
80191255Skmacy
81196368Skmacy#include <ddb/ddb.h>
82191255Skmacy
/*
 * Flow key geometry.  With FLOWTABLE_HASH_ALL the key holds two addresses
 * followed by a pair of 16-bit ports; otherwise it holds one address.
 */
#ifdef	FLOWTABLE_HASH_ALL
#define	KEY_PORTS	(sizeof(uint16_t) * 2)
#define	KEY_ADDRS	2
#else
#define	KEY_PORTS	0
#define	KEY_ADDRS	1
#endif

/* Each address slot is sized for the largest address family compiled in. */
#ifdef	INET6
#define	KEY_ADDR_LEN	sizeof(struct in6_addr)
#else
#define	KEY_ADDR_LEN	sizeof(struct in_addr)
#endif

/* Key length expressed in 32-bit words. */
#define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
98191255Skmacy
/*
 * One cached flow: the lookup key together with the rtentry and llentry
 * that are handed back to the caller on a hit.  Entries are chained per
 * hash bucket through f_next.
 */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define FL_STALE 		1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
/* KEYLEN exists only to size f_key above. */
#undef KEYLEN
114191255Skmacy
/* Head of one hash bucket's singly-linked chain of flow entries. */
SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));
118191255Skmacy
/*
 * A flow table: ft_size hash buckets, each with a per-CPU list head, plus
 * per-CPU bitmasks recording which buckets are non-empty.
 */
struct flowtable {
	/* Counters, indexed by field offset within struct flowtable_stat. */
	counter_u64_t	*ft_stat;
	/* Number of hash buckets; hashes are reduced modulo this value. */
	int 		ft_size;
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t 	**ft_masks;
	/* Scratch bitset that flowtable_free_stale() copies a mask into. */
	bitstr_t	*ft_tmpmask;
};
132191255Skmacy
/*
 * Add v to the counter in ft->ft_stat that corresponds to field "name" of
 * struct flowtable_stat (the field's byte offset selects the array slot).
 */
#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

static struct proc *flowcleanerproc;
/* Seed passed to jenkins_hash32() when hashing flow keys. */
static uint32_t flow_hashjitter;

static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
144196368Skmacy
145191255Skmacy/*
146191255Skmacy * TODO:
147262743Sglebius * - add sysctls to resize && flush flow tables
148191255Skmacy * - Add per flowtable sysctls for statistics and configuring timeouts
149191255Skmacy * - add saturation counter to rtentry to support per-packet load-balancing
150191255Skmacy *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
152191255Skmacy * - add sysctl / device node / syscall to support exporting and importing
153191255Skmacy *   of flows with flag to indicate that a flow was imported so should
154191255Skmacy *   not be considered for auto-cleaning
155191255Skmacy * - support explicit connection state (currently only ad-hoc for DSR)
156194660Szec * - idetach() cleanup for options VIMAGE builds.
157191255Skmacy */
/* Per-vnet flow tables, one for each address family compiled in. */
#ifdef INET
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

/* UMA zone that struct flentry objects are allocated from. */
static uma_zone_t flow_zone;

/* Per-vnet switch; flowtable_lookup() returns ENXIO while this is 0. */
static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable		VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

/* Forward declaration: the per-family lookup routines call this. */
static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
183191255Skmacy
184205066Skmacy#ifdef INET
185262743Sglebiusstatic struct flentry *
186262743Sglebiusflowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
187205066Skmacy{
188262743Sglebius	struct flentry *fle;
189262743Sglebius	struct sockaddr_in *sin;
190205066Skmacy	struct ip *ip;
191262743Sglebius	uint32_t fibnum;
192262743Sglebius#ifdef FLOWTABLE_HASH_ALL
193262743Sglebius	uint32_t key[3];
194191255Skmacy	int iphlen;
195205066Skmacy	uint16_t sport, dport;
196262743Sglebius	uint8_t proto;
197262743Sglebius#endif
198191255Skmacy
199205066Skmacy	ip = mtod(m, struct ip *);
200191255Skmacy
201262743Sglebius	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
202262743Sglebius	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
203262743Sglebius	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
204262743Sglebius		return (NULL);
205262743Sglebius
206262743Sglebius	fibnum = M_GETFIB(m);
207262743Sglebius
208262743Sglebius#ifdef FLOWTABLE_HASH_ALL
209262743Sglebius	iphlen = ip->ip_hl << 2;
210205066Skmacy	proto = ip->ip_p;
211191255Skmacy
212262743Sglebius	switch (proto) {
213262743Sglebius	case IPPROTO_TCP: {
214262743Sglebius		struct tcphdr *th;
215205066Skmacy
216262743Sglebius		th = (struct tcphdr *)((char *)ip + iphlen);
217205066Skmacy		sport = th->th_sport;
218205066Skmacy		dport = th->th_dport;
219262743Sglebius		if (th->th_flags & (TH_RST|TH_FIN))
220262743Sglebius			fibnum |= (FL_STALE << 24);
221262743Sglebius		break;
222262743Sglebius	}
223262743Sglebius	case IPPROTO_UDP: {
224262743Sglebius		struct udphdr *uh;
225262743Sglebius
226262743Sglebius		uh = (struct udphdr *)((char *)ip + iphlen);
227191255Skmacy		sport = uh->uh_sport;
228191255Skmacy		dport = uh->uh_dport;
229262743Sglebius		break;
230262743Sglebius	}
231262743Sglebius	case IPPROTO_SCTP: {
232262743Sglebius		struct sctphdr *sh;
233262743Sglebius
234262743Sglebius		sh = (struct sctphdr *)((char *)ip + iphlen);
235191255Skmacy		sport = sh->src_port;
236191255Skmacy		dport = sh->dest_port;
237262743Sglebius		/* XXXGL: handle stale? */
238262743Sglebius		break;
239262743Sglebius	}
240191255Skmacy	default:
241262743Sglebius		sport = dport = 0;
242201758Smbr		break;
243191255Skmacy	}
244191255Skmacy
245262743Sglebius	key[0] = ip->ip_dst.s_addr;
246262743Sglebius	key[1] = ip->ip_src.s_addr;
247262743Sglebius	key[2] = (dport << 16) | sport;
248262743Sglebius	fibnum |= proto << 16;
249191255Skmacy
250262743Sglebius	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
251262743Sglebius	    fibnum);
252191255Skmacy
253262743Sglebius#else	/* !FLOWTABLE_HASH_ALL */
254191255Skmacy
255262743Sglebius	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
256262743Sglebius	    sizeof(struct in_addr), fibnum);
257205066Skmacy
258262743Sglebius#endif	/* FLOWTABLE_HASH_ALL */
259205066Skmacy
260262743Sglebius	if (fle == NULL)
261205066Skmacy		return (NULL);
262205066Skmacy
263205066Skmacy	sin = (struct sockaddr_in *)&ro->ro_dst;
264205066Skmacy	sin->sin_family = AF_INET;
265205066Skmacy	sin->sin_len = sizeof(*sin);
266262743Sglebius	sin->sin_addr = ip->ip_dst;
267262743Sglebius
268262743Sglebius	return (fle);
269205066Skmacy}
270205066Skmacy#endif /* INET */
271205066Skmacy
272205066Skmacy#ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that the first mbuf of "m" (taken from the
 * enclosing scope) holds at least len + sizeof(T) bytes.  If it does not,
 * the ENCLOSING FUNCTION returns NULL; no data is moved or pulled up.
 * Otherwise p is set to point at offset "len" in the mbuf data.  WARNING:
 * the pointer is only valid while the mbuf data is left untouched.
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Typed views of an upper-layer header pointer. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
290205066Skmacy
291262743Sglebiusstatic struct flentry *
292262743Sglebiusflowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
293205066Skmacy{
294262743Sglebius	struct flentry *fle;
295262743Sglebius	struct sockaddr_in6 *sin6;
296205066Skmacy	struct ip6_hdr *ip6;
297262743Sglebius	uint32_t fibnum;
298262743Sglebius#ifdef FLOWTABLE_HASH_ALL
299262743Sglebius	uint32_t key[9];
300262743Sglebius	void *ulp;
301205066Skmacy	int hlen;
302262743Sglebius	uint16_t sport, dport;
303205066Skmacy	u_short offset;
304262743Sglebius	uint8_t proto;
305262743Sglebius#else
306262743Sglebius	uint32_t key[4];
307262743Sglebius#endif
308205066Skmacy
309205066Skmacy	ip6 = mtod(m, struct ip6_hdr *);
310262743Sglebius	if (in6_localaddr(&ip6->ip6_dst))
311262743Sglebius		return (NULL);
312262743Sglebius
313262743Sglebius	fibnum = M_GETFIB(m);
314262743Sglebius
315262743Sglebius#ifdef	FLOWTABLE_HASH_ALL
316205066Skmacy	hlen = sizeof(struct ip6_hdr);
317205066Skmacy	proto = ip6->ip6_nxt;
318262743Sglebius	offset = sport = dport = 0;
319262743Sglebius	ulp = NULL;
320205066Skmacy	while (ulp == NULL) {
321205066Skmacy		switch (proto) {
322205066Skmacy		case IPPROTO_ICMPV6:
323205066Skmacy		case IPPROTO_OSPFIGP:
324205066Skmacy		case IPPROTO_PIM:
325205066Skmacy		case IPPROTO_CARP:
326205066Skmacy		case IPPROTO_ESP:
327205066Skmacy		case IPPROTO_NONE:
328205066Skmacy			ulp = ip6;
329205066Skmacy			break;
330205066Skmacy		case IPPROTO_TCP:
331205066Skmacy			PULLUP_TO(hlen, ulp, struct tcphdr);
332262743Sglebius			dport = TCP(ulp)->th_dport;
333262743Sglebius			sport = TCP(ulp)->th_sport;
334262743Sglebius			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
335262743Sglebius				fibnum |= (FL_STALE << 24);
336205066Skmacy			break;
337205066Skmacy		case IPPROTO_SCTP:
338205066Skmacy			PULLUP_TO(hlen, ulp, struct sctphdr);
339262743Sglebius			dport = SCTP(ulp)->src_port;
340262743Sglebius			sport = SCTP(ulp)->dest_port;
341262743Sglebius			/* XXXGL: handle stale? */
342205066Skmacy			break;
343205066Skmacy		case IPPROTO_UDP:
344205066Skmacy			PULLUP_TO(hlen, ulp, struct udphdr);
345262743Sglebius			dport = UDP(ulp)->uh_dport;
346262743Sglebius			sport = UDP(ulp)->uh_sport;
347205066Skmacy			break;
348205066Skmacy		case IPPROTO_HOPOPTS:	/* RFC 2460 */
349205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_hbh);
350205066Skmacy			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
351205066Skmacy			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
352205066Skmacy			ulp = NULL;
353205066Skmacy			break;
354205066Skmacy		case IPPROTO_ROUTING:	/* RFC 2460 */
355262743Sglebius			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
356205066Skmacy			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
357205066Skmacy			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
358205066Skmacy			ulp = NULL;
359205066Skmacy			break;
360205066Skmacy		case IPPROTO_FRAGMENT:	/* RFC 2460 */
361205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_frag);
362205066Skmacy			hlen += sizeof (struct ip6_frag);
363205066Skmacy			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
364205066Skmacy			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
365205066Skmacy			    IP6F_OFF_MASK;
366205066Skmacy			ulp = NULL;
367205066Skmacy			break;
368205066Skmacy		case IPPROTO_DSTOPTS:	/* RFC 2460 */
369205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_hbh);
370205066Skmacy			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
371205066Skmacy			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
372205066Skmacy			ulp = NULL;
373205066Skmacy			break;
374205066Skmacy		case IPPROTO_AH:	/* RFC 2402 */
375205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_ext);
376205066Skmacy			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
377205066Skmacy			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
378205066Skmacy			ulp = NULL;
379205066Skmacy			break;
380205066Skmacy		default:
381205066Skmacy			PULLUP_TO(hlen, ulp, struct ip6_ext);
382205066Skmacy			break;
383205066Skmacy		}
384205066Skmacy	}
385205066Skmacy
386262743Sglebius	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
387262743Sglebius	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
388262743Sglebius	key[8] = (dport << 16) | sport;
389262743Sglebius	fibnum |= proto << 16;
390205066Skmacy
391262743Sglebius	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
392262743Sglebius	    fibnum);
393262743Sglebius#else	/* !FLOWTABLE_HASH_ALL */
394262743Sglebius	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
395262743Sglebius	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
396262743Sglebius	    fibnum);
397262743Sglebius#endif	/* FLOWTABLE_HASH_ALL */
398205066Skmacy
399262743Sglebius	if (fle == NULL)
400205066Skmacy		return (NULL);
401205066Skmacy
402205066Skmacy	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
403205066Skmacy	sin6->sin6_family = AF_INET6;
404205066Skmacy	sin6->sin6_len = sizeof(*sin6);
405262743Sglebius	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
406262743Sglebius
407262743Sglebius	return (fle);
408205066Skmacy}
409205066Skmacy#endif /* INET6 */
410205066Skmacy
411191255Skmacystatic bitstr_t *
412191255Skmacyflowtable_mask(struct flowtable *ft)
413191255Skmacy{
414196368Skmacy
415262743Sglebius	/*
416262743Sglebius	 * flowtable_free_stale() calls w/o critical section, but
417262743Sglebius	 * with sched_bind(). Since pointer is stable throughout
418262743Sglebius	 * ft lifetime, it is safe, otherwise...
419262743Sglebius	 *
420262743Sglebius	 * CRITICAL_ASSERT(curthread);
421262743Sglebius	 */
422191255Skmacy
423262743Sglebius	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
424191255Skmacy}
425191255Skmacy
426262743Sglebiusstatic struct flist *
427262743Sglebiusflowtable_list(struct flowtable *ft, uint32_t hash)
428191255Skmacy{
429191255Skmacy
430262743Sglebius	CRITICAL_ASSERT(curthread);
431262743Sglebius	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
432191255Skmacy}
433191255Skmacy
434191255Skmacystatic int
435262743Sglebiusflow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
436191255Skmacy{
437191255Skmacy
438262743Sglebius	if (((fle->f_rt->rt_flags & RTF_HOST) &&
439262743Sglebius	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
440262743Sglebius	    (fle->f_rt->rt_ifp == NULL) ||
441262743Sglebius	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
442262743Sglebius	    (fle->f_lle->la_flags & LLE_VALID) == 0)
443191255Skmacy		return (1);
444191255Skmacy
445262743Sglebius	if (time_uptime - fle->f_uptime > maxidle)
446262743Sglebius		return (1);
447191255Skmacy
448262743Sglebius#ifdef FLOWTABLE_HASH_ALL
449262743Sglebius	if (fle->f_flags & FL_STALE)
450191255Skmacy		return (1);
451262743Sglebius#endif
452191255Skmacy
453191255Skmacy	return (0);
454191255Skmacy}
455191255Skmacy
456262743Sglebiusstatic int
457262743Sglebiusflow_full(void)
458191255Skmacy{
459262743Sglebius	int count, max;
460191255Skmacy
461262743Sglebius	count = uma_zone_get_cur(flow_zone);
462262743Sglebius	max = uma_zone_get_max(flow_zone);
463191255Skmacy
464262743Sglebius	return (count > (max - (max >> 3)));
465205488Skmacy}
466205488Skmacy
467262743Sglebiusstatic int
468262743Sglebiusflow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
469205488Skmacy{
470262743Sglebius#ifdef FLOWTABLE_HASH_ALL
471262743Sglebius	uint8_t proto;
472205488Skmacy
473262743Sglebius	proto = (fibnum >> 16) & 0xff;
474262743Sglebius	fibnum &= 0xffff;
475262743Sglebius#endif
476205488Skmacy
477262743Sglebius	CRITICAL_ASSERT(curthread);
478205488Skmacy
479262743Sglebius	/* Microoptimization for IPv4: don't use bcmp(). */
480262743Sglebius	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
481262743Sglebius	    (bcmp(fle->f_key, key, keylen) == 0)) &&
482262743Sglebius	    fibnum == fle->f_fibnum &&
483262743Sglebius#ifdef FLOWTABLE_HASH_ALL
484262743Sglebius	    proto == fle->f_proto &&
485262743Sglebius#endif
486262743Sglebius	    (fle->f_rt->rt_flags & RTF_UP) &&
487262743Sglebius	    fle->f_rt->rt_ifp != NULL &&
488262743Sglebius	    (fle->f_lle->la_flags & LLE_VALID))
489262743Sglebius		return (1);
490205488Skmacy
491262743Sglebius	return (0);
492205488Skmacy}
493205488Skmacy
494262743Sglebiusstatic struct flentry *
495191255Skmacyflowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
496262743Sglebius    int keylen, uint32_t fibnum0)
497191255Skmacy{
498262743Sglebius#ifdef INET6
499262743Sglebius	struct route_in6 sro6;
500262743Sglebius#endif
501262743Sglebius#ifdef INET
502262743Sglebius	struct route sro;
503262743Sglebius#endif
504262743Sglebius	struct route *ro = NULL;
505262743Sglebius	struct rtentry *rt;
506262743Sglebius	struct lltable *lt = NULL;
507262743Sglebius	struct llentry *lle;
508262743Sglebius	struct sockaddr_storage *l3addr;
509262743Sglebius	struct ifnet *ifp;
510262743Sglebius	struct flist *flist;
511262743Sglebius	struct flentry *fle, *iter;
512191255Skmacy	bitstr_t *mask;
513262743Sglebius	uint16_t fibnum = fibnum0;
514262743Sglebius#ifdef FLOWTABLE_HASH_ALL
515205066Skmacy	uint8_t proto;
516191255Skmacy
517262743Sglebius	proto = (fibnum0 >> 16) & 0xff;
518262743Sglebius	fibnum = fibnum0 & 0xffff;
519262743Sglebius#endif
520191255Skmacy
521191255Skmacy	/*
522262743Sglebius	 * This bit of code ends up locking the
523262743Sglebius	 * same route 3 times (just like ip_output + ether_output)
524262743Sglebius	 * - at lookup
525262743Sglebius	 * - in rt_check when called by arpresolve
526262743Sglebius	 * - dropping the refcount for the rtentry
527262743Sglebius	 *
528262743Sglebius	 * This could be consolidated to one if we wrote a variant
529262743Sglebius	 * of arpresolve with an rt_check variant that expected to
530262743Sglebius	 * receive the route locked
531191255Skmacy	 */
532262743Sglebius#ifdef INET
533262743Sglebius	if (ft == &V_ip4_ft) {
534262743Sglebius		struct sockaddr_in *sin;
535191255Skmacy
536262743Sglebius		ro = &sro;
537262743Sglebius		bzero(&sro.ro_dst, sizeof(sro.ro_dst));
538191255Skmacy
539262743Sglebius		sin = (struct sockaddr_in *)&sro.ro_dst;
540262743Sglebius		sin->sin_family = AF_INET;
541262743Sglebius		sin->sin_len = sizeof(*sin);
542262743Sglebius		sin->sin_addr.s_addr = key[0];
543262743Sglebius	}
544205066Skmacy#endif
545205066Skmacy#ifdef INET6
546262743Sglebius	if (ft == &V_ip6_ft) {
547262743Sglebius		struct sockaddr_in6 *sin6;
548205066Skmacy
549262743Sglebius		ro = (struct route *)&sro6;
550262743Sglebius		sin6 = &sro6.ro_dst;
551205066Skmacy
552262743Sglebius		bzero(sin6, sizeof(*sin6));
553262743Sglebius		sin6->sin6_family = AF_INET6;
554262743Sglebius		sin6->sin6_len = sizeof(*sin6);
555262743Sglebius		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
556191255Skmacy	}
557262743Sglebius#endif
558205066Skmacy
559262743Sglebius	ro->ro_rt = NULL;
560262743Sglebius#ifdef RADIX_MPATH
561262743Sglebius	rtalloc_mpath_fib(ro, hash, fibnum);
562262743Sglebius#else
563262743Sglebius	rtalloc_ign_fib(ro, 0, fibnum);
564262743Sglebius#endif
565262743Sglebius	if (ro->ro_rt == NULL)
566262743Sglebius		return (NULL);
567191255Skmacy
568262743Sglebius	rt = ro->ro_rt;
569262743Sglebius	ifp = rt->rt_ifp;
570191255Skmacy
571262743Sglebius	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
572262743Sglebius		RTFREE(rt);
573262743Sglebius		return (NULL);
574262743Sglebius	}
575205066Skmacy
576205066Skmacy#ifdef INET
577262743Sglebius	if (ft == &V_ip4_ft)
578262743Sglebius		lt = LLTABLE(ifp);
579205066Skmacy#endif
580205066Skmacy#ifdef INET6
581262743Sglebius	if (ft == &V_ip6_ft)
582262743Sglebius		lt = LLTABLE6(ifp);
583262743Sglebius#endif
584191255Skmacy
585262743Sglebius	if (rt->rt_flags & RTF_GATEWAY)
586262743Sglebius		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
587262743Sglebius	else
588262743Sglebius		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
589262743Sglebius	lle = llentry_alloc(ifp, lt, l3addr);
590191255Skmacy
591262743Sglebius	if (lle == NULL) {
592262743Sglebius		RTFREE(rt);
593262743Sglebius		return (NULL);
594262743Sglebius	}
595191255Skmacy
596262743Sglebius	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
597262743Sglebius	if ((lle->la_flags & LLE_VALID) == 0) {
598262743Sglebius		RTFREE(rt);
599262743Sglebius		LLE_FREE(lle);
600262743Sglebius		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
601262743Sglebius		return (NULL);
602205066Skmacy	}
603205066Skmacy
604262743Sglebius	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
605262743Sglebius	if (fle == NULL) {
606262743Sglebius		RTFREE(rt);
607262743Sglebius		LLE_FREE(lle);
608262743Sglebius		return (NULL);
609262743Sglebius	}
610205066Skmacy
611262743Sglebius	fle->f_hash = hash;
612262743Sglebius	bcopy(key, &fle->f_key, keylen);
613262743Sglebius	fle->f_rt = rt;
614262743Sglebius	fle->f_lle = lle;
615262743Sglebius	fle->f_fibnum = fibnum;
616262743Sglebius	fle->f_uptime = time_uptime;
617262743Sglebius#ifdef FLOWTABLE_HASH_ALL
618262743Sglebius	fle->f_proto = proto;
619262743Sglebius	fle->f_flags = fibnum0 >> 24;
620205066Skmacy#endif
621191255Skmacy
622262743Sglebius	critical_enter();
623262743Sglebius	mask = flowtable_mask(ft);
624262743Sglebius	flist = flowtable_list(ft, hash);
625262743Sglebius
626262743Sglebius	if (SLIST_EMPTY(flist)) {
627262743Sglebius		bit_set(mask, (hash % ft->ft_size));
628262743Sglebius		SLIST_INSERT_HEAD(flist, fle, f_next);
629262743Sglebius		goto skip;
630191255Skmacy	}
631191255Skmacy
632191255Skmacy	/*
633262743Sglebius	 * find end of list and make sure that we were not
634262743Sglebius	 * preempted by another thread handling this flow
635191255Skmacy	 */
636262743Sglebius	SLIST_FOREACH(iter, flist, f_next) {
637262743Sglebius		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
638262743Sglebius		    ("%s: wrong hash", __func__));
639262743Sglebius		if (flow_matches(iter, key, keylen, fibnum)) {
640262743Sglebius			/*
641262743Sglebius			 * We probably migrated to an other CPU after
642262743Sglebius			 * lookup in flowtable_lookup_common() failed.
643262743Sglebius			 * It appeared that this CPU already has flow
644262743Sglebius			 * entry.
645262743Sglebius			 */
646262743Sglebius			iter->f_uptime = time_uptime;
647262743Sglebius#ifdef FLOWTABLE_HASH_ALL
648262743Sglebius			iter->f_flags |= fibnum >> 24;
649205066Skmacy#endif
650262743Sglebius			critical_exit();
651262743Sglebius			FLOWSTAT_INC(ft, ft_collisions);
652262743Sglebius			uma_zfree(flow_zone, fle);
653262743Sglebius			return (iter);
654262743Sglebius		}
655262743Sglebius	}
656205066Skmacy
657262743Sglebius	SLIST_INSERT_HEAD(flist, fle, f_next);
658262743Sglebiusskip:
659262743Sglebius	critical_exit();
660262743Sglebius	FLOWSTAT_INC(ft, ft_inserts);
661191255Skmacy
662262743Sglebius	return (fle);
663262743Sglebius}
664196609Sqingli
665262743Sglebiusint
666262743Sglebiusflowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
667262743Sglebius{
668262743Sglebius	struct flentry *fle;
669205066Skmacy
670262743Sglebius	if (V_flowtable_enable == 0)
671262743Sglebius		return (ENXIO);
672262743Sglebius
673262743Sglebius	switch (sa) {
674205066Skmacy#ifdef INET
675262743Sglebius	case AF_INET:
676262743Sglebius		fle = flowtable_lookup_ipv4(m, ro);
677262743Sglebius		break;
678205066Skmacy#endif
679262743Sglebius#ifdef INET6
680262743Sglebius	case AF_INET6:
681262743Sglebius		fle = flowtable_lookup_ipv6(m, ro);
682262743Sglebius		break;
683262743Sglebius#endif
684262743Sglebius	default:
685262743Sglebius		panic("%s: sa %d", __func__, sa);
686262743Sglebius	}
687191255Skmacy
688262743Sglebius	if (fle == NULL)
689262743Sglebius		return (EHOSTUNREACH);
690205066Skmacy
691281955Shiren	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
692281955Shiren		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
693262743Sglebius		m->m_pkthdr.flowid = fle->f_hash;
694262743Sglebius	}
695191255Skmacy
696262743Sglebius	ro->ro_rt = fle->f_rt;
697262743Sglebius	ro->ro_lle = fle->f_lle;
698262743Sglebius	ro->ro_flags |= RT_NORTREF;
699262743Sglebius
700262743Sglebius	return (0);
701191255Skmacy}
702191255Skmacy
703262743Sglebiusstatic struct flentry *
704262743Sglebiusflowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
705262743Sglebius    uint32_t fibnum)
706191255Skmacy{
707262743Sglebius	struct flist *flist;
708262743Sglebius	struct flentry *fle;
709262743Sglebius	uint32_t hash;
710191255Skmacy
711262743Sglebius	FLOWSTAT_INC(ft, ft_lookups);
712191255Skmacy
713262743Sglebius	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
714191255Skmacy
715262743Sglebius	critical_enter();
716262743Sglebius	flist = flowtable_list(ft, hash);
717262743Sglebius	SLIST_FOREACH(fle, flist, f_next) {
718262743Sglebius		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
719262743Sglebius		    ("%s: wrong hash", __func__));
720262743Sglebius		if (flow_matches(fle, key, keylen, fibnum)) {
721262743Sglebius			fle->f_uptime = time_uptime;
722262743Sglebius#ifdef FLOWTABLE_HASH_ALL
723262743Sglebius			fle->f_flags |= fibnum >> 24;
724191255Skmacy#endif
725262743Sglebius			critical_exit();
726262743Sglebius			FLOWSTAT_INC(ft, ft_hits);
727262743Sglebius			return (fle);
728191255Skmacy		}
729191255Skmacy	}
730262743Sglebius	critical_exit();
731191255Skmacy
732262743Sglebius	FLOWSTAT_INC(ft, ft_misses);
733191324Skmacy
734262743Sglebius	return (flowtable_insert(ft, hash, key, keylen, fibnum));
735191255Skmacy}
736191255Skmacy
737191255Skmacy/*
738262743Sglebius * used by the bit_alloc macro
739191255Skmacy */
740262743Sglebius#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
741191255Skmacystatic void
742262743Sglebiusflowtable_alloc(struct flowtable *ft)
743191255Skmacy{
744191255Skmacy
745262743Sglebius	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
746262743Sglebius	    M_FTABLE, M_WAITOK);
747262743Sglebius	for (int i = 0; i < ft->ft_size; i++)
748262743Sglebius		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
749262743Sglebius
750262743Sglebius	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
751262743Sglebius	for (int i = 0; i < mp_ncpus; i++) {
752262743Sglebius		bitstr_t **b;
753262743Sglebius
754262743Sglebius		b = zpcpu_get_cpu(ft->ft_masks, i);
755262743Sglebius		*b = bit_alloc(ft->ft_size);
756262743Sglebius	}
757262743Sglebius	ft->ft_tmpmask = bit_alloc(ft->ft_size);
758191255Skmacy}
759262743Sglebius#undef calloc
760191255Skmacy
761191255Skmacystatic void
762262743Sglebiusflowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
763191255Skmacy{
764262743Sglebius	struct flist *flist, freelist;
765262743Sglebius	struct flentry *fle, *fle1, *fleprev;
766191324Skmacy	bitstr_t *mask, *tmpmask;
767262743Sglebius	int curbit, tmpsize;
768205066Skmacy
769262743Sglebius	SLIST_INIT(&freelist);
770191255Skmacy	mask = flowtable_mask(ft);
771191324Skmacy	tmpmask = ft->ft_tmpmask;
772260077Sscottl	tmpsize = ft->ft_size;
773191324Skmacy	memcpy(tmpmask, mask, ft->ft_size/8);
774262743Sglebius	curbit = 0;
775262772Sglebius	fleprev = NULL; /* pacify gcc */
776191324Skmacy	/*
777191324Skmacy	 * XXX Note to self, bit_ffs operates at the byte level
778191324Skmacy	 * and thus adds gratuitous overhead
779191324Skmacy	 */
780191324Skmacy	bit_ffs(tmpmask, ft->ft_size, &curbit);
781191324Skmacy	while (curbit != -1) {
782191257Skmacy		if (curbit >= ft->ft_size || curbit < -1) {
783191257Skmacy			log(LOG_ALERT,
784191257Skmacy			    "warning: bad curbit value %d \n",
785191255Skmacy			    curbit);
786191257Skmacy			break;
787191255Skmacy		}
788205066Skmacy
789262743Sglebius		FLOWSTAT_INC(ft, ft_free_checks);
790191255Skmacy
791262743Sglebius		critical_enter();
792262743Sglebius		flist = flowtable_list(ft, curbit);
793191257Skmacy#ifdef DIAGNOSTIC
794262743Sglebius		if (SLIST_EMPTY(flist) && curbit > 0) {
795191257Skmacy			log(LOG_ALERT,
796191257Skmacy			    "warning bit=%d set, but no fle found\n",
797191257Skmacy			    curbit);
798191255Skmacy		}
799262743Sglebius#endif
800262743Sglebius		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
801262743Sglebius			if (rt != NULL && fle->f_rt != rt) {
802191255Skmacy				fleprev = fle;
803191255Skmacy				continue;
804191255Skmacy			}
805262743Sglebius			if (!flow_stale(ft, fle, maxidle)) {
806262743Sglebius				fleprev = fle;
807262743Sglebius				continue;
808191255Skmacy			}
809205066Skmacy
810262743Sglebius			if (fle == SLIST_FIRST(flist))
811262743Sglebius				SLIST_REMOVE_HEAD(flist, f_next);
812262743Sglebius			else
813262743Sglebius				SLIST_REMOVE_AFTER(fleprev, f_next);
814262743Sglebius			SLIST_INSERT_HEAD(&freelist, fle, f_next);
815191255Skmacy		}
816262743Sglebius		if (SLIST_EMPTY(flist))
817191255Skmacy			bit_clear(mask, curbit);
818262743Sglebius		critical_exit();
819262743Sglebius
820191324Skmacy		bit_clear(tmpmask, curbit);
821260077Sscottl		tmpmask += (curbit / 8);
822260077Sscottl		tmpsize -= (curbit / 8) * 8;
823260077Sscottl		bit_ffs(tmpmask, tmpsize, &curbit);
824191255Skmacy	}
825262743Sglebius
826262743Sglebius	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
827262743Sglebius		FLOWSTAT_INC(ft, ft_frees);
828262743Sglebius		if (fle->f_rt != NULL)
829262743Sglebius			RTFREE(fle->f_rt);
830262743Sglebius		if (fle->f_lle != NULL)
831262743Sglebius			LLE_FREE(fle->f_lle);
832262743Sglebius		uma_zfree(flow_zone, fle);
833191255Skmacy	}
834191255Skmacy}
835191255Skmacy
836262743Sglebiusstatic void
837262743Sglebiusflowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
838197687Sqingli{
839197687Sqingli	int i;
840205066Skmacy
841262743Sglebius	CPU_FOREACH(i) {
842262743Sglebius		if (smp_started == 1) {
843262743Sglebius			thread_lock(curthread);
844262743Sglebius			sched_bind(curthread, i);
845262743Sglebius			thread_unlock(curthread);
846262743Sglebius		}
847197687Sqingli
848262743Sglebius		flowtable_free_stale(ft, rt, maxidle);
849197687Sqingli
850262743Sglebius		if (smp_started == 1) {
851262743Sglebius			thread_lock(curthread);
852262743Sglebius			sched_unbind(curthread);
853262743Sglebius			thread_unlock(curthread);
854197687Sqingli		}
855197687Sqingli	}
856197687Sqingli}
857197687Sqingli
858262743Sglebiusvoid
859262743Sglebiusflowtable_route_flush(sa_family_t sa, struct rtentry *rt)
860191255Skmacy{
861191255Skmacy	struct flowtable *ft;
862191255Skmacy
863262743Sglebius	switch (sa) {
864262743Sglebius#ifdef INET
865262743Sglebius	case AF_INET:
866262743Sglebius		ft = &V_ip4_ft;
867262743Sglebius		break;
868262743Sglebius#endif
869262743Sglebius#ifdef INET6
870262743Sglebius	case AF_INET6:
871262743Sglebius		ft = &V_ip6_ft;
872262743Sglebius		break;
873262743Sglebius#endif
874262743Sglebius	default:
875262743Sglebius		panic("%s: sa %d", __func__, sa);
876262743Sglebius	}
877191255Skmacy
878262743Sglebius	flowtable_clean_vnet(ft, rt, 0);
879194660Szec}
880194660Szec
881194660Szecstatic void
882194660Szecflowtable_cleaner(void)
883194660Szec{
884194660Szec	VNET_ITERATOR_DECL(vnet_iter);
885217076Sjhb	struct thread *td;
886194660Szec
887194660Szec	if (bootverbose)
888194660Szec		log(LOG_INFO, "flowtable cleaner started\n");
889217076Sjhb	td = curthread;
890194660Szec	while (1) {
891262743Sglebius		uint32_t flowclean_freq, maxidle;
892262743Sglebius
893262743Sglebius		/*
894262743Sglebius		 * The maximum idle time, as well as frequency are arbitrary.
895262743Sglebius		 */
896262743Sglebius		if (flow_full())
897262743Sglebius			maxidle = 5;
898262743Sglebius		else
899262743Sglebius			maxidle = 30;
900262743Sglebius
901194660Szec		VNET_LIST_RLOCK();
902194660Szec		VNET_FOREACH(vnet_iter) {
903194660Szec			CURVNET_SET(vnet_iter);
904262743Sglebius#ifdef INET
905262743Sglebius			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
906262743Sglebius#endif
907262743Sglebius#ifdef INET6
908262743Sglebius			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
909262743Sglebius#endif
910194660Szec			CURVNET_RESTORE();
911194660Szec		}
912194660Szec		VNET_LIST_RUNLOCK();
913194660Szec
914262743Sglebius		if (flow_full())
915262743Sglebius			flowclean_freq = 4*hz;
916262743Sglebius		else
917262743Sglebius			flowclean_freq = 20*hz;
918196368Skmacy		mtx_lock(&flowclean_lock);
919217076Sjhb		thread_lock(td);
920217076Sjhb		sched_prio(td, PPAUSE);
921217076Sjhb		thread_unlock(td);
922216855Sbz		flowclean_cycles++;
923216855Sbz		cv_broadcast(&flowclean_f_cv);
924216855Sbz		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
925196368Skmacy		mtx_unlock(&flowclean_lock);
926191255Skmacy	}
927191255Skmacy}
928191255Skmacy
929196368Skmacystatic void
930196368Skmacyflowtable_flush(void *unused __unused)
931196368Skmacy{
932196368Skmacy	uint64_t start;
933205066Skmacy
934196368Skmacy	mtx_lock(&flowclean_lock);
935196368Skmacy	start = flowclean_cycles;
936196368Skmacy	while (start == flowclean_cycles) {
937216855Sbz		cv_broadcast(&flowclean_c_cv);
938216855Sbz		cv_wait(&flowclean_f_cv, &flowclean_lock);
939196368Skmacy	}
940196368Skmacy	mtx_unlock(&flowclean_lock);
941196368Skmacy}
942196368Skmacy
943191255Skmacystatic struct kproc_desc flow_kp = {
944191255Skmacy	"flowcleaner",
945191255Skmacy	flowtable_cleaner,
946191255Skmacy	&flowcleanerproc
947191255Skmacy};
948191255SkmacySYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
949193863Skmacy
950262743Sglebiusstatic int
951262743Sglebiusflowtable_get_size(char *name)
952196368Skmacy{
953262743Sglebius	int size;
954196368Skmacy
955262743Sglebius	if (TUNABLE_INT_FETCH(name, &size)) {
956262743Sglebius		if (size < 256)
957262743Sglebius			size = 256;
958262743Sglebius		if (!powerof2(size)) {
959262743Sglebius			printf("%s must be power of 2\n", name);
960262743Sglebius			size = 2048;
961262743Sglebius		}
962262743Sglebius	} else {
963262743Sglebius		/*
964262743Sglebius		 * round up to the next power of 2
965262743Sglebius		 */
966262743Sglebius		size = 1 << fls((1024 + maxusers * 64) - 1);
967262743Sglebius	}
968262743Sglebius
969262743Sglebius	return (size);
970196368Skmacy}
971196368Skmacy
972196368Skmacystatic void
973196368Skmacyflowtable_init(const void *unused __unused)
974196368Skmacy{
975196368Skmacy
976262743Sglebius	flow_hashjitter = arc4random();
977262743Sglebius
978262743Sglebius	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
979262743Sglebius	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
980262743Sglebius	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
981262743Sglebius
982216855Sbz	cv_init(&flowclean_c_cv, "c_flowcleanwait");
983216855Sbz	cv_init(&flowclean_f_cv, "f_flowcleanwait");
984196368Skmacy	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
985196368Skmacy	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
986196368Skmacy	    EVENTHANDLER_PRI_ANY);
987196368Skmacy}
988262743SglebiusSYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
989196368Skmacy    flowtable_init, NULL);
990196368Skmacy
#ifdef INET
/* net.flowtable.ip4 sysctl node and its per-CPU statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet setup of the IPv4 flowtable: take the size from the
 * net.flowtable.ip4.size tunable (or the default), attach the statistics
 * counters and allocate the table.  The argument is unused.
 */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{
	struct flowtable *ft = &V_ip4_ft;

	ft->ft_size = flowtable_get_size("net.flowtable.ip4.size");
	ft->ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */
1013196368Skmacy
#ifdef INET6
/* net.flowtable.ip6 sysctl node and its per-CPU statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet setup of the IPv6 flowtable: take the size from the
 * net.flowtable.ip6.size tunable (or the default), attach the statistics
 * counters and allocate the table.  The argument is unused.
 */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{
	struct flowtable *ft = &V_ip6_ft;

	ft->ft_size = flowtable_get_size("net.flowtable.ip6.size");
	ft->ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */
1036205097Skmacy
1037262743Sglebius#ifdef DDB
1038196368Skmacystatic bitstr_t *
1039196368Skmacyflowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1040196368Skmacy{
1041196368Skmacy
1042262743Sglebius	return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
1043196368Skmacy}
1044196368Skmacy
1045262743Sglebiusstatic struct flist *
1046262743Sglebiusflowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1047196368Skmacy{
1048196368Skmacy
1049262743Sglebius	return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
1050196368Skmacy}
1051196368Skmacy
1052196368Skmacystatic void
1053196368Skmacyflow_show(struct flowtable *ft, struct flentry *fle)
1054196368Skmacy{
1055196368Skmacy	int idle_time;
1056205066Skmacy	int rt_valid, ifp_valid;
1057205066Skmacy	volatile struct rtentry *rt;
1058205066Skmacy	struct ifnet *ifp = NULL;
1059262743Sglebius	uint32_t *hashkey = fle->f_key;
1060196368Skmacy
1061196368Skmacy	idle_time = (int)(time_uptime - fle->f_uptime);
1062205066Skmacy	rt = fle->f_rt;
1063205066Skmacy	rt_valid = rt != NULL;
1064262743Sglebius	if (rt_valid)
1065205066Skmacy		ifp = rt->rt_ifp;
1066205066Skmacy	ifp_valid = ifp != NULL;
1067205066Skmacy
1068262743Sglebius#ifdef INET
1069262743Sglebius	if (ft == &V_ip4_ft) {
1070262743Sglebius		char daddr[4*sizeof "123"];
1071262743Sglebius#ifdef FLOWTABLE_HASH_ALL
1072262743Sglebius		char saddr[4*sizeof "123"];
1073262743Sglebius		uint16_t sport, dport;
1074262743Sglebius#endif
1075262743Sglebius
1076262743Sglebius		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
1077262743Sglebius#ifdef FLOWTABLE_HASH_ALL
1078262743Sglebius		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1079262743Sglebius		dport = ntohs((uint16_t)(hashkey[2] >> 16));
1080262743Sglebius		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
1081262743Sglebius		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
1082262743Sglebius#else
1083205066Skmacy		db_printf("%s ", daddr);
1084262743Sglebius#endif
1085262743Sglebius	}
1086262743Sglebius#endif /* INET */
1087262743Sglebius#ifdef INET6
1088262743Sglebius	if (ft == &V_ip6_ft) {
1089262743Sglebius#ifdef FLOWTABLE_HASH_ALL
1090262743Sglebius		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1091262743Sglebius		    hashkey[0], hashkey[1], hashkey[2],
1092262743Sglebius		    hashkey[3], hashkey[4], hashkey[5],
1093262743Sglebius		    hashkey[6], hashkey[7], hashkey[8]);
1094262743Sglebius#else
1095262743Sglebius		db_printf("\n\tkey=%08x:%08x:%08x ",
1096262743Sglebius		    hashkey[0], hashkey[1], hashkey[2]);
1097262743Sglebius#endif
1098262743Sglebius	}
1099262743Sglebius#endif /* INET6 */
1100262743Sglebius
1101262743Sglebius	db_printf("hash=%08x idle_time=%03d"
1102262743Sglebius	    "\n\tfibnum=%02d rt=%p",
1103262743Sglebius	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
1104262743Sglebius
1105262743Sglebius#ifdef FLOWTABLE_HASH_ALL
1106196368Skmacy	if (fle->f_flags & FL_STALE)
1107196368Skmacy		db_printf(" FL_STALE ");
1108262743Sglebius#endif
1109205066Skmacy	if (rt_valid) {
1110205066Skmacy		if (rt->rt_flags & RTF_UP)
1111205066Skmacy			db_printf(" RTF_UP ");
1112205066Skmacy	}
1113205066Skmacy	if (ifp_valid) {
1114205066Skmacy		if (ifp->if_flags & IFF_LOOPBACK)
1115205066Skmacy			db_printf(" IFF_LOOPBACK ");
1116205066Skmacy		if (ifp->if_flags & IFF_UP)
1117262743Sglebius			db_printf(" IFF_UP ");
1118205066Skmacy		if (ifp->if_flags & IFF_POINTOPOINT)
1119262743Sglebius			db_printf(" IFF_POINTOPOINT ");
1120205066Skmacy	}
1121196368Skmacy	db_printf("\n");
1122196368Skmacy}
1123196368Skmacy
1124196368Skmacystatic void
1125196368Skmacyflowtable_show(struct flowtable *ft, int cpuid)
1126196368Skmacy{
1127196368Skmacy	int curbit = 0;
1128196368Skmacy	bitstr_t *mask, *tmpmask;
1129196368Skmacy
1130205066Skmacy	if (cpuid != -1)
1131205066Skmacy		db_printf("cpu: %d\n", cpuid);
1132196368Skmacy	mask = flowtable_mask_pcpu(ft, cpuid);
1133196368Skmacy	tmpmask = ft->ft_tmpmask;
1134196368Skmacy	memcpy(tmpmask, mask, ft->ft_size/8);
1135196368Skmacy	/*
1136196368Skmacy	 * XXX Note to self, bit_ffs operates at the byte level
1137196368Skmacy	 * and thus adds gratuitous overhead
1138196368Skmacy	 */
1139196368Skmacy	bit_ffs(tmpmask, ft->ft_size, &curbit);
1140196368Skmacy	while (curbit != -1) {
1141262743Sglebius		struct flist *flist;
1142262743Sglebius		struct flentry *fle;
1143262743Sglebius
1144196368Skmacy		if (curbit >= ft->ft_size || curbit < -1) {
1145196368Skmacy			db_printf("warning: bad curbit value %d \n",
1146196368Skmacy			    curbit);
1147196368Skmacy			break;
1148196368Skmacy		}
1149196368Skmacy
1150262743Sglebius		flist = flowtable_list_pcpu(ft, curbit, cpuid);
1151196368Skmacy
1152262743Sglebius		SLIST_FOREACH(fle, flist, f_next)
1153196368Skmacy			flow_show(ft, fle);
1154196368Skmacy		bit_clear(tmpmask, curbit);
1155196368Skmacy		bit_ffs(tmpmask, ft->ft_size, &curbit);
1156196368Skmacy	}
1157196368Skmacy}
1158196368Skmacy
1159196368Skmacystatic void
1160262743Sglebiusflowtable_show_vnet(struct flowtable *ft)
1161196368Skmacy{
1162262743Sglebius
1163196368Skmacy	int i;
1164196368Skmacy
1165262743Sglebius	CPU_FOREACH(i)
1166262743Sglebius		flowtable_show(ft, i);
1167196368Skmacy}
1168196368Skmacy
1169196368SkmacyDB_SHOW_COMMAND(flowtables, db_show_flowtables)
1170196368Skmacy{
1171196368Skmacy	VNET_ITERATOR_DECL(vnet_iter);
1172196368Skmacy
1173196368Skmacy	VNET_FOREACH(vnet_iter) {
1174196368Skmacy		CURVNET_SET(vnet_iter);
1175216856Sbz#ifdef VIMAGE
1176216856Sbz		db_printf("vnet %p\n", vnet_iter);
1177216856Sbz#endif
1178262743Sglebius#ifdef INET
1179262743Sglebius		printf("IPv4:\n");
1180262743Sglebius		flowtable_show_vnet(&V_ip4_ft);
1181262743Sglebius#endif
1182262743Sglebius#ifdef INET6
1183262743Sglebius		printf("IPv6:\n");
1184262743Sglebius		flowtable_show_vnet(&V_ip6_ft);
1185262743Sglebius#endif
1186196368Skmacy		CURVNET_RESTORE();
1187196368Skmacy	}
1188196368Skmacy}
1189196368Skmacy#endif
1190