1/*-
2 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
3 * Copyright (c) 2008-2010, BitGravity Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
11 *
12 * 2. Neither the name of the BitGravity Corporation nor the names of its
13 * contributors may be used to endorse or promote products derived from
14 * this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "opt_route.h"
30#include "opt_mpath.h"
31#include "opt_ddb.h"
32#include "opt_inet.h"
33#include "opt_inet6.h"
34
35#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/flowtable.c 302372 2016-07-06 14:09:49Z nwhitehorn $");
37
38#include <sys/param.h>
39#include <sys/types.h>
40#include <sys/bitstring.h>
41#include <sys/condvar.h>
42#include <sys/callout.h>
43#include <sys/hash.h>
44#include <sys/kernel.h>
45#include <sys/kthread.h>
46#include <sys/limits.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/pcpu.h>
50#include <sys/proc.h>
51#include <sys/queue.h>
52#include <sys/sbuf.h>
53#include <sys/sched.h>
54#include <sys/smp.h>
55#include <sys/socket.h>
56#include <sys/syslog.h>
57#include <sys/sysctl.h>
58#include <vm/uma.h>
59
60#include <net/if.h>
61#include <net/if_llatbl.h>
62#include <net/if_var.h>
63#include <net/route.h>
64#include <net/flowtable.h>
65#include <net/vnet.h>
66
67#include <netinet/in.h>
68#include <netinet/in_systm.h>
69#include <netinet/in_var.h>
70#include <netinet/if_ether.h>
71#include <netinet/ip.h>
72#ifdef INET6
73#include <netinet/ip6.h>
74#endif
75#ifdef FLOWTABLE_HASH_ALL
76#include <netinet/tcp.h>
77#include <netinet/udp.h>
78#include <netinet/sctp.h>
79#endif
80
81#include <ddb/ddb.h>
82
#ifdef FLOWTABLE_HASH_ALL
/* Full 5-tuple hashing: both ports (2 x uint16_t) and both addresses. */
#define KEY_PORTS (sizeof(uint16_t) * 2)
#define KEY_ADDRS 2
#else
/* Destination-only hashing: no ports, a single address. */
#define KEY_PORTS 0
#define KEY_ADDRS 1
#endif

/* Size key slots for the larger family when IPv6 is compiled in. */
#ifdef INET6
#define KEY_ADDR_LEN sizeof(struct in6_addr)
#else
#define KEY_ADDR_LEN sizeof(struct in_addr)
#endif

/* Flow key length in 32-bit words. */
#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
98
/*
 * A single cached flow.  Entries live on per-CPU collision lists and
 * carry references to the route and L2 entry resolved at insert time.
 */
struct flentry {
	uint32_t		f_hash;		/* hash flowing forward */
	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
	uint32_t		f_uptime;	/* uptime at last access */
	uint16_t		f_fibnum;	/* fib index */
#ifdef FLOWTABLE_HASH_ALL
	uint8_t			f_proto;	/* protocol */
	uint8_t			f_flags;	/* stale? */
#define FL_STALE 	1
#endif
	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
	struct rtentry		*f_rt;		/* rtentry for flow */
	struct llentry		*f_lle;		/* llentry for flow */
};
#undef KEYLEN

SLIST_HEAD(flist, flentry);
/* Make sure we can use pcpu_zone_ptr for struct flist. */
CTASSERT(sizeof(struct flist) == sizeof(void *));
118
/*
 * A flowtable instance (one per address family per vnet).
 */
struct flowtable {
	counter_u64_t	*ft_stat;	/* per-cpu statistics array */
	int 		ft_size;	/* number of hash buckets */
	/*
	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
	 * memory from UMA_ZONE_PCPU zone.
	 * ft_masks is per-cpu pointer itself.  Each instance points
	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
	 */
	struct flist	**ft_table;
	bitstr_t 	**ft_masks;
	bitstr_t	*ft_tmpmask;	/* scratch mask used by the cleaner */
};

/* Bump a named field of struct flowtable_stat for this table. */
#define	FLOWSTAT_ADD(ft, name, v)	\
	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)

static struct proc *flowcleanerproc;	/* cleaner kernel process */
static uint32_t flow_hashjitter;	/* random hash seed, set at init */

/* Rendezvous state between flowtable_flush() and the cleaner. */
static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
144
145/*
146 * TODO:
147 * - add sysctls to resize && flush flow tables
148 * - Add per flowtable sysctls for statistics and configuring timeouts
149 * - add saturation counter to rtentry to support per-packet load-balancing
150 * add flag to indicate round-robin flow, add list lookup from head
151 for flows
152 * - add sysctl / device node / syscall to support exporting and importing
153 * of flows with flag to indicate that a flow was imported so should
154 * not be considered for auto-cleaning
155 * - support explicit connection state (currently only ad-hoc for DSR)
156 * - idetach() cleanup for options VIMAGE builds.
157 */
#ifdef INET
/* Per-vnet IPv4 flowtable. */
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
#endif
#ifdef INET6
/* Per-vnet IPv6 flowtable. */
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
#endif

/* UMA zone all flow entries are allocated from. */
static uma_zone_t	flow_zone;

/* Runtime on/off switch, settable per vnet. */
static VNET_DEFINE(int, flowtable_enable) = 1;
#define	V_flowtable_enable	VNET(flowtable_enable)

static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_zone, "Maximum number of flows allowed");

static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");

static struct flentry *
flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
183
#ifdef INET
/*
 * Build a flow key from an outbound IPv4 packet and look it up in the
 * IPv4 flowtable.  On success the destination sockaddr in *ro is
 * filled in and the flow entry is returned; loopback and hairpin
 * (src == dst) traffic is never cached, so NULL is returned for it.
 *
 * With FLOWTABLE_HASH_ALL the key covers dst/src addresses, the port
 * pair and the protocol; a TCP RST/FIN marks the flow stale by
 * smuggling FL_STALE into the top byte of the fibnum argument.
 * Otherwise only the destination address is hashed.
 */
static struct flentry *
flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct sockaddr_in *sin;
	struct ip *ip;
	uint32_t fibnum;
#ifdef FLOWTABLE_HASH_ALL
	uint32_t key[3];
	int iphlen;
	uint16_t sport, dport;
	uint8_t proto;
#endif

	ip = mtod(m, struct ip *);

	/* Refuse to cache loopback or self-addressed flows. */
	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
		return (NULL);

	fibnum = M_GETFIB(m);

#ifdef FLOWTABLE_HASH_ALL
	iphlen = ip->ip_hl << 2;
	proto = ip->ip_p;

	switch (proto) {
	case IPPROTO_TCP: {
		struct tcphdr *th;

		th = (struct tcphdr *)((char *)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* Connection teardown: mark the flow stale for the cleaner. */
		if (th->th_flags & (TH_RST|TH_FIN))
			fibnum |= (FL_STALE << 24);
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *uh;

		uh = (struct udphdr *)((char *)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	}
	case IPPROTO_SCTP: {
		struct sctphdr *sh;

		sh = (struct sctphdr *)((char *)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		/* XXXGL: handle stale? */
		break;
	}
	default:
		sport = dport = 0;
		break;
	}

	/* Ports and protocol are folded into key[2] and fibnum. */
	key[0] = ip->ip_dst.s_addr;
	key[1] = ip->ip_src.s_addr;
	key[2] = (dport << 16) | sport;
	fibnum |= proto << 16;

	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
	    fibnum);

#else /* !FLOWTABLE_HASH_ALL */

	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
	    sizeof(struct in_addr), fibnum);

#endif /* FLOWTABLE_HASH_ALL */

	if (fle == NULL)
		return (NULL);

	/* Fill in the route destination for the caller. */
	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	return (fle);
}
#endif /* INET */
271
272#ifdef INET6
/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 *
 * Note: this variant only checks m_len, it does not m_pullup(); if the
 * header is not already contiguous the lookup is simply abandoned.
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x)						\
		return (NULL);						\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Casting helpers for the upper-layer protocol pointer. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
290
291static struct flentry *
292flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
293{
294 struct flentry *fle;
295 struct sockaddr_in6 *sin6;
296 struct ip6_hdr *ip6;
297 uint32_t fibnum;
298#ifdef FLOWTABLE_HASH_ALL
299 uint32_t key[9];
300 void *ulp;
301 int hlen;
302 uint16_t sport, dport;
303 u_short offset;
304 uint8_t proto;
305#else
306 uint32_t key[4];
307#endif
308
309 ip6 = mtod(m, struct ip6_hdr *);
310 if (in6_localaddr(&ip6->ip6_dst))
311 return (NULL);
312
313 fibnum = M_GETFIB(m);
314
315#ifdef FLOWTABLE_HASH_ALL
316 hlen = sizeof(struct ip6_hdr);
317 proto = ip6->ip6_nxt;
318 offset = sport = dport = 0;
319 ulp = NULL;
320 while (ulp == NULL) {
321 switch (proto) {
322 case IPPROTO_ICMPV6:
323 case IPPROTO_OSPFIGP:
324 case IPPROTO_PIM:
325 case IPPROTO_CARP:
326 case IPPROTO_ESP:
327 case IPPROTO_NONE:
328 ulp = ip6;
329 break;
330 case IPPROTO_TCP:
331 PULLUP_TO(hlen, ulp, struct tcphdr);
332 dport = TCP(ulp)->th_dport;
333 sport = TCP(ulp)->th_sport;
334 if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
335 fibnum |= (FL_STALE << 24);
336 break;
337 case IPPROTO_SCTP:
338 PULLUP_TO(hlen, ulp, struct sctphdr);
339 dport = SCTP(ulp)->src_port;
340 sport = SCTP(ulp)->dest_port;
341 /* XXXGL: handle stale? */
342 break;
343 case IPPROTO_UDP:
344 PULLUP_TO(hlen, ulp, struct udphdr);
345 dport = UDP(ulp)->uh_dport;
346 sport = UDP(ulp)->uh_sport;
347 break;
348 case IPPROTO_HOPOPTS: /* RFC 2460 */
349 PULLUP_TO(hlen, ulp, struct ip6_hbh);
350 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
351 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
352 ulp = NULL;
353 break;
354 case IPPROTO_ROUTING: /* RFC 2460 */
355 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
356 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
357 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
358 ulp = NULL;
359 break;
360 case IPPROTO_FRAGMENT: /* RFC 2460 */
361 PULLUP_TO(hlen, ulp, struct ip6_frag);
362 hlen += sizeof (struct ip6_frag);
363 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
364 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
365 IP6F_OFF_MASK;
366 ulp = NULL;
367 break;
368 case IPPROTO_DSTOPTS: /* RFC 2460 */
369 PULLUP_TO(hlen, ulp, struct ip6_hbh);
370 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
371 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
372 ulp = NULL;
373 break;
374 case IPPROTO_AH: /* RFC 2402 */
375 PULLUP_TO(hlen, ulp, struct ip6_ext);
376 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
377 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
378 ulp = NULL;
379 break;
380 default:
381 PULLUP_TO(hlen, ulp, struct ip6_ext);
382 break;
383 }
384 }
385
386 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
387 bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
388 key[8] = (dport << 16) | sport;
389 fibnum |= proto << 16;
390
391 fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
392 fibnum);
393#else /* !FLOWTABLE_HASH_ALL */
394 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
395 fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
396 fibnum);
397#endif /* FLOWTABLE_HASH_ALL */
398
399 if (fle == NULL)
400 return (NULL);
401
402 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
403 sin6->sin6_family = AF_INET6;
404 sin6->sin6_len = sizeof(*sin6);
405 bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
406
407 return (fle);
408}
409#endif /* INET6 */
410
/*
 * Return the occupancy bitmask private to the current CPU.
 */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{

	/*
	 * flowtable_free_stale() calls w/o critical section, but
	 * with sched_bind(). Since pointer is stable throughout
	 * ft lifetime, it is safe, otherwise...
	 *
	 * CRITICAL_ASSERT(curthread);
	 */

	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}
425
/*
 * Return the current CPU's collision list for the bucket that "hash"
 * maps to.  Caller must be in a critical section (no migration).
 */
static struct flist *
flowtable_list(struct flowtable *ft, uint32_t hash)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}
433
434static int
435flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
436{
437
438 if (((fle->f_rt->rt_flags & RTF_UP) == 0) ||
439 (fle->f_rt->rt_ifp == NULL) ||
440 !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
441 (fle->f_lle->la_flags & LLE_VALID) == 0)
442 return (1);
443
444 if (time_uptime - fle->f_uptime > maxidle)
445 return (1);
446
447#ifdef FLOWTABLE_HASH_ALL
448 if (fle->f_flags & FL_STALE)
449 return (1);
450#endif
451
452 return (0);
453}
454
455static int
456flow_full(void)
457{
458 int count, max;
459
460 count = uma_zone_get_cur(flow_zone);
461 max = uma_zone_get_max(flow_zone);
462
463 return (count > (max - (max >> 3)));
464}
465
466static int
467flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
468{
469#ifdef FLOWTABLE_HASH_ALL
470 uint8_t proto;
471
472 proto = (fibnum >> 16) & 0xff;
473 fibnum &= 0xffff;
474#endif
475
476 CRITICAL_ASSERT(curthread);
477
478 /* Microoptimization for IPv4: don't use bcmp(). */
479 if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) ||
480 (bcmp(fle->f_key, key, keylen) == 0)) &&
481 fibnum == fle->f_fibnum &&
482#ifdef FLOWTABLE_HASH_ALL
483 proto == fle->f_proto &&
484#endif
485 (fle->f_rt->rt_flags & RTF_UP) &&
486 fle->f_rt->rt_ifp != NULL &&
487 (fle->f_lle->la_flags & LLE_VALID))
488 return (1);
489
490 return (0);
491}
492
493static struct flentry *
494flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
495 int keylen, uint32_t fibnum0)
496{
497#ifdef INET6
498 struct route_in6 sro6;
499#endif
500#ifdef INET
501 struct route sro;
502#endif
503 struct route *ro = NULL;
504 struct rtentry *rt;
505 struct lltable *lt = NULL;
506 struct llentry *lle;
507 struct sockaddr_storage *l3addr;
508 struct ifnet *ifp;
509 struct flist *flist;
510 struct flentry *fle, *iter;
511 bitstr_t *mask;
512 uint16_t fibnum = fibnum0;
513#ifdef FLOWTABLE_HASH_ALL
514 uint8_t proto;
515
516 proto = (fibnum0 >> 16) & 0xff;
517 fibnum = fibnum0 & 0xffff;
518#endif
519
520 /*
521 * This bit of code ends up locking the
522 * same route 3 times (just like ip_output + ether_output)
523 * - at lookup
524 * - in rt_check when called by arpresolve
525 * - dropping the refcount for the rtentry
526 *
527 * This could be consolidated to one if we wrote a variant
528 * of arpresolve with an rt_check variant that expected to
529 * receive the route locked
530 */
531#ifdef INET
532 if (ft == &V_ip4_ft) {
533 struct sockaddr_in *sin;
534
535 ro = &sro;
536 bzero(&sro.ro_dst, sizeof(sro.ro_dst));
537
538 sin = (struct sockaddr_in *)&sro.ro_dst;
539 sin->sin_family = AF_INET;
540 sin->sin_len = sizeof(*sin);
541 sin->sin_addr.s_addr = key[0];
542 }
543#endif
544#ifdef INET6
545 if (ft == &V_ip6_ft) {
546 struct sockaddr_in6 *sin6;
547
548 ro = (struct route *)&sro6;
549 sin6 = &sro6.ro_dst;
550
551 bzero(sin6, sizeof(*sin6));
552 sin6->sin6_family = AF_INET6;
553 sin6->sin6_len = sizeof(*sin6);
554 bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
555 }
556#endif
557
558 ro->ro_rt = NULL;
559#ifdef RADIX_MPATH
560 rtalloc_mpath_fib(ro, hash, fibnum);
561#else
562 rtalloc_ign_fib(ro, 0, fibnum);
563#endif
564 if (ro->ro_rt == NULL)
565 return (NULL);
566
567 rt = ro->ro_rt;
568 ifp = rt->rt_ifp;
569
570 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
571 RTFREE(rt);
572 return (NULL);
573 }
574
575#ifdef INET
576 if (ft == &V_ip4_ft)
577 lt = LLTABLE(ifp);
578#endif
579#ifdef INET6
580 if (ft == &V_ip6_ft)
581 lt = LLTABLE6(ifp);
582#endif
583
584 if (rt->rt_flags & RTF_GATEWAY)
585 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
586 else
587 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
588 lle = llentry_alloc(ifp, lt, l3addr);
589
590 if (lle == NULL) {
591 RTFREE(rt);
592 return (NULL);
593 }
594
595 /* Don't insert the entry if the ARP hasn't yet finished resolving. */
596 if ((lle->la_flags & LLE_VALID) == 0) {
597 RTFREE(rt);
598 LLE_FREE(lle);
599 FLOWSTAT_INC(ft, ft_fail_lle_invalid);
600 return (NULL);
601 }
602
603 fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
604 if (fle == NULL) {
605 RTFREE(rt);
606 LLE_FREE(lle);
607 return (NULL);
608 }
609
610 fle->f_hash = hash;
611 bcopy(key, &fle->f_key, keylen);
612 fle->f_rt = rt;
613 fle->f_lle = lle;
614 fle->f_fibnum = fibnum;
615 fle->f_uptime = time_uptime;
616#ifdef FLOWTABLE_HASH_ALL
617 fle->f_proto = proto;
618 fle->f_flags = fibnum0 >> 24;
619#endif
620
621 critical_enter();
622 mask = flowtable_mask(ft);
623 flist = flowtable_list(ft, hash);
624
625 if (SLIST_EMPTY(flist)) {
626 bit_set(mask, (hash % ft->ft_size));
627 SLIST_INSERT_HEAD(flist, fle, f_next);
628 goto skip;
629 }
630
631 /*
632 * find end of list and make sure that we were not
633 * preempted by another thread handling this flow
634 */
635 SLIST_FOREACH(iter, flist, f_next) {
636 KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
637 ("%s: wrong hash", __func__));
638 if (flow_matches(iter, key, keylen, fibnum)) {
639 /*
640 * We probably migrated to an other CPU after
641 * lookup in flowtable_lookup_common() failed.
642 * It appeared that this CPU already has flow
643 * entry.
644 */
645 iter->f_uptime = time_uptime;
646#ifdef FLOWTABLE_HASH_ALL
647 iter->f_flags |= fibnum >> 24;
648#endif
649 critical_exit();
650 FLOWSTAT_INC(ft, ft_collisions);
651 uma_zfree(flow_zone, fle);
652 return (iter);
653 }
654 }
655
656 SLIST_INSERT_HEAD(flist, fle, f_next);
657skip:
658 critical_exit();
659 FLOWSTAT_INC(ft, ft_inserts);
660
661 return (fle);
662}
663
/*
 * Public entry point: look up (and cache) the flow for an outbound
 * packet of address family "sa".  On success fills *ro with a
 * borrowed route/lle reference (RT_NORTREF) and stamps the mbuf with
 * the flow hash.  Returns 0, ENXIO when the flowtable is disabled, or
 * EHOSTUNREACH when no flow could be found or created.
 */
int
flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
{
	struct flentry *fle;
	struct llentry *lle;

	if (V_flowtable_enable == 0)
		return (ENXIO);

	switch (sa) {
#ifdef INET
	case AF_INET:
		fle = flowtable_lookup_ipv4(m, ro);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		fle = flowtable_lookup_ipv6(m, ro);
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	if (fle == NULL)
		return (EHOSTUNREACH);

	/* Reuse the flow hash as the packet's flowid if it has none. */
	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH);
		m->m_pkthdr.flowid = fle->f_hash;
	}

	/* Hand out the cached route without an extra reference. */
	ro->ro_rt = fle->f_rt;
	ro->ro_flags |= RT_NORTREF;
	lle = fle->f_lle;
	if (lle != NULL && (lle->la_flags & LLE_VALID))
		ro->ro_lle = lle;	/* share ref with fle->f_lle */

	return (0);
}
704
/*
 * Hash the key and search the current CPU's bucket for a usable
 * matching flow; on a hit refresh its timestamp, on a miss fall
 * through to flowtable_insert().  fibnum may carry protocol and
 * FL_STALE in its upper bytes (see the lookup_ipv4/ipv6 callers).
 */
static struct flentry *
flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
    uint32_t fibnum)
{
	struct flist *flist;
	struct flentry *fle;
	uint32_t hash;

	FLOWSTAT_INC(ft, ft_lookups);

	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);

	critical_enter();
	flist = flowtable_list(ft, hash);
	SLIST_FOREACH(fle, flist, f_next) {
		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
		    ("%s: wrong hash", __func__));
		if (flow_matches(fle, key, keylen, fibnum)) {
			fle->f_uptime = time_uptime;
#ifdef FLOWTABLE_HASH_ALL
			/* Propagate FL_STALE from a RST/FIN packet. */
			fle->f_flags |= fibnum >> 24;
#endif
			critical_exit();
			FLOWSTAT_INC(ft, ft_hits);
			return (fle);
		}
	}
	critical_exit();

	FLOWSTAT_INC(ft, ft_misses);

	return (flowtable_insert(ft, hash, key, keylen, fibnum));
}
738
739static void
740flowtable_alloc(struct flowtable *ft)
741{
742
743 ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
744 M_FTABLE, M_WAITOK);
745 for (int i = 0; i < ft->ft_size; i++)
746 ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
747
748 ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
749 for (int i = 0; i < mp_ncpus; i++) {
749 CPU_FOREACH(i) {
750 bitstr_t **b;
751
752 b = zpcpu_get_cpu(ft->ft_masks, i);
753 *b = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK);
754 }
755 ft->ft_tmpmask = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK);
756}
757
/*
 * Scan this CPU's buckets (caller is bound to the CPU via sched_bind)
 * and free flows that are stale, or that reference "rt" when rt is
 * non-NULL (route flush).  Victims are unlinked inside a critical
 * section and batched on a local free list so the reference drops and
 * zone frees happen outside it.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	struct flist *flist, freelist;
	struct flentry *fle, *fle1, *fleprev;
	bitstr_t *mask, *tmpmask;
	int curbit, tmpsize;

	SLIST_INIT(&freelist);
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	tmpsize = ft->ft_size;
	/* Work on a snapshot of the occupancy mask. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	curbit = 0;
	fleprev = NULL; /* pacify gcc */
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FLOWSTAT_INC(ft, ft_free_checks);

		critical_enter();
		flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
		if (SLIST_EMPTY(flist) && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		/* fleprev trails fle so SLIST_REMOVE_AFTER can unlink it. */
		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
			if (rt != NULL && fle->f_rt != rt) {
				fleprev = fle;
				continue;
			}
			if (!flow_stale(ft, fle, maxidle)) {
				fleprev = fle;
				continue;
			}

			if (fle == SLIST_FIRST(flist))
				SLIST_REMOVE_HEAD(flist, f_next);
			else
				SLIST_REMOVE_AFTER(fleprev, f_next);
			SLIST_INSERT_HEAD(&freelist, fle, f_next);
		}
		if (SLIST_EMPTY(flist))
			bit_clear(mask, curbit);
		critical_exit();

		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, tmpsize, &curbit);
	}

	/* Drop route/L2 references and recycle entries outside the scan. */
	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
		FLOWSTAT_INC(ft, ft_frees);
		if (fle->f_rt != NULL)
			RTFREE(fle->f_rt);
		if (fle->f_lle != NULL)
			LLE_FREE(fle->f_lle);
		uma_zfree(flow_zone, fle);
	}
}
830
/*
 * Run flowtable_free_stale() once per CPU, binding the current thread
 * to each CPU in turn so the per-CPU lists can be walked without a
 * critical section covering the whole scan.  Binding is skipped before
 * SMP is started (only one CPU is running then).
 */
static void
flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
	int i;

	CPU_FOREACH(i) {
		if (smp_started == 1) {
			thread_lock(curthread);
			sched_bind(curthread, i);
			thread_unlock(curthread);
		}

		flowtable_free_stale(ft, rt, maxidle);

		if (smp_started == 1) {
			thread_lock(curthread);
			sched_unbind(curthread);
			thread_unlock(curthread);
		}
	}
}
852
/*
 * Drop every cached flow referencing "rt" in the family's flowtable.
 * Called from the routing code when a route goes away; maxidle 0
 * makes every matching entry eligible regardless of age.
 */
void
flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
	struct flowtable *ft;

	switch (sa) {
#ifdef INET
	case AF_INET:
		ft = &V_ip4_ft;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ft = &V_ip6_ft;
		break;
#endif
	default:
		panic("%s: sa %d", __func__, sa);
	}

	flowtable_clean_vnet(ft, rt, 0);
}
875
/*
 * Main loop of the flowcleaner kernel process: periodically sweep
 * every vnet's flowtables for stale entries.  When the flow zone is
 * nearly full both the idle cutoff and the sleep interval shrink.
 * Each pass bumps flowclean_cycles and broadcasts so that
 * flowtable_flush() waiters can detect a completed cycle.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	while (1) {
		uint32_t flowclean_freq, maxidle;

		/*
		 * The maximum idle time, as well as frequency are arbitrary.
		 */
		if (flow_full())
			maxidle = 5;
		else
			maxidle = 30;

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
#ifdef INET
			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
#endif
#ifdef INET6
			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
#endif
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		if (flow_full())
			flowclean_freq = 4*hz;
		else
			flowclean_freq = 20*hz;
		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}
923
/*
 * Block until the cleaner completes at least one full cycle, waking
 * it immediately.  Registered as an ifnet departure handler so that
 * flows referencing a disappearing interface are swept before it is
 * gone.
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	/* Loop guards against spurious condvar wakeups. */
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_c_cv);
		cv_wait(&flowclean_f_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}
937
/* Kernel process hosting flowtable_cleaner(). */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
944
945static int
946flowtable_get_size(char *name)
947{
948 int size;
949
950 if (TUNABLE_INT_FETCH(name, &size)) {
951 if (size < 256)
952 size = 256;
953 if (!powerof2(size)) {
954 printf("%s must be power of 2\n", name);
955 size = 2048;
956 }
957 } else {
958 /*
959 * round up to the next power of 2
960 */
961 size = 1 << fls((1024 + maxusers * 64) - 1);
962 }
963
964 return (size);
965}
966
/*
 * Global (non-vnet) initialization: seed the hash, create the flow
 * UMA zone with a maxusers-scaled cap, set up the cleaner rendezvous
 * primitives and hook interface departure to flowtable_flush().
 */
static void
flowtable_init(const void *unused __unused)
{

	flow_hashjitter = arc4random();

	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
    flowtable_init, NULL);
985
986#ifdef INET
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");

/* Per-vnet, per-cpu IPv4 flowtable statistics. */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet IPv4 flowtable setup: size from tunable, stats hookup,
 * table allocation.
 */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{

	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
	flowtable_alloc(&V_ip4_ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
1008
1009#ifdef INET6
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");

/* Per-vnet, per-cpu IPv6 flowtable statistics. */
static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet IPv6 flowtable setup: size from tunable, stats hookup,
 * table allocation.
 */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{

	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
	flowtable_alloc(&V_ip6_ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
1030#endif /* INET6 */
1031
1032#ifdef DDB
1033static bitstr_t *
1034flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1035{
1036
1037 return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
1038}
1039
1040static struct flist *
1041flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1042{
1043
1044 return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
1045}
1046
/*
 * DDB-only: print one flow entry -- its key (format depends on the
 * table family and FLOWTABLE_HASH_ALL), hash, idle time, fib and the
 * interesting route/interface flags.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;
	uint32_t *hashkey = fle->f_key;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;

#ifdef INET
	if (ft == &V_ip4_ft) {
		char daddr[4*sizeof "123"];
#ifdef FLOWTABLE_HASH_ALL
		char saddr[4*sizeof "123"];
		uint16_t sport, dport;
#endif

		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
#ifdef FLOWTABLE_HASH_ALL
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		/* key[2] packs ports as (dport << 16) | sport. */
		dport = ntohs((uint16_t)(hashkey[2] >> 16));
		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
#else
		db_printf("%s ", daddr);
#endif
	}
#endif /* INET */
#ifdef INET6
	if (ft == &V_ip6_ft) {
#ifdef FLOWTABLE_HASH_ALL
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
#else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
#endif
	}
#endif /* INET6 */

	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);

#ifdef FLOWTABLE_HASH_ALL
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
#endif
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	db_printf("\n");
}
1118
/*
 * DDB-only: dump every flow on the given CPU's lists, driving the
 * walk off a scratch copy of that CPU's occupancy bitmask.
 */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		struct flist *flist;
		struct flentry *fle;

		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flist = flowtable_list_pcpu(ft, curbit, cpuid);

		SLIST_FOREACH(fle, flist, f_next)
			flow_show(ft, fle);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}
1153
/*
 * DDB-only: dump a flowtable for every CPU in the current vnet.
 */
static void
flowtable_show_vnet(struct flowtable *ft)
{

	int i;

	CPU_FOREACH(i)
		flowtable_show(ft, i);
}
1163
1164DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1165{
1166 VNET_ITERATOR_DECL(vnet_iter);
1167
1168 VNET_FOREACH(vnet_iter) {
1169 CURVNET_SET(vnet_iter);
1170#ifdef VIMAGE
1171 db_printf("vnet %p\n", vnet_iter);
1172#endif
1173#ifdef INET
1174 printf("IPv4:\n");
1175 flowtable_show_vnet(&V_ip4_ft);
1176#endif
1177#ifdef INET6
1178 printf("IPv6:\n");
1179 flowtable_show_vnet(&V_ip6_ft);
1180#endif
1181 CURVNET_RESTORE();
1182 }
1183}
1184#endif