1/*- 2 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> 3 * Copyright (c) 2008-2010, BitGravity Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, 10 * this list of conditions and the following disclaimer. 11 * 12 * 2. Neither the name of the BitGravity Corporation nor the names of its 13 * contributors may be used to endorse or promote products derived from 14 * this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "opt_route.h" 30#include "opt_mpath.h" 31#include "opt_ddb.h" 32#include "opt_inet.h" 33#include "opt_inet6.h" 34 35#include <sys/cdefs.h>
|
36__FBSDID("$FreeBSD: head/sys/net/flowtable.c 301538 2016-06-07 04:51:50Z sephe $");
|
36__FBSDID("$FreeBSD: head/sys/net/flowtable.c 302372 2016-07-06 14:09:49Z nwhitehorn $"); |
37 38#include <sys/param.h> 39#include <sys/types.h> 40#include <sys/bitstring.h> 41#include <sys/condvar.h> 42#include <sys/callout.h> 43#include <sys/hash.h> 44#include <sys/kernel.h> 45#include <sys/kthread.h> 46#include <sys/limits.h> 47#include <sys/malloc.h> 48#include <sys/mbuf.h> 49#include <sys/pcpu.h> 50#include <sys/proc.h> 51#include <sys/queue.h> 52#include <sys/sbuf.h> 53#include <sys/sched.h> 54#include <sys/smp.h> 55#include <sys/socket.h> 56#include <sys/syslog.h> 57#include <sys/sysctl.h> 58#include <vm/uma.h> 59 60#include <net/if.h> 61#include <net/if_llatbl.h> 62#include <net/if_var.h> 63#include <net/route.h> 64#include <net/flowtable.h> 65#include <net/vnet.h> 66 67#include <netinet/in.h> 68#include <netinet/in_systm.h> 69#include <netinet/in_var.h> 70#include <netinet/if_ether.h> 71#include <netinet/ip.h> 72#ifdef INET6 73#include <netinet/ip6.h> 74#endif 75#ifdef FLOWTABLE_HASH_ALL 76#include <netinet/tcp.h> 77#include <netinet/udp.h> 78#include <netinet/sctp.h> 79#endif 80 81#include <ddb/ddb.h> 82 83#ifdef FLOWTABLE_HASH_ALL 84#define KEY_PORTS (sizeof(uint16_t) * 2) 85#define KEY_ADDRS 2 86#else 87#define KEY_PORTS 0 88#define KEY_ADDRS 1 89#endif 90 91#ifdef INET6 92#define KEY_ADDR_LEN sizeof(struct in6_addr) 93#else 94#define KEY_ADDR_LEN sizeof(struct in_addr) 95#endif 96 97#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) 98 99struct flentry { 100 uint32_t f_hash; /* hash flowing forward */ 101 uint32_t f_key[KEYLEN]; /* address(es and ports) */ 102 uint32_t f_uptime; /* uptime at last access */ 103 uint16_t f_fibnum; /* fib index */ 104#ifdef FLOWTABLE_HASH_ALL 105 uint8_t f_proto; /* protocol */ 106 uint8_t f_flags; /* stale? 
*/ 107#define FL_STALE 1 108#endif 109 SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ 110 struct rtentry *f_rt; /* rtentry for flow */ 111 struct llentry *f_lle; /* llentry for flow */ 112}; 113#undef KEYLEN 114 115SLIST_HEAD(flist, flentry); 116/* Make sure we can use pcpu_zone_ptr for struct flist. */ 117CTASSERT(sizeof(struct flist) == sizeof(void *)); 118 119struct flowtable { 120 counter_u64_t *ft_stat; 121 int ft_size; 122 /* 123 * ft_table is a malloc(9)ed array of pointers. Pointers point to 124 * memory from UMA_ZONE_PCPU zone. 125 * ft_masks is per-cpu pointer itself. Each instance points 126 * to a malloc(9)ed bitset, that is private to corresponding CPU. 127 */ 128 struct flist **ft_table; 129 bitstr_t **ft_masks; 130 bitstr_t *ft_tmpmask; 131}; 132 133#define FLOWSTAT_ADD(ft, name, v) \ 134 counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) 135#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) 136 137static struct proc *flowcleanerproc; 138static uint32_t flow_hashjitter; 139 140static struct cv flowclean_f_cv; 141static struct cv flowclean_c_cv; 142static struct mtx flowclean_lock; 143static uint32_t flowclean_cycles; 144 145/* 146 * TODO: 147 * - add sysctls to resize && flush flow tables 148 * - Add per flowtable sysctls for statistics and configuring timeouts 149 * - add saturation counter to rtentry to support per-packet load-balancing 150 * add flag to indicate round-robin flow, add list lookup from head 151 for flows 152 * - add sysctl / device node / syscall to support exporting and importing 153 * of flows with flag to indicate that a flow was imported so should 154 * not be considered for auto-cleaning 155 * - support explicit connection state (currently only ad-hoc for DSR) 156 * - idetach() cleanup for options VIMAGE builds. 
157 */ 158#ifdef INET 159static VNET_DEFINE(struct flowtable, ip4_ft); 160#define V_ip4_ft VNET(ip4_ft) 161#endif 162#ifdef INET6 163static VNET_DEFINE(struct flowtable, ip6_ft); 164#define V_ip6_ft VNET(ip6_ft) 165#endif 166 167static uma_zone_t flow_zone; 168 169static VNET_DEFINE(int, flowtable_enable) = 1; 170#define V_flowtable_enable VNET(flowtable_enable) 171 172static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, 173 "flowtable"); 174SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, 175 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); 176SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, 177 &flow_zone, "Maximum number of flows allowed"); 178 179static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); 180 181static struct flentry * 182flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); 183 184#ifdef INET 185static struct flentry * 186flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) 187{ 188 struct flentry *fle; 189 struct sockaddr_in *sin; 190 struct ip *ip; 191 uint32_t fibnum; 192#ifdef FLOWTABLE_HASH_ALL 193 uint32_t key[3]; 194 int iphlen; 195 uint16_t sport, dport; 196 uint8_t proto; 197#endif 198 199 ip = mtod(m, struct ip *); 200 201 if (ip->ip_src.s_addr == ip->ip_dst.s_addr || 202 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 203 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) 204 return (NULL); 205 206 fibnum = M_GETFIB(m); 207 208#ifdef FLOWTABLE_HASH_ALL 209 iphlen = ip->ip_hl << 2; 210 proto = ip->ip_p; 211 212 switch (proto) { 213 case IPPROTO_TCP: { 214 struct tcphdr *th; 215 216 th = (struct tcphdr *)((char *)ip + iphlen); 217 sport = th->th_sport; 218 dport = th->th_dport; 219 if (th->th_flags & (TH_RST|TH_FIN)) 220 fibnum |= (FL_STALE << 24); 221 break; 222 } 223 case IPPROTO_UDP: { 224 struct udphdr *uh; 225 226 uh = (struct udphdr *)((char *)ip + iphlen); 227 sport = uh->uh_sport; 228 dport 
= uh->uh_dport; 229 break; 230 } 231 case IPPROTO_SCTP: { 232 struct sctphdr *sh; 233 234 sh = (struct sctphdr *)((char *)ip + iphlen); 235 sport = sh->src_port; 236 dport = sh->dest_port; 237 /* XXXGL: handle stale? */ 238 break; 239 } 240 default: 241 sport = dport = 0; 242 break; 243 } 244 245 key[0] = ip->ip_dst.s_addr; 246 key[1] = ip->ip_src.s_addr; 247 key[2] = (dport << 16) | sport; 248 fibnum |= proto << 16; 249 250 fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), 251 fibnum); 252 253#else /* !FLOWTABLE_HASH_ALL */ 254 255 fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, 256 sizeof(struct in_addr), fibnum); 257 258#endif /* FLOWTABLE_HASH_ALL */ 259 260 if (fle == NULL) 261 return (NULL); 262 263 sin = (struct sockaddr_in *)&ro->ro_dst; 264 sin->sin_family = AF_INET; 265 sin->sin_len = sizeof(*sin); 266 sin->sin_addr = ip->ip_dst; 267 268 return (fle); 269} 270#endif /* INET */ 271 272#ifdef INET6 273/* 274 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, 275 * then it sets p to point at the offset "len" in the mbuf. WARNING: the 276 * pointer might become stale after other pullups (but we never use it 277 * this way). 
278 */ 279#define PULLUP_TO(_len, p, T) \ 280do { \ 281 int x = (_len) + sizeof(T); \ 282 if ((m)->m_len < x) \ 283 return (NULL); \ 284 p = (mtod(m, char *) + (_len)); \ 285} while (0) 286 287#define TCP(p) ((struct tcphdr *)(p)) 288#define SCTP(p) ((struct sctphdr *)(p)) 289#define UDP(p) ((struct udphdr *)(p)) 290 291static struct flentry * 292flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) 293{ 294 struct flentry *fle; 295 struct sockaddr_in6 *sin6; 296 struct ip6_hdr *ip6; 297 uint32_t fibnum; 298#ifdef FLOWTABLE_HASH_ALL 299 uint32_t key[9]; 300 void *ulp; 301 int hlen; 302 uint16_t sport, dport; 303 u_short offset; 304 uint8_t proto; 305#else 306 uint32_t key[4]; 307#endif 308 309 ip6 = mtod(m, struct ip6_hdr *); 310 if (in6_localaddr(&ip6->ip6_dst)) 311 return (NULL); 312 313 fibnum = M_GETFIB(m); 314 315#ifdef FLOWTABLE_HASH_ALL 316 hlen = sizeof(struct ip6_hdr); 317 proto = ip6->ip6_nxt; 318 offset = sport = dport = 0; 319 ulp = NULL; 320 while (ulp == NULL) { 321 switch (proto) { 322 case IPPROTO_ICMPV6: 323 case IPPROTO_OSPFIGP: 324 case IPPROTO_PIM: 325 case IPPROTO_CARP: 326 case IPPROTO_ESP: 327 case IPPROTO_NONE: 328 ulp = ip6; 329 break; 330 case IPPROTO_TCP: 331 PULLUP_TO(hlen, ulp, struct tcphdr); 332 dport = TCP(ulp)->th_dport; 333 sport = TCP(ulp)->th_sport; 334 if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) 335 fibnum |= (FL_STALE << 24); 336 break; 337 case IPPROTO_SCTP: 338 PULLUP_TO(hlen, ulp, struct sctphdr); 339 dport = SCTP(ulp)->src_port; 340 sport = SCTP(ulp)->dest_port; 341 /* XXXGL: handle stale? 
*/ 342 break; 343 case IPPROTO_UDP: 344 PULLUP_TO(hlen, ulp, struct udphdr); 345 dport = UDP(ulp)->uh_dport; 346 sport = UDP(ulp)->uh_sport; 347 break; 348 case IPPROTO_HOPOPTS: /* RFC 2460 */ 349 PULLUP_TO(hlen, ulp, struct ip6_hbh); 350 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 351 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 352 ulp = NULL; 353 break; 354 case IPPROTO_ROUTING: /* RFC 2460 */ 355 PULLUP_TO(hlen, ulp, struct ip6_rthdr); 356 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; 357 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; 358 ulp = NULL; 359 break; 360 case IPPROTO_FRAGMENT: /* RFC 2460 */ 361 PULLUP_TO(hlen, ulp, struct ip6_frag); 362 hlen += sizeof (struct ip6_frag); 363 proto = ((struct ip6_frag *)ulp)->ip6f_nxt; 364 offset = ((struct ip6_frag *)ulp)->ip6f_offlg & 365 IP6F_OFF_MASK; 366 ulp = NULL; 367 break; 368 case IPPROTO_DSTOPTS: /* RFC 2460 */ 369 PULLUP_TO(hlen, ulp, struct ip6_hbh); 370 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 371 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 372 ulp = NULL; 373 break; 374 case IPPROTO_AH: /* RFC 2402 */ 375 PULLUP_TO(hlen, ulp, struct ip6_ext); 376 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; 377 proto = ((struct ip6_ext *)ulp)->ip6e_nxt; 378 ulp = NULL; 379 break; 380 default: 381 PULLUP_TO(hlen, ulp, struct ip6_ext); 382 break; 383 } 384 } 385 386 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); 387 bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); 388 key[8] = (dport << 16) | sport; 389 fibnum |= proto << 16; 390 391 fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), 392 fibnum); 393#else /* !FLOWTABLE_HASH_ALL */ 394 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); 395 fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), 396 fibnum); 397#endif /* FLOWTABLE_HASH_ALL */ 398 399 if (fle == NULL) 400 return (NULL); 401 402 sin6 = (struct sockaddr_in6 *)&ro->ro_dst; 403 sin6->sin6_family = AF_INET6; 404 
sin6->sin6_len = sizeof(*sin6); 405 bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); 406 407 return (fle); 408} 409#endif /* INET6 */ 410 411static bitstr_t * 412flowtable_mask(struct flowtable *ft) 413{ 414 415 /* 416 * flowtable_free_stale() calls w/o critical section, but 417 * with sched_bind(). Since pointer is stable throughout 418 * ft lifetime, it is safe, otherwise... 419 * 420 * CRITICAL_ASSERT(curthread); 421 */ 422 423 return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); 424} 425 426static struct flist * 427flowtable_list(struct flowtable *ft, uint32_t hash) 428{ 429 430 CRITICAL_ASSERT(curthread); 431 return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); 432} 433 434static int 435flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) 436{ 437 438 if (((fle->f_rt->rt_flags & RTF_UP) == 0) || 439 (fle->f_rt->rt_ifp == NULL) || 440 !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || 441 (fle->f_lle->la_flags & LLE_VALID) == 0) 442 return (1); 443 444 if (time_uptime - fle->f_uptime > maxidle) 445 return (1); 446 447#ifdef FLOWTABLE_HASH_ALL 448 if (fle->f_flags & FL_STALE) 449 return (1); 450#endif 451 452 return (0); 453} 454 455static int 456flow_full(void) 457{ 458 int count, max; 459 460 count = uma_zone_get_cur(flow_zone); 461 max = uma_zone_get_max(flow_zone); 462 463 return (count > (max - (max >> 3))); 464} 465 466static int 467flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) 468{ 469#ifdef FLOWTABLE_HASH_ALL 470 uint8_t proto; 471 472 proto = (fibnum >> 16) & 0xff; 473 fibnum &= 0xffff; 474#endif 475 476 CRITICAL_ASSERT(curthread); 477 478 /* Microoptimization for IPv4: don't use bcmp(). 
*/ 479 if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) || 480 (bcmp(fle->f_key, key, keylen) == 0)) && 481 fibnum == fle->f_fibnum && 482#ifdef FLOWTABLE_HASH_ALL 483 proto == fle->f_proto && 484#endif 485 (fle->f_rt->rt_flags & RTF_UP) && 486 fle->f_rt->rt_ifp != NULL && 487 (fle->f_lle->la_flags & LLE_VALID)) 488 return (1); 489 490 return (0); 491} 492 493static struct flentry * 494flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, 495 int keylen, uint32_t fibnum0) 496{ 497#ifdef INET6 498 struct route_in6 sro6; 499#endif 500#ifdef INET 501 struct route sro; 502#endif 503 struct route *ro = NULL; 504 struct rtentry *rt; 505 struct lltable *lt = NULL; 506 struct llentry *lle; 507 struct sockaddr_storage *l3addr; 508 struct ifnet *ifp; 509 struct flist *flist; 510 struct flentry *fle, *iter; 511 bitstr_t *mask; 512 uint16_t fibnum = fibnum0; 513#ifdef FLOWTABLE_HASH_ALL 514 uint8_t proto; 515 516 proto = (fibnum0 >> 16) & 0xff; 517 fibnum = fibnum0 & 0xffff; 518#endif 519 520 /* 521 * This bit of code ends up locking the 522 * same route 3 times (just like ip_output + ether_output) 523 * - at lookup 524 * - in rt_check when called by arpresolve 525 * - dropping the refcount for the rtentry 526 * 527 * This could be consolidated to one if we wrote a variant 528 * of arpresolve with an rt_check variant that expected to 529 * receive the route locked 530 */ 531#ifdef INET 532 if (ft == &V_ip4_ft) { 533 struct sockaddr_in *sin; 534 535 ro = &sro; 536 bzero(&sro.ro_dst, sizeof(sro.ro_dst)); 537 538 sin = (struct sockaddr_in *)&sro.ro_dst; 539 sin->sin_family = AF_INET; 540 sin->sin_len = sizeof(*sin); 541 sin->sin_addr.s_addr = key[0]; 542 } 543#endif 544#ifdef INET6 545 if (ft == &V_ip6_ft) { 546 struct sockaddr_in6 *sin6; 547 548 ro = (struct route *)&sro6; 549 sin6 = &sro6.ro_dst; 550 551 bzero(sin6, sizeof(*sin6)); 552 sin6->sin6_family = AF_INET6; 553 sin6->sin6_len = sizeof(*sin6); 554 bcopy(key, &sin6->sin6_addr, sizeof(struct 
in6_addr)); 555 } 556#endif 557 558 ro->ro_rt = NULL; 559#ifdef RADIX_MPATH 560 rtalloc_mpath_fib(ro, hash, fibnum); 561#else 562 rtalloc_ign_fib(ro, 0, fibnum); 563#endif 564 if (ro->ro_rt == NULL) 565 return (NULL); 566 567 rt = ro->ro_rt; 568 ifp = rt->rt_ifp; 569 570 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { 571 RTFREE(rt); 572 return (NULL); 573 } 574 575#ifdef INET 576 if (ft == &V_ip4_ft) 577 lt = LLTABLE(ifp); 578#endif 579#ifdef INET6 580 if (ft == &V_ip6_ft) 581 lt = LLTABLE6(ifp); 582#endif 583 584 if (rt->rt_flags & RTF_GATEWAY) 585 l3addr = (struct sockaddr_storage *)rt->rt_gateway; 586 else 587 l3addr = (struct sockaddr_storage *)&ro->ro_dst; 588 lle = llentry_alloc(ifp, lt, l3addr); 589 590 if (lle == NULL) { 591 RTFREE(rt); 592 return (NULL); 593 } 594 595 /* Don't insert the entry if the ARP hasn't yet finished resolving. */ 596 if ((lle->la_flags & LLE_VALID) == 0) { 597 RTFREE(rt); 598 LLE_FREE(lle); 599 FLOWSTAT_INC(ft, ft_fail_lle_invalid); 600 return (NULL); 601 } 602 603 fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); 604 if (fle == NULL) { 605 RTFREE(rt); 606 LLE_FREE(lle); 607 return (NULL); 608 } 609 610 fle->f_hash = hash; 611 bcopy(key, &fle->f_key, keylen); 612 fle->f_rt = rt; 613 fle->f_lle = lle; 614 fle->f_fibnum = fibnum; 615 fle->f_uptime = time_uptime; 616#ifdef FLOWTABLE_HASH_ALL 617 fle->f_proto = proto; 618 fle->f_flags = fibnum0 >> 24; 619#endif 620 621 critical_enter(); 622 mask = flowtable_mask(ft); 623 flist = flowtable_list(ft, hash); 624 625 if (SLIST_EMPTY(flist)) { 626 bit_set(mask, (hash % ft->ft_size)); 627 SLIST_INSERT_HEAD(flist, fle, f_next); 628 goto skip; 629 } 630 631 /* 632 * find end of list and make sure that we were not 633 * preempted by another thread handling this flow 634 */ 635 SLIST_FOREACH(iter, flist, f_next) { 636 KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, 637 ("%s: wrong hash", __func__)); 638 if (flow_matches(iter, key, keylen, fibnum)) { 639 /* 640 * We probably 
migrated to an other CPU after 641 * lookup in flowtable_lookup_common() failed. 642 * It appeared that this CPU already has flow 643 * entry. 644 */ 645 iter->f_uptime = time_uptime; 646#ifdef FLOWTABLE_HASH_ALL 647 iter->f_flags |= fibnum >> 24; 648#endif 649 critical_exit(); 650 FLOWSTAT_INC(ft, ft_collisions); 651 uma_zfree(flow_zone, fle); 652 return (iter); 653 } 654 } 655 656 SLIST_INSERT_HEAD(flist, fle, f_next); 657skip: 658 critical_exit(); 659 FLOWSTAT_INC(ft, ft_inserts); 660 661 return (fle); 662} 663 664int 665flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) 666{ 667 struct flentry *fle; 668 struct llentry *lle; 669 670 if (V_flowtable_enable == 0) 671 return (ENXIO); 672 673 switch (sa) { 674#ifdef INET 675 case AF_INET: 676 fle = flowtable_lookup_ipv4(m, ro); 677 break; 678#endif 679#ifdef INET6 680 case AF_INET6: 681 fle = flowtable_lookup_ipv6(m, ro); 682 break; 683#endif 684 default: 685 panic("%s: sa %d", __func__, sa); 686 } 687 688 if (fle == NULL) 689 return (EHOSTUNREACH); 690 691 if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) { 692 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH); 693 m->m_pkthdr.flowid = fle->f_hash; 694 } 695 696 ro->ro_rt = fle->f_rt; 697 ro->ro_flags |= RT_NORTREF; 698 lle = fle->f_lle; 699 if (lle != NULL && (lle->la_flags & LLE_VALID)) 700 ro->ro_lle = lle; /* share ref with fle->f_lle */ 701 702 return (0); 703} 704 705static struct flentry * 706flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, 707 uint32_t fibnum) 708{ 709 struct flist *flist; 710 struct flentry *fle; 711 uint32_t hash; 712 713 FLOWSTAT_INC(ft, ft_lookups); 714 715 hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); 716 717 critical_enter(); 718 flist = flowtable_list(ft, hash); 719 SLIST_FOREACH(fle, flist, f_next) { 720 KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, 721 ("%s: wrong hash", __func__)); 722 if (flow_matches(fle, key, keylen, fibnum)) { 723 fle->f_uptime = time_uptime; 
724#ifdef FLOWTABLE_HASH_ALL 725 fle->f_flags |= fibnum >> 24; 726#endif 727 critical_exit(); 728 FLOWSTAT_INC(ft, ft_hits); 729 return (fle); 730 } 731 } 732 critical_exit(); 733 734 FLOWSTAT_INC(ft, ft_misses); 735 736 return (flowtable_insert(ft, hash, key, keylen, fibnum)); 737} 738 739static void 740flowtable_alloc(struct flowtable *ft) 741{ 742 743 ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), 744 M_FTABLE, M_WAITOK); 745 for (int i = 0; i < ft->ft_size; i++) 746 ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); 747 748 ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
|
749 for (int i = 0; i < mp_ncpus; i++) {
|
749 CPU_FOREACH(i) { |
750 bitstr_t **b; 751 752 b = zpcpu_get_cpu(ft->ft_masks, i); 753 *b = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); 754 } 755 ft->ft_tmpmask = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); 756} 757 758static void 759flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) 760{ 761 struct flist *flist, freelist; 762 struct flentry *fle, *fle1, *fleprev; 763 bitstr_t *mask, *tmpmask; 764 int curbit, tmpsize; 765 766 SLIST_INIT(&freelist); 767 mask = flowtable_mask(ft); 768 tmpmask = ft->ft_tmpmask; 769 tmpsize = ft->ft_size; 770 memcpy(tmpmask, mask, ft->ft_size/8); 771 curbit = 0; 772 fleprev = NULL; /* pacify gcc */ 773 /* 774 * XXX Note to self, bit_ffs operates at the byte level 775 * and thus adds gratuitous overhead 776 */ 777 bit_ffs(tmpmask, ft->ft_size, &curbit); 778 while (curbit != -1) { 779 if (curbit >= ft->ft_size || curbit < -1) { 780 log(LOG_ALERT, 781 "warning: bad curbit value %d \n", 782 curbit); 783 break; 784 } 785 786 FLOWSTAT_INC(ft, ft_free_checks); 787 788 critical_enter(); 789 flist = flowtable_list(ft, curbit); 790#ifdef DIAGNOSTIC 791 if (SLIST_EMPTY(flist) && curbit > 0) { 792 log(LOG_ALERT, 793 "warning bit=%d set, but no fle found\n", 794 curbit); 795 } 796#endif 797 SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { 798 if (rt != NULL && fle->f_rt != rt) { 799 fleprev = fle; 800 continue; 801 } 802 if (!flow_stale(ft, fle, maxidle)) { 803 fleprev = fle; 804 continue; 805 } 806 807 if (fle == SLIST_FIRST(flist)) 808 SLIST_REMOVE_HEAD(flist, f_next); 809 else 810 SLIST_REMOVE_AFTER(fleprev, f_next); 811 SLIST_INSERT_HEAD(&freelist, fle, f_next); 812 } 813 if (SLIST_EMPTY(flist)) 814 bit_clear(mask, curbit); 815 critical_exit(); 816 817 bit_clear(tmpmask, curbit); 818 bit_ffs(tmpmask, tmpsize, &curbit); 819 } 820 821 SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { 822 FLOWSTAT_INC(ft, ft_frees); 823 if (fle->f_rt != NULL) 824 RTFREE(fle->f_rt); 825 if (fle->f_lle != NULL) 826 LLE_FREE(fle->f_lle); 827 uma_zfree(flow_zone, 
fle); 828 } 829} 830 831static void 832flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) 833{ 834 int i; 835 836 CPU_FOREACH(i) { 837 if (smp_started == 1) { 838 thread_lock(curthread); 839 sched_bind(curthread, i); 840 thread_unlock(curthread); 841 } 842 843 flowtable_free_stale(ft, rt, maxidle); 844 845 if (smp_started == 1) { 846 thread_lock(curthread); 847 sched_unbind(curthread); 848 thread_unlock(curthread); 849 } 850 } 851} 852 853void 854flowtable_route_flush(sa_family_t sa, struct rtentry *rt) 855{ 856 struct flowtable *ft; 857 858 switch (sa) { 859#ifdef INET 860 case AF_INET: 861 ft = &V_ip4_ft; 862 break; 863#endif 864#ifdef INET6 865 case AF_INET6: 866 ft = &V_ip6_ft; 867 break; 868#endif 869 default: 870 panic("%s: sa %d", __func__, sa); 871 } 872 873 flowtable_clean_vnet(ft, rt, 0); 874} 875 876static void 877flowtable_cleaner(void) 878{ 879 VNET_ITERATOR_DECL(vnet_iter); 880 struct thread *td; 881 882 if (bootverbose) 883 log(LOG_INFO, "flowtable cleaner started\n"); 884 td = curthread; 885 while (1) { 886 uint32_t flowclean_freq, maxidle; 887 888 /* 889 * The maximum idle time, as well as frequency are arbitrary. 
890 */ 891 if (flow_full()) 892 maxidle = 5; 893 else 894 maxidle = 30; 895 896 VNET_LIST_RLOCK(); 897 VNET_FOREACH(vnet_iter) { 898 CURVNET_SET(vnet_iter); 899#ifdef INET 900 flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); 901#endif 902#ifdef INET6 903 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); 904#endif 905 CURVNET_RESTORE(); 906 } 907 VNET_LIST_RUNLOCK(); 908 909 if (flow_full()) 910 flowclean_freq = 4*hz; 911 else 912 flowclean_freq = 20*hz; 913 mtx_lock(&flowclean_lock); 914 thread_lock(td); 915 sched_prio(td, PPAUSE); 916 thread_unlock(td); 917 flowclean_cycles++; 918 cv_broadcast(&flowclean_f_cv); 919 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq); 920 mtx_unlock(&flowclean_lock); 921 } 922} 923 924static void 925flowtable_flush(void *unused __unused) 926{ 927 uint64_t start; 928 929 mtx_lock(&flowclean_lock); 930 start = flowclean_cycles; 931 while (start == flowclean_cycles) { 932 cv_broadcast(&flowclean_c_cv); 933 cv_wait(&flowclean_f_cv, &flowclean_lock); 934 } 935 mtx_unlock(&flowclean_lock); 936} 937 938static struct kproc_desc flow_kp = { 939 "flowcleaner", 940 flowtable_cleaner, 941 &flowcleanerproc 942}; 943SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); 944 945static int 946flowtable_get_size(char *name) 947{ 948 int size; 949 950 if (TUNABLE_INT_FETCH(name, &size)) { 951 if (size < 256) 952 size = 256; 953 if (!powerof2(size)) { 954 printf("%s must be power of 2\n", name); 955 size = 2048; 956 } 957 } else { 958 /* 959 * round up to the next power of 2 960 */ 961 size = 1 << fls((1024 + maxusers * 64) - 1); 962 } 963 964 return (size); 965} 966 967static void 968flowtable_init(const void *unused __unused) 969{ 970 971 flow_hashjitter = arc4random(); 972 973 flow_zone = uma_zcreate("flows", sizeof(struct flentry), 974 NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); 975 uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); 976 977 cv_init(&flowclean_c_cv, "c_flowcleanwait"); 978 
cv_init(&flowclean_f_cv, "f_flowcleanwait"); 979 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); 980 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, 981 EVENTHANDLER_PRI_ANY); 982} 983SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, 984 flowtable_init, NULL); 985 986#ifdef INET 987static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, 988 "Flowtable for IPv4"); 989 990static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); 991VNET_PCPUSTAT_SYSINIT(ip4_ftstat); 992VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); 993SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, 994 ip4_ftstat, "Flowtable statistics for IPv4 " 995 "(struct flowtable_stat, net/flowtable.h)"); 996 997static void 998flowtable_init_vnet_v4(const void *unused __unused) 999{ 1000 1001 V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); 1002 V_ip4_ft.ft_stat = VNET(ip4_ftstat); 1003 flowtable_alloc(&V_ip4_ft); 1004} 1005VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 1006 flowtable_init_vnet_v4, NULL); 1007#endif /* INET */ 1008 1009#ifdef INET6 1010static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, 1011 "Flowtable for IPv6"); 1012 1013static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); 1014VNET_PCPUSTAT_SYSINIT(ip6_ftstat); 1015VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); 1016SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, 1017 ip6_ftstat, "Flowtable statistics for IPv6 " 1018 "(struct flowtable_stat, net/flowtable.h)"); 1019 1020static void 1021flowtable_init_vnet_v6(const void *unused __unused) 1022{ 1023 1024 V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); 1025 V_ip6_ft.ft_stat = VNET(ip6_ftstat); 1026 flowtable_alloc(&V_ip6_ft); 1027} 1028VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 1029 flowtable_init_vnet_v6, NULL); 1030#endif /* INET6 */ 1031 1032#ifdef DDB 1033static bitstr_t * 
1034flowtable_mask_pcpu(struct flowtable *ft, int cpuid) 1035{ 1036 1037 return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); 1038} 1039 1040static struct flist * 1041flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) 1042{ 1043 1044 return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); 1045} 1046 1047static void 1048flow_show(struct flowtable *ft, struct flentry *fle) 1049{ 1050 int idle_time; 1051 int rt_valid, ifp_valid; 1052 volatile struct rtentry *rt; 1053 struct ifnet *ifp = NULL; 1054 uint32_t *hashkey = fle->f_key; 1055 1056 idle_time = (int)(time_uptime - fle->f_uptime); 1057 rt = fle->f_rt; 1058 rt_valid = rt != NULL; 1059 if (rt_valid) 1060 ifp = rt->rt_ifp; 1061 ifp_valid = ifp != NULL; 1062 1063#ifdef INET 1064 if (ft == &V_ip4_ft) { 1065 char daddr[4*sizeof "123"]; 1066#ifdef FLOWTABLE_HASH_ALL 1067 char saddr[4*sizeof "123"]; 1068 uint16_t sport, dport; 1069#endif 1070 1071 inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); 1072#ifdef FLOWTABLE_HASH_ALL 1073 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); 1074 dport = ntohs((uint16_t)(hashkey[2] >> 16)); 1075 sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); 1076 db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); 1077#else 1078 db_printf("%s ", daddr); 1079#endif 1080 } 1081#endif /* INET */ 1082#ifdef INET6 1083 if (ft == &V_ip6_ft) { 1084#ifdef FLOWTABLE_HASH_ALL 1085 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", 1086 hashkey[0], hashkey[1], hashkey[2], 1087 hashkey[3], hashkey[4], hashkey[5], 1088 hashkey[6], hashkey[7], hashkey[8]); 1089#else 1090 db_printf("\n\tkey=%08x:%08x:%08x ", 1091 hashkey[0], hashkey[1], hashkey[2]); 1092#endif 1093 } 1094#endif /* INET6 */ 1095 1096 db_printf("hash=%08x idle_time=%03d" 1097 "\n\tfibnum=%02d rt=%p", 1098 fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); 1099 1100#ifdef FLOWTABLE_HASH_ALL 1101 if (fle->f_flags & FL_STALE) 1102 db_printf(" FL_STALE "); 1103#endif 1104 if (rt_valid) { 1105 if (rt->rt_flags & 
RTF_UP) 1106 db_printf(" RTF_UP "); 1107 } 1108 if (ifp_valid) { 1109 if (ifp->if_flags & IFF_LOOPBACK) 1110 db_printf(" IFF_LOOPBACK "); 1111 if (ifp->if_flags & IFF_UP) 1112 db_printf(" IFF_UP "); 1113 if (ifp->if_flags & IFF_POINTOPOINT) 1114 db_printf(" IFF_POINTOPOINT "); 1115 } 1116 db_printf("\n"); 1117} 1118 1119static void 1120flowtable_show(struct flowtable *ft, int cpuid) 1121{ 1122 int curbit = 0; 1123 bitstr_t *mask, *tmpmask; 1124 1125 if (cpuid != -1) 1126 db_printf("cpu: %d\n", cpuid); 1127 mask = flowtable_mask_pcpu(ft, cpuid); 1128 tmpmask = ft->ft_tmpmask; 1129 memcpy(tmpmask, mask, ft->ft_size/8); 1130 /* 1131 * XXX Note to self, bit_ffs operates at the byte level 1132 * and thus adds gratuitous overhead 1133 */ 1134 bit_ffs(tmpmask, ft->ft_size, &curbit); 1135 while (curbit != -1) { 1136 struct flist *flist; 1137 struct flentry *fle; 1138 1139 if (curbit >= ft->ft_size || curbit < -1) { 1140 db_printf("warning: bad curbit value %d \n", 1141 curbit); 1142 break; 1143 } 1144 1145 flist = flowtable_list_pcpu(ft, curbit, cpuid); 1146 1147 SLIST_FOREACH(fle, flist, f_next) 1148 flow_show(ft, fle); 1149 bit_clear(tmpmask, curbit); 1150 bit_ffs(tmpmask, ft->ft_size, &curbit); 1151 } 1152} 1153 1154static void 1155flowtable_show_vnet(struct flowtable *ft) 1156{ 1157 1158 int i; 1159 1160 CPU_FOREACH(i) 1161 flowtable_show(ft, i); 1162} 1163 1164DB_SHOW_COMMAND(flowtables, db_show_flowtables) 1165{ 1166 VNET_ITERATOR_DECL(vnet_iter); 1167 1168 VNET_FOREACH(vnet_iter) { 1169 CURVNET_SET(vnet_iter); 1170#ifdef VIMAGE 1171 db_printf("vnet %p\n", vnet_iter); 1172#endif 1173#ifdef INET 1174 printf("IPv4:\n"); 1175 flowtable_show_vnet(&V_ip4_ft); 1176#endif 1177#ifdef INET6 1178 printf("IPv6:\n"); 1179 flowtable_show_vnet(&V_ip6_ft); 1180#endif 1181 CURVNET_RESTORE(); 1182 } 1183} 1184#endif