240 241#define V_flowtable_enable VNET(flowtable_enable) 242#define V_flowtable_debug VNET(flowtable_debug) 243#define V_flowtable_syn_expire VNET(flowtable_syn_expire) 244#define V_flowtable_udp_expire VNET(flowtable_udp_expire) 245#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire) 246#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire) 247#define V_flowtable_nmbflows VNET(flowtable_nmbflows) 248#define V_flowtable_ready VNET(flowtable_ready) 249 250SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); 251SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW, 252 &VNET_NAME(flowtable_debug), 0, "print debug info."); 253SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, 254 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); 255 256/* 257 * XXX This does not end up updating timeouts at runtime 258 * and only reflects the value for the last table added :-/ 259 */ 260SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW, 261 &VNET_NAME(flowtable_syn_expire), 0, 262 "seconds after which to remove syn allocated flow."); 263SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW, 264 &VNET_NAME(flowtable_udp_expire), 0, 265 "seconds after which to remove flow allocated to UDP."); 266SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW, 267 &VNET_NAME(flowtable_fin_wait_expire), 0, 268 "seconds after which to remove a flow in FIN_WAIT."); 269SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW, 270 &VNET_NAME(flowtable_tcp_expire), 0, 271 "seconds after which to remove flow allocated to a TCP connection."); 272 273 274/* 275 * Maximum number of flows that can be allocated of a given type. 276 * 277 * The table is allocated at boot time (for the pure caching case 278 * there is no reason why this could not be changed at runtime) 279 * and thus (currently) needs to be set with a tunable. 
280 */ 281static int 282sysctl_nmbflows(SYSCTL_HANDLER_ARGS) 283{ 284 int error, newnmbflows; 285 286 newnmbflows = V_flowtable_nmbflows; 287 error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 288 if (error == 0 && req->newptr) { 289 if (newnmbflows > V_flowtable_nmbflows) { 290 V_flowtable_nmbflows = newnmbflows; 291 uma_zone_set_max(V_flow_ipv4_zone, 292 V_flowtable_nmbflows); 293 uma_zone_set_max(V_flow_ipv6_zone, 294 V_flowtable_nmbflows); 295 } else 296 error = EINVAL; 297 } 298 return (error); 299} 300SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, 301 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU", 302 "Maximum number of flows allowed"); 303 304 305 306#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field) 307 308static void 309fs_print(struct sbuf *sb, struct flowtable_stats *fs) 310{ 311 312 FS_PRINT(sb, collisions); 313 FS_PRINT(sb, allocated); 314 FS_PRINT(sb, misses); 315 FS_PRINT(sb, max_depth); 316 FS_PRINT(sb, free_checks); 317 FS_PRINT(sb, frees); 318 FS_PRINT(sb, hits); 319 FS_PRINT(sb, lookups); 320} 321 322static void 323flowtable_show_stats(struct sbuf *sb, struct flowtable *ft) 324{ 325 int i; 326 struct flowtable_stats fs, *pfs; 327 328 if (ft->ft_flags & FL_PCPU) { 329 bzero(&fs, sizeof(fs)); 330 pfs = &fs; 331 CPU_FOREACH(i) { 332 pfs->ft_collisions += ft->ft_stats[i].ft_collisions; 333 pfs->ft_allocated += ft->ft_stats[i].ft_allocated; 334 pfs->ft_misses += ft->ft_stats[i].ft_misses; 335 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks; 336 pfs->ft_frees += ft->ft_stats[i].ft_frees; 337 pfs->ft_hits += ft->ft_stats[i].ft_hits; 338 pfs->ft_lookups += ft->ft_stats[i].ft_lookups; 339 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth) 340 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth; 341 } 342 } else { 343 pfs = &ft->ft_stats[0]; 344 } 345 fs_print(sb, pfs); 346} 347 348static int 349sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS) 350{ 351 struct flowtable *ft; 352 struct sbuf *sb; 353 
int error; 354 355 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN); 356 357 ft = V_flow_list_head; 358 while (ft != NULL) { 359 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name); 360 flowtable_show_stats(sb, ft); 361 ft = ft->ft_next; 362 } 363 sbuf_finish(sb); 364 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); 365 sbuf_delete(sb); 366 367 return (error); 368} 369SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, 370 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics"); 371 372 373#ifndef RADIX_MPATH 374static void 375in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) 376{ 377 378 rtalloc_ign_fib(ro, 0, fibnum); 379} 380#endif 381 382static void 383flowtable_global_lock(struct flowtable *table, uint32_t hash) 384{ 385 int lock_index = (hash)&(table->ft_lock_count - 1); 386 387 mtx_lock(&table->ft_locks[lock_index]); 388} 389 390static void 391flowtable_global_unlock(struct flowtable *table, uint32_t hash) 392{ 393 int lock_index = (hash)&(table->ft_lock_count - 1); 394 395 mtx_unlock(&table->ft_locks[lock_index]); 396} 397 398static void 399flowtable_pcpu_lock(struct flowtable *table, uint32_t hash) 400{ 401 402 critical_enter(); 403} 404 405static void 406flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) 407{ 408 409 critical_exit(); 410} 411 412#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) 413#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash)) 414#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) 415#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) 416 417#define FL_STALE (1<<8) 418#define FL_OVERWRITE (1<<10) 419 420void 421flow_invalidate(struct flentry *fle) 422{ 423 424 fle->f_flags |= FL_STALE; 425} 426 427static __inline int 428proto_to_flags(uint8_t proto) 429{ 430 int flag; 431 432 switch (proto) { 433 case IPPROTO_TCP: 434 flag = FL_TCP; 435 break; 436 case IPPROTO_SCTP: 437 flag = FL_SCTP; 438 break; 
439 case IPPROTO_UDP: 440 flag = FL_UDP; 441 break; 442 default: 443 flag = 0; 444 break; 445 } 446 447 return (flag); 448} 449 450static __inline int 451flags_to_proto(int flags) 452{ 453 int proto, protoflags; 454 455 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP); 456 switch (protoflags) { 457 case FL_TCP: 458 proto = IPPROTO_TCP; 459 break; 460 case FL_SCTP: 461 proto = IPPROTO_SCTP; 462 break; 463 case FL_UDP: 464 proto = IPPROTO_UDP; 465 break; 466 default: 467 proto = 0; 468 break; 469 } 470 return (proto); 471} 472 473#ifdef INET 474#ifdef FLOWTABLE_DEBUG 475static void 476ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin, 477 struct sockaddr_in *dsin) 478{ 479 char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; 480 481 if (flags & FL_HASH_ALL) { 482 inet_ntoa_r(ssin->sin_addr, saddr); 483 inet_ntoa_r(dsin->sin_addr, daddr); 484 printf("proto=%d %s:%d->%s:%d\n", 485 proto, saddr, ntohs(ssin->sin_port), daddr, 486 ntohs(dsin->sin_port)); 487 } else { 488 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr); 489 printf("proto=%d %s\n", proto, daddr); 490 } 491 492} 493#endif 494 495static int 496ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, 497 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags) 498{ 499 struct ip *ip; 500 uint8_t proto; 501 int iphlen; 502 struct tcphdr *th; 503 struct udphdr *uh; 504 struct sctphdr *sh; 505 uint16_t sport, dport; 506 507 proto = sport = dport = 0; 508 ip = mtod(m, struct ip *); 509 dsin->sin_family = AF_INET; 510 dsin->sin_len = sizeof(*dsin); 511 dsin->sin_addr = ip->ip_dst; 512 ssin->sin_family = AF_INET; 513 ssin->sin_len = sizeof(*ssin); 514 ssin->sin_addr = ip->ip_src; 515 516 proto = ip->ip_p; 517 if ((*flags & FL_HASH_ALL) == 0) { 518 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ", 519 *flags); 520 goto skipports; 521 } 522 523 iphlen = ip->ip_hl << 2; /* XXX options? 
*/ 524 525 switch (proto) { 526 case IPPROTO_TCP: 527 th = (struct tcphdr *)((caddr_t)ip + iphlen); 528 sport = th->th_sport; 529 dport = th->th_dport; 530 if ((*flags & FL_HASH_ALL) && 531 (th->th_flags & (TH_RST|TH_FIN))) 532 *flags |= FL_STALE; 533 break; 534 case IPPROTO_UDP: 535 uh = (struct udphdr *)((caddr_t)ip + iphlen); 536 sport = uh->uh_sport; 537 dport = uh->uh_dport; 538 break; 539 case IPPROTO_SCTP: 540 sh = (struct sctphdr *)((caddr_t)ip + iphlen); 541 sport = sh->src_port; 542 dport = sh->dest_port; 543 break; 544 default: 545 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto); 546 return (ENOTSUP); 547 /* no port - hence not a protocol we care about */ 548 break; 549 550 } 551 552skipports: 553 *flags |= proto_to_flags(proto); 554 ssin->sin_port = sport; 555 dsin->sin_port = dport; 556 return (0); 557} 558 559static uint32_t 560ipv4_flow_lookup_hash_internal( 561 struct sockaddr_in *ssin, struct sockaddr_in *dsin, 562 uint32_t *key, uint16_t flags) 563{ 564 uint16_t sport, dport; 565 uint8_t proto; 566 int offset = 0; 567 568 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) 569 return (0); 570 proto = flags_to_proto(flags); 571 sport = dport = key[2] = key[1] = key[0] = 0; 572 if ((ssin != NULL) && (flags & FL_HASH_ALL)) { 573 key[1] = ssin->sin_addr.s_addr; 574 sport = ssin->sin_port; 575 } 576 if (dsin != NULL) { 577 key[2] = dsin->sin_addr.s_addr; 578 dport = dsin->sin_port; 579 } 580 if (flags & FL_HASH_ALL) { 581 ((uint16_t *)key)[0] = sport; 582 ((uint16_t *)key)[1] = dport; 583 } else 584 offset = V_flow_hashjitter + proto; 585 586 return (jenkins_hashword(key, 3, offset)); 587} 588 589static struct flentry * 590flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m) 591{ 592 struct sockaddr_storage ssa, dsa; 593 uint16_t flags; 594 struct sockaddr_in *dsin, *ssin; 595 596 dsin = (struct sockaddr_in *)&dsa; 597 ssin = (struct sockaddr_in *)&ssa; 598 bzero(dsin, sizeof(*dsin)); 599 bzero(ssin, sizeof(*ssin)); 600 
flags = ft->ft_flags; 601 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0) 602 return (NULL); 603 604 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); 605} 606 607void 608flow_to_route(struct flentry *fle, struct route *ro) 609{ 610 uint32_t *hashkey = NULL; 611 struct sockaddr_in *sin; 612 613 sin = (struct sockaddr_in *)&ro->ro_dst; 614 sin->sin_family = AF_INET; 615 sin->sin_len = sizeof(*sin); 616 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 617 sin->sin_addr.s_addr = hashkey[2]; 618 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); 619 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); 620} 621#endif /* INET */ 622 623#ifdef INET6 624/* 625 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, 626 * then it sets p to point at the offset "len" in the mbuf. WARNING: the 627 * pointer might become stale after other pullups (but we never use it 628 * this way). 629 */ 630#define PULLUP_TO(_len, p, T) \ 631do { \ 632 int x = (_len) + sizeof(T); \ 633 if ((m)->m_len < x) { \ 634 goto receive_failed; \ 635 } \ 636 p = (mtod(m, char *) + (_len)); \ 637} while (0) 638 639#define TCP(p) ((struct tcphdr *)(p)) 640#define SCTP(p) ((struct sctphdr *)(p)) 641#define UDP(p) ((struct udphdr *)(p)) 642 643static int 644ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, 645 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags) 646{ 647 struct ip6_hdr *ip6; 648 uint8_t proto; 649 int hlen; 650 uint16_t src_port, dst_port; 651 u_short offset; 652 void *ulp; 653 654 offset = hlen = src_port = dst_port = 0; 655 ulp = NULL; 656 ip6 = mtod(m, struct ip6_hdr *); 657 hlen = sizeof(struct ip6_hdr); 658 proto = ip6->ip6_nxt; 659 660 if ((*flags & FL_HASH_ALL) == 0) 661 goto skipports; 662 663 while (ulp == NULL) { 664 switch (proto) { 665 case IPPROTO_ICMPV6: 666 case IPPROTO_OSPFIGP: 667 case IPPROTO_PIM: 668 case IPPROTO_CARP: 669 case IPPROTO_ESP: 670 case IPPROTO_NONE: 671 ulp = ip6; 672 break; 673 
case IPPROTO_TCP: 674 PULLUP_TO(hlen, ulp, struct tcphdr); 675 dst_port = TCP(ulp)->th_dport; 676 src_port = TCP(ulp)->th_sport; 677 if ((*flags & FL_HASH_ALL) && 678 (TCP(ulp)->th_flags & (TH_RST|TH_FIN))) 679 *flags |= FL_STALE; 680 break; 681 case IPPROTO_SCTP: 682 PULLUP_TO(hlen, ulp, struct sctphdr); 683 src_port = SCTP(ulp)->src_port; 684 dst_port = SCTP(ulp)->dest_port; 685 break; 686 case IPPROTO_UDP: 687 PULLUP_TO(hlen, ulp, struct udphdr); 688 dst_port = UDP(ulp)->uh_dport; 689 src_port = UDP(ulp)->uh_sport; 690 break; 691 case IPPROTO_HOPOPTS: /* RFC 2460 */ 692 PULLUP_TO(hlen, ulp, struct ip6_hbh); 693 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 694 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 695 ulp = NULL; 696 break; 697 case IPPROTO_ROUTING: /* RFC 2460 */ 698 PULLUP_TO(hlen, ulp, struct ip6_rthdr); 699 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; 700 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; 701 ulp = NULL; 702 break; 703 case IPPROTO_FRAGMENT: /* RFC 2460 */ 704 PULLUP_TO(hlen, ulp, struct ip6_frag); 705 hlen += sizeof (struct ip6_frag); 706 proto = ((struct ip6_frag *)ulp)->ip6f_nxt; 707 offset = ((struct ip6_frag *)ulp)->ip6f_offlg & 708 IP6F_OFF_MASK; 709 ulp = NULL; 710 break; 711 case IPPROTO_DSTOPTS: /* RFC 2460 */ 712 PULLUP_TO(hlen, ulp, struct ip6_hbh); 713 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; 714 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; 715 ulp = NULL; 716 break; 717 case IPPROTO_AH: /* RFC 2402 */ 718 PULLUP_TO(hlen, ulp, struct ip6_ext); 719 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; 720 proto = ((struct ip6_ext *)ulp)->ip6e_nxt; 721 ulp = NULL; 722 break; 723 default: 724 PULLUP_TO(hlen, ulp, struct ip6_ext); 725 break; 726 } 727 } 728 729 if (src_port == 0) { 730 receive_failed: 731 return (ENOTSUP); 732 } 733 734skipports: 735 dsin6->sin6_family = AF_INET6; 736 dsin6->sin6_len = sizeof(*dsin6); 737 dsin6->sin6_port = dst_port; 738 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, 
sizeof(struct in6_addr)); 739 740 ssin6->sin6_family = AF_INET6; 741 ssin6->sin6_len = sizeof(*ssin6); 742 ssin6->sin6_port = src_port; 743 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr)); 744 *flags |= proto_to_flags(proto); 745 746 return (0); 747} 748 749#define zero_key(key) \ 750do { \ 751 key[0] = 0; \ 752 key[1] = 0; \ 753 key[2] = 0; \ 754 key[3] = 0; \ 755 key[4] = 0; \ 756 key[5] = 0; \ 757 key[6] = 0; \ 758 key[7] = 0; \ 759 key[8] = 0; \ 760} while (0) 761 762static uint32_t 763ipv6_flow_lookup_hash_internal( 764 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 765 uint32_t *key, uint16_t flags) 766{ 767 uint16_t sport, dport; 768 uint8_t proto; 769 int offset = 0; 770 771 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) 772 return (0); 773 774 proto = flags_to_proto(flags); 775 zero_key(key); 776 sport = dport = 0; 777 if (dsin6 != NULL) { 778 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr)); 779 dport = dsin6->sin6_port; 780 } 781 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) { 782 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr)); 783 sport = ssin6->sin6_port; 784 } 785 if (flags & FL_HASH_ALL) { 786 ((uint16_t *)key)[0] = sport; 787 ((uint16_t *)key)[1] = dport; 788 } else 789 offset = V_flow_hashjitter + proto; 790 791 return (jenkins_hashword(key, 9, offset)); 792} 793 794static struct flentry * 795flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m) 796{ 797 struct sockaddr_storage ssa, dsa; 798 struct sockaddr_in6 *dsin6, *ssin6; 799 uint16_t flags; 800 801 dsin6 = (struct sockaddr_in6 *)&dsa; 802 ssin6 = (struct sockaddr_in6 *)&ssa; 803 bzero(dsin6, sizeof(*dsin6)); 804 bzero(ssin6, sizeof(*ssin6)); 805 flags = ft->ft_flags; 806 807 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0) 808 return (NULL); 809 810 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); 811} 812 813void 814flow_to_route_in6(struct flentry *fle, struct route_in6 *ro) 815{ 816 uint32_t 
*hashkey = NULL; 817 struct sockaddr_in6 *sin6; 818 819 sin6 = (struct sockaddr_in6 *)&ro->ro_dst; 820 821 sin6->sin6_family = AF_INET6; 822 sin6->sin6_len = sizeof(*sin6); 823 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 824 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr)); 825 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); 826 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); 827 828} 829#endif /* INET6 */ 830 831static bitstr_t * 832flowtable_mask(struct flowtable *ft) 833{ 834 bitstr_t *mask; 835 836 if (ft->ft_flags & FL_PCPU) 837 mask = ft->ft_masks[curcpu]; 838 else 839 mask = ft->ft_masks[0]; 840 841 return (mask); 842} 843 844static struct flentry ** 845flowtable_entry(struct flowtable *ft, uint32_t hash) 846{ 847 struct flentry **fle; 848 int index = (hash % ft->ft_size); 849 850 if (ft->ft_flags & FL_PCPU) { 851 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set")); 852 fle = &ft->ft_table.pcpu[curcpu][index]; 853 } else { 854 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set")); 855 fle = &ft->ft_table.global[index]; 856 } 857 858 return (fle); 859} 860 861static int 862flow_stale(struct flowtable *ft, struct flentry *fle) 863{ 864 time_t idle_time; 865 866 if ((fle->f_fhash == 0) 867 || ((fle->f_rt->rt_flags & RTF_HOST) && 868 ((fle->f_rt->rt_flags & (RTF_UP)) 869 != (RTF_UP))) 870 || (fle->f_rt->rt_ifp == NULL) 871 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp)) 872 return (1); 873 874 idle_time = time_uptime - fle->f_uptime; 875 876 if ((fle->f_flags & FL_STALE) || 877 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0 878 && (idle_time > ft->ft_udp_idle)) || 879 ((fle->f_flags & TH_FIN) 880 && (idle_time > ft->ft_fin_wait_idle)) || 881 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN 882 && (idle_time > ft->ft_syn_idle)) || 883 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK) 884 && (idle_time > ft->ft_tcp_idle)) || 885 ((fle->f_rt->rt_flags & RTF_UP) == 0 || 886 (fle->f_rt->rt_ifp == NULL))) 887 return 
(1); 888 889 return (0); 890} 891 892static void 893flowtable_set_hashkey(struct flentry *fle, uint32_t *key) 894{ 895 uint32_t *hashkey; 896 int i, nwords; 897 898 if (fle->f_flags & FL_IPV6) { 899 nwords = 9; 900 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 901 } else { 902 nwords = 3; 903 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 904 } 905 906 for (i = 0; i < nwords; i++) 907 hashkey[i] = key[i]; 908} 909 910static struct flentry * 911flow_alloc(struct flowtable *ft) 912{ 913 struct flentry *newfle; 914 uma_zone_t zone; 915 916 newfle = NULL; 917 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; 918 919 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO); 920 if (newfle != NULL) 921 atomic_add_int(&ft->ft_count, 1); 922 return (newfle); 923} 924 925static void 926flow_free(struct flentry *fle, struct flowtable *ft) 927{ 928 uma_zone_t zone; 929 930 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; 931 atomic_add_int(&ft->ft_count, -1); 932 uma_zfree(zone, fle); 933} 934 935static int 936flow_full(struct flowtable *ft) 937{ 938 boolean_t full; 939 uint32_t count; 940 941 full = ft->ft_full; 942 count = ft->ft_count; 943 944 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3)))) 945 ft->ft_full = FALSE; 946 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5)))) 947 ft->ft_full = TRUE; 948 949 if (full && !ft->ft_full) { 950 flowclean_freq = 4*hz; 951 if ((ft->ft_flags & FL_HASH_ALL) == 0) 952 ft->ft_udp_idle = ft->ft_fin_wait_idle = 953 ft->ft_syn_idle = ft->ft_tcp_idle = 5; 954 cv_broadcast(&flowclean_cv); 955 } else if (!full && ft->ft_full) { 956 flowclean_freq = 20*hz; 957 if ((ft->ft_flags & FL_HASH_ALL) == 0) 958 ft->ft_udp_idle = ft->ft_fin_wait_idle = 959 ft->ft_syn_idle = ft->ft_tcp_idle = 30; 960 } 961 962 return (ft->ft_full); 963} 964 965static int 966flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, 967 uint32_t fibnum, struct 
route *ro, uint16_t flags) 968{ 969 struct flentry *fle, *fletail, *newfle, **flep; 970 struct flowtable_stats *fs = &ft->ft_stats[curcpu]; 971 int depth; 972 bitstr_t *mask; 973 uint8_t proto; 974 975 newfle = flow_alloc(ft); 976 if (newfle == NULL) 977 return (ENOMEM); 978 979 newfle->f_flags |= (flags & FL_IPV6); 980 proto = flags_to_proto(flags); 981 982 FL_ENTRY_LOCK(ft, hash); 983 mask = flowtable_mask(ft); 984 flep = flowtable_entry(ft, hash); 985 fletail = fle = *flep; 986 987 if (fle == NULL) { 988 bit_set(mask, FL_ENTRY_INDEX(ft, hash)); 989 *flep = fle = newfle; 990 goto skip; 991 } 992 993 depth = 0; 994 fs->ft_collisions++; 995 /* 996 * find end of list and make sure that we were not 997 * preempted by another thread handling this flow 998 */ 999 while (fle != NULL) { 1000 if (fle->f_fhash == hash && !flow_stale(ft, fle)) { 1001 /* 1002 * there was either a hash collision 1003 * or we lost a race to insert 1004 */ 1005 FL_ENTRY_UNLOCK(ft, hash); 1006 flow_free(newfle, ft); 1007 1008 if (flags & FL_OVERWRITE) 1009 goto skip; 1010 return (EEXIST); 1011 } 1012 /* 1013 * re-visit this double condition XXX 1014 */ 1015 if (fletail->f_next != NULL) 1016 fletail = fle->f_next; 1017 1018 depth++; 1019 fle = fle->f_next; 1020 } 1021 1022 if (depth > fs->ft_max_depth) 1023 fs->ft_max_depth = depth; 1024 fletail->f_next = newfle; 1025 fle = newfle; 1026skip: 1027 flowtable_set_hashkey(fle, key); 1028 1029 fle->f_proto = proto; 1030 fle->f_rt = ro->ro_rt; 1031 fle->f_lle = ro->ro_lle; 1032 fle->f_fhash = hash; 1033 fle->f_fibnum = fibnum; 1034 fle->f_uptime = time_uptime; 1035 FL_ENTRY_UNLOCK(ft, hash); 1036 return (0); 1037} 1038 1039int 1040kern_flowtable_insert(struct flowtable *ft, 1041 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa, 1042 struct route *ro, uint32_t fibnum, int flags) 1043{ 1044 uint32_t key[9], hash; 1045 1046 flags = (ft->ft_flags | flags | FL_OVERWRITE); 1047 hash = 0; 1048 1049#ifdef INET 1050 if (ssa->ss_family == AF_INET) 1051 
hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa, 1052 (struct sockaddr_in *)dsa, key, flags); 1053#endif 1054#ifdef INET6 1055 if (ssa->ss_family == AF_INET6) 1056 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa, 1057 (struct sockaddr_in6 *)dsa, key, flags); 1058#endif 1059 if (ro->ro_rt == NULL || ro->ro_lle == NULL) 1060 return (EINVAL); 1061 1062 FLDPRINTF(ft, FL_DEBUG, 1063 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n", 1064 key[0], key[1], key[2], hash, fibnum, flags); 1065 return (flowtable_insert(ft, hash, key, fibnum, ro, flags)); 1066} 1067 1068static int 1069flowtable_key_equal(struct flentry *fle, uint32_t *key) 1070{ 1071 uint32_t *hashkey; 1072 int i, nwords; 1073 1074 if (fle->f_flags & FL_IPV6) { 1075 nwords = 9; 1076 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 1077 } else { 1078 nwords = 3; 1079 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 1080 } 1081 1082 for (i = 0; i < nwords; i++) 1083 if (hashkey[i] != key[i]) 1084 return (0); 1085 1086 return (1); 1087} 1088 1089struct flentry * 1090flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af) 1091{ 1092 struct flentry *fle = NULL; 1093 1094#ifdef INET 1095 if (af == AF_INET) 1096 fle = flowtable_lookup_mbuf4(ft, m); 1097#endif 1098#ifdef INET6 1099 if (af == AF_INET6) 1100 fle = flowtable_lookup_mbuf6(ft, m); 1101#endif 1102 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) { 1103 m->m_flags |= M_FLOWID; 1104 m->m_pkthdr.flowid = fle->f_fhash; 1105 } 1106 return (fle); 1107} 1108 1109struct flentry * 1110flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, 1111 struct sockaddr_storage *dsa, uint32_t fibnum, int flags) 1112{ 1113 uint32_t key[9], hash; 1114 struct flentry *fle; 1115 struct flowtable_stats *fs = &ft->ft_stats[curcpu]; 1116 uint8_t proto = 0; 1117 int error = 0; 1118 struct rtentry *rt; 1119 struct llentry *lle; 1120 struct route sro, *ro; 1121 struct route_in6 sro6; 1122 
1123 sro.ro_rt = sro6.ro_rt = NULL; 1124 sro.ro_lle = sro6.ro_lle = NULL; 1125 ro = NULL; 1126 hash = 0; 1127 flags |= ft->ft_flags; 1128 proto = flags_to_proto(flags); 1129#ifdef INET 1130 if (ssa->ss_family == AF_INET) { 1131 struct sockaddr_in *ssin, *dsin; 1132 1133 ro = &sro; 1134 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in)); 1135 /* 1136 * The harvested source and destination addresses 1137 * may contain port information if the packet is 1138 * from a transport protocol (e.g. TCP/UDP). The 1139 * port field must be cleared before performing 1140 * a route lookup. 1141 */ 1142 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0; 1143 dsin = (struct sockaddr_in *)dsa; 1144 ssin = (struct sockaddr_in *)ssa; 1145 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) || 1146 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 1147 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) 1148 return (NULL); 1149 1150 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags); 1151 } 1152#endif 1153#ifdef INET6 1154 if (ssa->ss_family == AF_INET6) { 1155 struct sockaddr_in6 *ssin6, *dsin6; 1156 1157 ro = (struct route *)&sro6; 1158 memcpy(&sro6.ro_dst, dsa, 1159 sizeof(struct sockaddr_in6)); 1160 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0; 1161 dsin6 = (struct sockaddr_in6 *)dsa; 1162 ssin6 = (struct sockaddr_in6 *)ssa; 1163 1164 flags |= FL_IPV6; 1165 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags); 1166 } 1167#endif 1168 /* 1169 * Ports are zero and this isn't a transmit cache 1170 * - thus not a protocol for which we need to keep 1171 * state 1172 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP 1173 */ 1174 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))) 1175 return (NULL); 1176 1177 fs->ft_lookups++; 1178 FL_ENTRY_LOCK(ft, hash); 1179 if ((fle = FL_ENTRY(ft, hash)) == NULL) { 1180 FL_ENTRY_UNLOCK(ft, hash); 1181 goto uncached; 1182 } 1183keycheck: 1184 rt = __DEVOLATILE(struct 
rtentry *, fle->f_rt); 1185 lle = __DEVOLATILE(struct llentry *, fle->f_lle); 1186 if ((rt != NULL) 1187 && fle->f_fhash == hash 1188 && flowtable_key_equal(fle, key) 1189 && (proto == fle->f_proto) 1190 && (fibnum == fle->f_fibnum) 1191 && (rt->rt_flags & RTF_UP) 1192 && (rt->rt_ifp != NULL)) { 1193 fs->ft_hits++; 1194 fle->f_uptime = time_uptime; 1195 fle->f_flags |= flags; 1196 FL_ENTRY_UNLOCK(ft, hash); 1197 return (fle); 1198 } else if (fle->f_next != NULL) { 1199 fle = fle->f_next; 1200 goto keycheck; 1201 } 1202 FL_ENTRY_UNLOCK(ft, hash); 1203uncached: 1204 if (flags & FL_NOAUTO || flow_full(ft)) 1205 return (NULL); 1206 1207 fs->ft_misses++; 1208 /* 1209 * This bit of code ends up locking the 1210 * same route 3 times (just like ip_output + ether_output) 1211 * - at lookup 1212 * - in rt_check when called by arpresolve 1213 * - dropping the refcount for the rtentry 1214 * 1215 * This could be consolidated to one if we wrote a variant 1216 * of arpresolve with an rt_check variant that expected to 1217 * receive the route locked 1218 */ 1219 1220#ifdef INVARIANTS 1221 if ((ro->ro_dst.sa_family != AF_INET) && 1222 (ro->ro_dst.sa_family != AF_INET6)) 1223 panic("sa_family == %d\n", ro->ro_dst.sa_family); 1224#endif 1225 1226 ft->ft_rtalloc(ro, hash, fibnum); 1227 if (ro->ro_rt == NULL) 1228 error = ENETUNREACH; 1229 else { 1230 struct llentry *lle = NULL; 1231 struct sockaddr_storage *l3addr; 1232 struct rtentry *rt = ro->ro_rt; 1233 struct ifnet *ifp = rt->rt_ifp; 1234 1235 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { 1236 RTFREE(rt); 1237 ro->ro_rt = NULL; 1238 return (NULL); 1239 } 1240#ifdef INET6 1241 if (ssa->ss_family == AF_INET6) { 1242 struct sockaddr_in6 *dsin6; 1243 1244 dsin6 = (struct sockaddr_in6 *)dsa; 1245 if (in6_localaddr(&dsin6->sin6_addr)) { 1246 RTFREE(rt); 1247 ro->ro_rt = NULL; 1248 return (NULL); 1249 } 1250 1251 if (rt->rt_flags & RTF_GATEWAY) 1252 l3addr = (struct sockaddr_storage *)rt->rt_gateway; 1253 1254 else 1255 l3addr 
= (struct sockaddr_storage *)&ro->ro_dst; 1256 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp); 1257 } 1258#endif 1259#ifdef INET 1260 if (ssa->ss_family == AF_INET) { 1261 if (rt->rt_flags & RTF_GATEWAY) 1262 l3addr = (struct sockaddr_storage *)rt->rt_gateway; 1263 else 1264 l3addr = (struct sockaddr_storage *)&ro->ro_dst; 1265 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp); 1266 } 1267 1268#endif 1269 ro->ro_lle = lle; 1270 1271 if (lle == NULL) { 1272 RTFREE(rt); 1273 ro->ro_rt = NULL; 1274 return (NULL); 1275 } 1276 error = flowtable_insert(ft, hash, key, fibnum, ro, flags); 1277 1278 if (error) { 1279 RTFREE(rt); 1280 LLE_FREE(lle); 1281 ro->ro_rt = NULL; 1282 ro->ro_lle = NULL; 1283 } 1284 } 1285 1286 return ((error) ? NULL : fle); 1287} 1288 1289/* 1290 * used by the bit_alloc macro 1291 */ 1292#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO) 1293 1294struct flowtable * 1295flowtable_alloc(char *name, int nentry, int flags) 1296{ 1297 struct flowtable *ft, *fttail; 1298 int i; 1299 1300 if (V_flow_hashjitter == 0) 1301 V_flow_hashjitter = arc4random(); 1302 1303 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry)); 1304 1305 ft = malloc(sizeof(struct flowtable), 1306 M_RTABLE, M_WAITOK | M_ZERO); 1307 1308 ft->ft_name = name; 1309 ft->ft_flags = flags; 1310 ft->ft_size = nentry; 1311#ifdef RADIX_MPATH 1312 ft->ft_rtalloc = rtalloc_mpath_fib; 1313#else 1314 ft->ft_rtalloc = in_rtalloc_ign_wrapper; 1315#endif 1316 if (flags & FL_PCPU) { 1317 ft->ft_lock = flowtable_pcpu_lock; 1318 ft->ft_unlock = flowtable_pcpu_unlock; 1319 1320 for (i = 0; i <= mp_maxid; i++) { 1321 ft->ft_table.pcpu[i] = 1322 malloc(nentry*sizeof(struct flentry *), 1323 M_RTABLE, M_WAITOK | M_ZERO); 1324 ft->ft_masks[i] = bit_alloc(nentry); 1325 } 1326 } else { 1327 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? 
(mp_maxid + 1): 1328 (fls(mp_maxid + 1) << 1)); 1329 1330 ft->ft_lock = flowtable_global_lock; 1331 ft->ft_unlock = flowtable_global_unlock; 1332 ft->ft_table.global = 1333 malloc(nentry*sizeof(struct flentry *), 1334 M_RTABLE, M_WAITOK | M_ZERO); 1335 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx), 1336 M_RTABLE, M_WAITOK | M_ZERO); 1337 for (i = 0; i < ft->ft_lock_count; i++) 1338 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK); 1339 1340 ft->ft_masks[0] = bit_alloc(nentry); 1341 } 1342 ft->ft_tmpmask = bit_alloc(nentry); 1343 1344 /* 1345 * In the local transmit case the table truly is 1346 * just a cache - so everything is eligible for 1347 * replacement after 5s of non-use 1348 */ 1349 if (flags & FL_HASH_ALL) { 1350 ft->ft_udp_idle = V_flowtable_udp_expire; 1351 ft->ft_syn_idle = V_flowtable_syn_expire; 1352 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire; 1353 ft->ft_tcp_idle = V_flowtable_fin_wait_expire; 1354 } else { 1355 ft->ft_udp_idle = ft->ft_fin_wait_idle = 1356 ft->ft_syn_idle = ft->ft_tcp_idle = 30; 1357 1358 } 1359 1360 /* 1361 * hook in to the cleaner list 1362 */ 1363 if (V_flow_list_head == NULL) 1364 V_flow_list_head = ft; 1365 else { 1366 fttail = V_flow_list_head; 1367 while (fttail->ft_next != NULL) 1368 fttail = fttail->ft_next; 1369 fttail->ft_next = ft; 1370 } 1371 1372 return (ft); 1373} 1374 1375/* 1376 * The rest of the code is devoted to garbage collection of expired entries. 1377 * It is a new additon made necessary by the switch to dynamically allocating 1378 * flow tables. 
 *
 */

/*
 * Release the references held by a flow entry (route and L2 entry) and
 * return the entry itself to its UMA zone via flow_free().
 */
static void
fle_free(struct flentry *fle, struct flowtable *ft)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if (rt != NULL)
		RTFREE(rt);
	if (lle != NULL)
		LLE_FREE(lle);
	flow_free(fle, ft);
}

/*
 * Walk every occupied bucket of the (current CPU's, for FL_PCPU tables)
 * flowtable and unlink entries that are stale, or — when "rt" is non-NULL —
 * entries referencing that route.  Unlinked entries are collected on a
 * private list and freed only after the bucket lock is dropped, so that
 * fle_free()/uma_zfree() is never called with a bucket lock (or critical
 * section) held.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	/* Snapshot the occupancy bitmap; we iterate over the copy. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		/* Unlink matching entries from the bucket's singly-linked list. */
		while (fle != NULL) {
			if (rt != NULL) {
				/* Route flush: keep entries on other routes. */
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the victim to the deferred-free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Bucket emptied: clear its bit in the live occupancy mask. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* Now free everything collected above, outside all bucket locks. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}

/*
 * Flush all flow entries referencing route "rt" from table "ft".  For
 * per-CPU tables the calling thread is bound to each CPU in turn (once
 * SMP is up) so the per-CPU buckets are scanned on their owning CPU.
 */
void
flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
{
	int i;

	if (ft->ft_flags & FL_PCPU) {
		CPU_FOREACH(i) {
			if (smp_started == 1) {
				thread_lock(curthread);
				sched_bind(curthread, i);
				thread_unlock(curthread);
			}

			flowtable_free_stale(ft, rt);

			if (smp_started == 1) {
				thread_lock(curthread);
				sched_unbind(curthread);
				thread_unlock(curthread);
			}
		}
	} else {
		flowtable_free_stale(ft, rt);
	}
}

/*
 * Run a stale-entry sweep over every flowtable in the current vnet,
 * using the same CPU-binding scheme as flowtable_route_flush().
 */
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			CPU_FOREACH(i) {
				if (smp_started == 1) {
					thread_lock(curthread);
					sched_bind(curthread, i);
					thread_unlock(curthread);
				}

				flowtable_free_stale(ft, NULL);

				if (smp_started == 1) {
					thread_lock(curthread);
					sched_unbind(curthread);
					thread_unlock(curthread);
				}
			}
		} else {
			flowtable_free_stale(ft, NULL);
		}
		ft = ft->ft_next;
	}
}

/*
 * Main loop of the "flowcleaner" kernel process: periodically sweep all
 * vnets' flowtables and signal waiters (see flowtable_flush()).
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

1552 if (bootverbose) 1553 log(LOG_INFO, "flowtable cleaner started\n"); 1554 while (1) { 1555 VNET_LIST_RLOCK(); 1556 VNET_FOREACH(vnet_iter) { 1557 CURVNET_SET(vnet_iter); 1558 flowtable_clean_vnet(); 1559 CURVNET_RESTORE(); 1560 } 1561 VNET_LIST_RUNLOCK(); 1562 1563 flowclean_cycles++; 1564 /* 1565 * The 10 second interval between cleaning checks 1566 * is arbitrary 1567 */ 1568 mtx_lock(&flowclean_lock); 1569 cv_broadcast(&flowclean_cv); 1570 cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq); 1571 mtx_unlock(&flowclean_lock); 1572 } 1573} 1574 1575static void 1576flowtable_flush(void *unused __unused) 1577{ 1578 uint64_t start; 1579 1580 mtx_lock(&flowclean_lock); 1581 start = flowclean_cycles; 1582 while (start == flowclean_cycles) { 1583 cv_broadcast(&flowclean_cv); 1584 cv_wait(&flowclean_cv, &flowclean_lock); 1585 } 1586 mtx_unlock(&flowclean_lock); 1587} 1588 1589static struct kproc_desc flow_kp = { 1590 "flowcleaner", 1591 flowtable_cleaner, 1592 &flowcleanerproc 1593}; 1594SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); 1595 1596static void 1597flowtable_init_vnet(const void *unused __unused) 1598{ 1599 1600 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus; 1601 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4), 1602 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); 1603 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6), 1604 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); 1605 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows); 1606 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows); 1607 V_flowtable_ready = 1; 1608} 1609VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY, 1610 flowtable_init_vnet, NULL); 1611 1612static void 1613flowtable_init(const void *unused __unused) 1614{ 1615 1616 cv_init(&flowclean_cv, "flowcleanwait"); 1617 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); 1618 EVENTHANDLER_REGISTER(ifnet_departure_event, 
flowtable_flush, NULL, 1619 EVENTHANDLER_PRI_ANY); 1620 flowclean_freq = 20*hz; 1621} 1622SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, 1623 flowtable_init, NULL); 1624 1625 1626#ifdef VIMAGE 1627static void 1628flowtable_uninit(const void *unused __unused) 1629{ 1630 1631 V_flowtable_ready = 0; 1632 uma_zdestroy(V_flow_ipv4_zone); 1633 uma_zdestroy(V_flow_ipv6_zone); 1634} 1635 1636VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, 1637 flowtable_uninit, NULL); 1638#endif 1639 1640#ifdef DDB 1641static uint32_t * 1642flowtable_get_hashkey(struct flentry *fle) 1643{ 1644 uint32_t *hashkey; 1645 1646 if (fle->f_flags & FL_IPV6) 1647 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 1648 else 1649 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 1650 1651 return (hashkey); 1652} 1653 1654static bitstr_t * 1655flowtable_mask_pcpu(struct flowtable *ft, int cpuid) 1656{ 1657 bitstr_t *mask; 1658 1659 if (ft->ft_flags & FL_PCPU) 1660 mask = ft->ft_masks[cpuid]; 1661 else 1662 mask = ft->ft_masks[0]; 1663 1664 return (mask); 1665} 1666 1667static struct flentry ** 1668flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) 1669{ 1670 struct flentry **fle; 1671 int index = (hash % ft->ft_size); 1672 1673 if (ft->ft_flags & FL_PCPU) { 1674 fle = &ft->ft_table.pcpu[cpuid][index]; 1675 } else { 1676 fle = &ft->ft_table.global[index]; 1677 } 1678 1679 return (fle); 1680} 1681 1682static void 1683flow_show(struct flowtable *ft, struct flentry *fle) 1684{ 1685 int idle_time; 1686 int rt_valid, ifp_valid; 1687 uint16_t sport, dport; 1688 uint32_t *hashkey; 1689 char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; 1690 volatile struct rtentry *rt; 1691 struct ifnet *ifp = NULL; 1692 1693 idle_time = (int)(time_uptime - fle->f_uptime); 1694 rt = fle->f_rt; 1695 rt_valid = rt != NULL; 1696 if (rt_valid) 1697 ifp = rt->rt_ifp; 1698 ifp_valid = ifp != NULL; 1699 hashkey = flowtable_get_hashkey(fle); 1700 if (fle->f_flags & FL_IPV6) 
		goto skipaddr;

	/* IPv4 key layout: [2] = dst addr, [1] = src addr, ports in [0]. */
	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else
		db_printf("%s ", daddr);

skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}

/*
 * Dump every entry of one table view from DDB: cpuid selects a per-CPU
 * slice for FL_PCPU tables, -1 selects the global table.  Iterates over
 * a snapshot of the occupancy bitmap, same scheme as
 * flowtable_free_stale() but lock-free (DDB context).
 */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle,  **flehead;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		/* Walk the bucket chain. */
		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
			continue;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

/* Dump all flowtables of the current vnet. */
static void
flowtable_show_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		printf("name: %s\n", ft->ft_name);
		if (ft->ft_flags & FL_PCPU) {
			CPU_FOREACH(i) {
				flowtable_show(ft, i);
			}
		} else {
			flowtable_show(ft, -1);
		}
		ft = ft->ft_next;
	}
}

/* DDB "show flowtables" command: dump tables across all vnets. */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
#endif
/* Per-vnet accessors for the virtualized flowtable variables. */
#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_debug		VNET(flowtable_debug)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)

/* net.inet.flowtable.* knobs. */
SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");


/*
 * Maximum number of flows that can be allocated of a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
/*
 * Sysctl handler for net.inet.flowtable.nmbflows: the limit may only be
 * raised at runtime (the zones' caps are grown accordingly); attempts to
 * lower it return EINVAL — note this also rejects writing back the
 * current value unchanged.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbflows;

	newnmbflows = V_flowtable_nmbflows;
	error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbflows > V_flowtable_nmbflows) {
			V_flowtable_nmbflows = newnmbflows;
			uma_zone_set_max(V_flow_ipv4_zone,
			    V_flowtable_nmbflows);
			uma_zone_set_max(V_flow_ipv6_zone,
			    V_flowtable_nmbflows);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");



/* Emit one "\t<name>: <value>" stat line; fs must be in scope. */
#define FS_PRINT(sb, field)	sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)

/* Print all counters of one flowtable_stats into the sbuf. */
static void
fs_print(struct sbuf *sb, struct flowtable_stats *fs)
{

	FS_PRINT(sb, collisions);
	FS_PRINT(sb, allocated);
	FS_PRINT(sb, misses);
	FS_PRINT(sb, max_depth);
	FS_PRINT(sb, free_checks);
	FS_PRINT(sb, frees);
	FS_PRINT(sb, hits);
	FS_PRINT(sb, lookups);
}

/*
 * Print a table's statistics.  For per-CPU tables the per-CPU counters
 * are summed (max_depth takes the maximum) into a local aggregate first.
 */
static void
flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
{
	int i;
	struct flowtable_stats fs, *pfs;

	if (ft->ft_flags & FL_PCPU) {
		bzero(&fs, sizeof(fs));
		pfs = &fs;
		CPU_FOREACH(i) {
			pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
			pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
			pfs->ft_misses += ft->ft_stats[i].ft_misses;
			pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
			pfs->ft_frees += ft->ft_stats[i].ft_frees;
			pfs->ft_hits += ft->ft_stats[i].ft_hits;
			pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
			if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
				pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
		}
	} else {
		pfs = &ft->ft_stats[0];
	}
	fs_print(sb, pfs);
}

/* Sysctl handler for net.inet.flowtable.stats (read-only string). */
static int
sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
{
	struct flowtable *ft;
	struct sbuf *sb;
int error; 354 355 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN); 356 357 ft = V_flow_list_head; 358 while (ft != NULL) { 359 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name); 360 flowtable_show_stats(sb, ft); 361 ft = ft->ft_next; 362 } 363 sbuf_finish(sb); 364 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); 365 sbuf_delete(sb); 366 367 return (error); 368} 369SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, 370 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics"); 371 372 373#ifndef RADIX_MPATH 374static void 375in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) 376{ 377 378 rtalloc_ign_fib(ro, 0, fibnum); 379} 380#endif 381 382static void 383flowtable_global_lock(struct flowtable *table, uint32_t hash) 384{ 385 int lock_index = (hash)&(table->ft_lock_count - 1); 386 387 mtx_lock(&table->ft_locks[lock_index]); 388} 389 390static void 391flowtable_global_unlock(struct flowtable *table, uint32_t hash) 392{ 393 int lock_index = (hash)&(table->ft_lock_count - 1); 394 395 mtx_unlock(&table->ft_locks[lock_index]); 396} 397 398static void 399flowtable_pcpu_lock(struct flowtable *table, uint32_t hash) 400{ 401 402 critical_enter(); 403} 404 405static void 406flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) 407{ 408 409 critical_exit(); 410} 411 412#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) 413#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash)) 414#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) 415#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) 416 417#define FL_STALE (1<<8) 418#define FL_OVERWRITE (1<<10) 419 420void 421flow_invalidate(struct flentry *fle) 422{ 423 424 fle->f_flags |= FL_STALE; 425} 426 427static __inline int 428proto_to_flags(uint8_t proto) 429{ 430 int flag; 431 432 switch (proto) { 433 case IPPROTO_TCP: 434 flag = FL_TCP; 435 break; 436 case IPPROTO_SCTP: 437 flag = FL_SCTP; 438 break; 
	case IPPROTO_UDP:
		flag = FL_UDP;
		break;
	default:
		flag = 0;
		break;
	}

	return (flag);
}

/* Inverse of proto_to_flags(): FL_* protocol flag back to IPPROTO_*. */
static __inline int
flags_to_proto(int flags)
{
	int proto, protoflags;

	protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
	switch (protoflags) {
	case FL_TCP:
		proto = IPPROTO_TCP;
		break;
	case FL_SCTP:
		proto = IPPROTO_SCTP;
		break;
	case FL_UDP:
		proto = IPPROTO_UDP;
		break;
	default:
		proto = 0;
		break;
	}
	return (proto);
}

#ifdef INET
#ifdef FLOWTABLE_DEBUG
/* Debug helper: print an IPv4 flow tuple (addresses, and ports if hashed). */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

	if (flags & FL_HASH_ALL) {
		inet_ntoa_r(ssin->sin_addr, saddr);
		inet_ntoa_r(dsin->sin_addr, daddr);
		printf("proto=%d %s:%d->%s:%d\n",
		    proto, saddr, ntohs(ssin->sin_port), daddr,
		    ntohs(dsin->sin_port));
	} else {
		inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
		printf("proto=%d %s\n", proto, daddr);
	}

}
#endif

/*
 * Extract the IPv4 flow tuple from an mbuf into ssin/dsin.  For
 * FL_HASH_ALL tables the transport header is parsed for ports (TCP
 * RST/FIN additionally marks the flow FL_STALE); returns ENOTSUP for
 * protocols without ports.  Assumes the IP (and transport) header is
 * contiguous in the first mbuf — no m_pullup is done here.
 */
static int
ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
{
	struct ip *ip;
	uint8_t proto;
	int iphlen;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;
	uint16_t sport, dport;

	proto = sport = dport = 0;
	ip = mtod(m, struct ip *);
	dsin->sin_family = AF_INET;
	dsin->sin_len = sizeof(*dsin);
	dsin->sin_addr = ip->ip_dst;
	ssin->sin_family = AF_INET;
	ssin->sin_len = sizeof(*ssin);
	ssin->sin_addr = ip->ip_src;

	proto = ip->ip_p;
	if ((*flags & FL_HASH_ALL) == 0) {
		FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
		    *flags);
		goto skipports;
	}

	iphlen = ip->ip_hl << 2; /* XXX options?
 */

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* Connection teardown: mark the flow for early reclamation. */
		if ((*flags & FL_HASH_ALL) &&
		    (th->th_flags & (TH_RST|TH_FIN)))
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		break;
	default:
		FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
		return (ENOTSUP);
		/* no port - hence not a protocol we care about */
		break;

	}

skipports:
	*flags |= proto_to_flags(proto);
	ssin->sin_port = sport;
	dsin->sin_port = dport;
	return (0);
}

/*
 * Hash an IPv4 flow key.  Key layout: key[1] = src addr, key[2] = dst
 * addr; with FL_HASH_ALL the src/dst ports are packed into key[0],
 * otherwise a per-boot jitter plus the protocol seeds the hash.
 * Returns 0 (no flow caching) when the flowtable is disabled/not ready.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
	struct sockaddr_in *ssin, struct sockaddr_in *dsin,
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);
	proto = flags_to_proto(flags);
	sport = dport = key[2] = key[1] = key[0] = 0;
	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
		key[1] = ssin->sin_addr.s_addr;
		sport = ssin->sin_port;
	}
	if (dsin != NULL) {
		key[2] = dsin->sin_addr.s_addr;
		dport = dsin->sin_port;
	}
	if (flags & FL_HASH_ALL) {
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 3, offset));
}

/* Demarshal an IPv4 mbuf and look up (or create) its flow entry. */
static struct flentry *
flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
{
	struct sockaddr_storage ssa, dsa;
	uint16_t flags;
	struct sockaddr_in *dsin, *ssin;

	dsin = (struct sockaddr_in *)&dsa;
	ssin = (struct sockaddr_in *)&ssa;
	bzero(dsin, sizeof(*dsin));
	bzero(ssin, sizeof(*ssin));
	flags = ft->ft_flags;
	if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
		return (NULL);

	return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
}

/*
 * Populate a struct route from a cached IPv4 flow entry: destination
 * address (key word 2) plus the cached rtentry and llentry pointers.
 * No references are acquired here; the flow entry holds them.
 */
void
flow_to_route(struct flentry *fle, struct route *ro)
{
	uint32_t *hashkey = NULL;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	sin->sin_addr.s_addr = hashkey[2];
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
}
#endif /* INET */

#ifdef INET6
/*
 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x) {						\
		goto receive_failed;					\
	}								\
	p = (mtod(m, char *) + (_len));					\
} while (0)

#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))

/*
 * Extract the IPv6 flow tuple from an mbuf into ssin6/dsin6, walking
 * the extension-header chain to find the transport header when the
 * table hashes on ports (FL_HASH_ALL).  Returns ENOTSUP when no usable
 * transport header is found or the chain extends past the first mbuf.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
	struct ip6_hdr *ip6;
	uint8_t proto;
	int hlen;
	uint16_t src_port, dst_port;
	u_short offset;
	void *ulp;

	offset = hlen = src_port = dst_port = 0;
	ulp = NULL;
	ip6 = mtod(m, struct ip6_hdr *);
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;

	if ((*flags & FL_HASH_ALL) == 0)
		goto skipports;

	/* ulp != NULL terminates the extension-header walk. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			/* Portless protocols: stop the walk, ports stay 0. */
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dst_port = TCP(ulp)->th_dport;
			src_port = TCP(ulp)->th_sport;
			/* Teardown segments mark the flow for reclamation. */
			if ((*flags & FL_HASH_ALL) &&
			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
				*flags |= FL_STALE;
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			src_port = SCTP(ulp)->src_port;
			dst_port = SCTP(ulp)->dest_port;
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dst_port = UDP(ulp)->uh_dport;
			src_port = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;	/* keep walking past this ext header */
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	/* No ports found (or PULLUP_TO jumped here): not a flow we track. */
	if (src_port == 0) {
	receive_failed:
		return (ENOTSUP);
	}

skipports:
	dsin6->sin6_family = AF_INET6;
	dsin6->sin6_len = sizeof(*dsin6);
	dsin6->sin6_port = dst_port;
	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst,
	    sizeof(struct in6_addr));

	ssin6->sin6_family = AF_INET6;
	ssin6->sin6_len = sizeof(*ssin6);
	ssin6->sin6_port = src_port;
	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
	*flags |= proto_to_flags(proto);

	return (0);
}

/* Clear all nine 32-bit words of an IPv6 flow key. */
#define zero_key(key) 		\
do {				\
	key[0] = 0;		\
	key[1] = 0;		\
	key[2] = 0;		\
	key[3] = 0;		\
	key[4] = 0;		\
	key[5] = 0;		\
	key[6] = 0;		\
	key[7] = 0;		\
	key[8] = 0;		\
} while (0)

/*
 * Hash an IPv6 flow key.  Key layout: key[1..4] = dst addr,
 * key[5..8] = src addr; with FL_HASH_ALL the ports are packed into
 * key[0], otherwise a per-boot jitter plus protocol seeds the hash.
 * Returns 0 when the flowtable is disabled or not yet ready.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	proto = flags_to_proto(flags);
	zero_key(key);
	sport = dport = 0;
	if (dsin6 != NULL) {
		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
		dport = dsin6->sin6_port;
	}
	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
		sport = ssin6->sin6_port;
	}
	if (flags & FL_HASH_ALL) {
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 9, offset));
}

/* Demarshal an IPv6 mbuf and look up (or create) its flow entry. */
static struct flentry *
flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
{
	struct sockaddr_storage ssa, dsa;
	struct sockaddr_in6 *dsin6, *ssin6;
	uint16_t flags;

	dsin6 = (struct sockaddr_in6 *)&dsa;
	ssin6 = (struct sockaddr_in6 *)&ssa;
	bzero(dsin6, sizeof(*dsin6));
	bzero(ssin6, sizeof(*ssin6));
	flags = ft->ft_flags;

	if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
		return (NULL);

	return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
}

/*
 * Populate a struct route_in6 from a cached IPv6 flow entry (destination
 * address is key words 5..8 plus the cached rtentry/llentry pointers).
 */
void
flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
{
	uint32_t
	    *hashkey = NULL;
	struct sockaddr_in6 *sin6;

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;

	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);

}
#endif /* INET6 */

/* Occupancy bitmap for the current CPU (or the single global bitmap). */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[curcpu];
	else
		mask = ft->ft_masks[0];

	return (mask);
}

/*
 * Return the bucket head for "hash" on the current CPU (or in the
 * global table).
 *
 * NOTE(review): the KASSERTs test "&array[0] != NULL", which is the
 * address of the first element and therefore never NULL — they look
 * intended to check the table pointer itself; confirm and fix upstream.
 */
static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU) {
		KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
		fle = &ft->ft_table.pcpu[curcpu][index];
	} else {
		KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
		fle = &ft->ft_table.global[index];
	}

	return (fle);
}

/*
 * Decide whether a flow entry may be reclaimed: unused slot, dead or
 * down route/interface, explicit FL_STALE, or idle longer than the
 * per-state timeout (TH_SYN/TH_ACK/TH_FIN bits cached in f_flags encode
 * the TCP state; no bits set means a non-TCP flow using the UDP timeout).
 *
 * NOTE(review): the final (RTF_UP == 0 || rt_ifp == NULL) clause
 * duplicates checks already made by the first if() above — apparently
 * redundant; confirm before removing.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL)
	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return
(1); 888 889 return (0); 890} 891 892static void 893flowtable_set_hashkey(struct flentry *fle, uint32_t *key) 894{ 895 uint32_t *hashkey; 896 int i, nwords; 897 898 if (fle->f_flags & FL_IPV6) { 899 nwords = 9; 900 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 901 } else { 902 nwords = 3; 903 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 904 } 905 906 for (i = 0; i < nwords; i++) 907 hashkey[i] = key[i]; 908} 909 910static struct flentry * 911flow_alloc(struct flowtable *ft) 912{ 913 struct flentry *newfle; 914 uma_zone_t zone; 915 916 newfle = NULL; 917 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; 918 919 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO); 920 if (newfle != NULL) 921 atomic_add_int(&ft->ft_count, 1); 922 return (newfle); 923} 924 925static void 926flow_free(struct flentry *fle, struct flowtable *ft) 927{ 928 uma_zone_t zone; 929 930 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; 931 atomic_add_int(&ft->ft_count, -1); 932 uma_zfree(zone, fle); 933} 934 935static int 936flow_full(struct flowtable *ft) 937{ 938 boolean_t full; 939 uint32_t count; 940 941 full = ft->ft_full; 942 count = ft->ft_count; 943 944 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3)))) 945 ft->ft_full = FALSE; 946 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5)))) 947 ft->ft_full = TRUE; 948 949 if (full && !ft->ft_full) { 950 flowclean_freq = 4*hz; 951 if ((ft->ft_flags & FL_HASH_ALL) == 0) 952 ft->ft_udp_idle = ft->ft_fin_wait_idle = 953 ft->ft_syn_idle = ft->ft_tcp_idle = 5; 954 cv_broadcast(&flowclean_cv); 955 } else if (!full && ft->ft_full) { 956 flowclean_freq = 20*hz; 957 if ((ft->ft_flags & FL_HASH_ALL) == 0) 958 ft->ft_udp_idle = ft->ft_fin_wait_idle = 959 ft->ft_syn_idle = ft->ft_tcp_idle = 30; 960 } 961 962 return (ft->ft_full); 963} 964 965static int 966flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, 967 uint32_t fibnum, struct 
route *ro, uint16_t flags) 968{ 969 struct flentry *fle, *fletail, *newfle, **flep; 970 struct flowtable_stats *fs = &ft->ft_stats[curcpu]; 971 int depth; 972 bitstr_t *mask; 973 uint8_t proto; 974 975 newfle = flow_alloc(ft); 976 if (newfle == NULL) 977 return (ENOMEM); 978 979 newfle->f_flags |= (flags & FL_IPV6); 980 proto = flags_to_proto(flags); 981 982 FL_ENTRY_LOCK(ft, hash); 983 mask = flowtable_mask(ft); 984 flep = flowtable_entry(ft, hash); 985 fletail = fle = *flep; 986 987 if (fle == NULL) { 988 bit_set(mask, FL_ENTRY_INDEX(ft, hash)); 989 *flep = fle = newfle; 990 goto skip; 991 } 992 993 depth = 0; 994 fs->ft_collisions++; 995 /* 996 * find end of list and make sure that we were not 997 * preempted by another thread handling this flow 998 */ 999 while (fle != NULL) { 1000 if (fle->f_fhash == hash && !flow_stale(ft, fle)) { 1001 /* 1002 * there was either a hash collision 1003 * or we lost a race to insert 1004 */ 1005 FL_ENTRY_UNLOCK(ft, hash); 1006 flow_free(newfle, ft); 1007 1008 if (flags & FL_OVERWRITE) 1009 goto skip; 1010 return (EEXIST); 1011 } 1012 /* 1013 * re-visit this double condition XXX 1014 */ 1015 if (fletail->f_next != NULL) 1016 fletail = fle->f_next; 1017 1018 depth++; 1019 fle = fle->f_next; 1020 } 1021 1022 if (depth > fs->ft_max_depth) 1023 fs->ft_max_depth = depth; 1024 fletail->f_next = newfle; 1025 fle = newfle; 1026skip: 1027 flowtable_set_hashkey(fle, key); 1028 1029 fle->f_proto = proto; 1030 fle->f_rt = ro->ro_rt; 1031 fle->f_lle = ro->ro_lle; 1032 fle->f_fhash = hash; 1033 fle->f_fibnum = fibnum; 1034 fle->f_uptime = time_uptime; 1035 FL_ENTRY_UNLOCK(ft, hash); 1036 return (0); 1037} 1038 1039int 1040kern_flowtable_insert(struct flowtable *ft, 1041 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa, 1042 struct route *ro, uint32_t fibnum, int flags) 1043{ 1044 uint32_t key[9], hash; 1045 1046 flags = (ft->ft_flags | flags | FL_OVERWRITE); 1047 hash = 0; 1048 1049#ifdef INET 1050 if (ssa->ss_family == AF_INET) 1051 
hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa, 1052 (struct sockaddr_in *)dsa, key, flags); 1053#endif 1054#ifdef INET6 1055 if (ssa->ss_family == AF_INET6) 1056 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa, 1057 (struct sockaddr_in6 *)dsa, key, flags); 1058#endif 1059 if (ro->ro_rt == NULL || ro->ro_lle == NULL) 1060 return (EINVAL); 1061 1062 FLDPRINTF(ft, FL_DEBUG, 1063 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n", 1064 key[0], key[1], key[2], hash, fibnum, flags); 1065 return (flowtable_insert(ft, hash, key, fibnum, ro, flags)); 1066} 1067 1068static int 1069flowtable_key_equal(struct flentry *fle, uint32_t *key) 1070{ 1071 uint32_t *hashkey; 1072 int i, nwords; 1073 1074 if (fle->f_flags & FL_IPV6) { 1075 nwords = 9; 1076 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 1077 } else { 1078 nwords = 3; 1079 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 1080 } 1081 1082 for (i = 0; i < nwords; i++) 1083 if (hashkey[i] != key[i]) 1084 return (0); 1085 1086 return (1); 1087} 1088 1089struct flentry * 1090flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af) 1091{ 1092 struct flentry *fle = NULL; 1093 1094#ifdef INET 1095 if (af == AF_INET) 1096 fle = flowtable_lookup_mbuf4(ft, m); 1097#endif 1098#ifdef INET6 1099 if (af == AF_INET6) 1100 fle = flowtable_lookup_mbuf6(ft, m); 1101#endif 1102 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) { 1103 m->m_flags |= M_FLOWID; 1104 m->m_pkthdr.flowid = fle->f_fhash; 1105 } 1106 return (fle); 1107} 1108 1109struct flentry * 1110flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, 1111 struct sockaddr_storage *dsa, uint32_t fibnum, int flags) 1112{ 1113 uint32_t key[9], hash; 1114 struct flentry *fle; 1115 struct flowtable_stats *fs = &ft->ft_stats[curcpu]; 1116 uint8_t proto = 0; 1117 int error = 0; 1118 struct rtentry *rt; 1119 struct llentry *lle; 1120 struct route sro, *ro; 1121 struct route_in6 sro6; 1122 
	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		/*
		 * The harvested source and destination addresses
		 * may contain port information if the packet is
		 * from a transport protocol (e.g. TCP/UDP). The
		 * port field must be cleared before performing
		 * a route lookup.
		 */
		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* Self-directed and loopback (127/8) traffic is never cached. */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		/* As above: strip the transport port before the route lookup. */
		((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	/*
	 * Walk the hash-bucket chain under the bucket lock, looking for
	 * an entry that matches hash, key, proto and fib and whose route
	 * is still usable (RTF_UP with an interface attached).
	 */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	/* Caller forbade allocation, or the zone limit has been reached. */
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		/* NOTE: these intentionally shadow the outer rt/lle. */
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* Point-to-point and loopback destinations are not cached. */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);
			}

			/* Resolve L2 for the gateway if present, else the destination. */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;

			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
		}
#endif
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		}

#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		/* On success, flowtable_insert takes over the rt/lle references. */
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	/*
	 * NOTE(review): on the uncached path fle is still NULL (or the
	 * last chain entry examined), so a successful insert returns NULL
	 * here rather than the newly inserted entry — callers apparently
	 * retry; confirm this is the intended contract.
	 */
	return ((error) ? NULL : fle);
}

/*
 * used by the bit_alloc macro
 */
#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)

/*
 * Allocate and initialize a flow table of nentry buckets and link it
 * onto the per-vnet cleaner list.  FL_PCPU selects per-CPU bucket
 * arrays and masks; otherwise a single global array protected by a
 * pool of mutexes is used.
 */
struct flowtable *
flowtable_alloc(char *name, int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	/* Lazily seed the hash jitter once per vnet. */
	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_name = name;
	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ?
(mp_maxid + 1): 1328 (fls(mp_maxid + 1) << 1)); 1329 1330 ft->ft_lock = flowtable_global_lock; 1331 ft->ft_unlock = flowtable_global_unlock; 1332 ft->ft_table.global = 1333 malloc(nentry*sizeof(struct flentry *), 1334 M_RTABLE, M_WAITOK | M_ZERO); 1335 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx), 1336 M_RTABLE, M_WAITOK | M_ZERO); 1337 for (i = 0; i < ft->ft_lock_count; i++) 1338 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK); 1339 1340 ft->ft_masks[0] = bit_alloc(nentry); 1341 } 1342 ft->ft_tmpmask = bit_alloc(nentry); 1343 1344 /* 1345 * In the local transmit case the table truly is 1346 * just a cache - so everything is eligible for 1347 * replacement after 5s of non-use 1348 */ 1349 if (flags & FL_HASH_ALL) { 1350 ft->ft_udp_idle = V_flowtable_udp_expire; 1351 ft->ft_syn_idle = V_flowtable_syn_expire; 1352 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire; 1353 ft->ft_tcp_idle = V_flowtable_fin_wait_expire; 1354 } else { 1355 ft->ft_udp_idle = ft->ft_fin_wait_idle = 1356 ft->ft_syn_idle = ft->ft_tcp_idle = 30; 1357 1358 } 1359 1360 /* 1361 * hook in to the cleaner list 1362 */ 1363 if (V_flow_list_head == NULL) 1364 V_flow_list_head = ft; 1365 else { 1366 fttail = V_flow_list_head; 1367 while (fttail->ft_next != NULL) 1368 fttail = fttail->ft_next; 1369 fttail->ft_next = ft; 1370 } 1371 1372 return (ft); 1373} 1374 1375/* 1376 * The rest of the code is devoted to garbage collection of expired entries. 1377 * It is a new additon made necessary by the switch to dynamically allocating 1378 * flow tables. 
 *
 */
/*
 * Release a flow entry's route and L2 references and return it to
 * its zone via flow_free().
 */
static void
fle_free(struct flentry *fle, struct flowtable *ft)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if (rt != NULL)
		RTFREE(rt);
	if (lle != NULL)
		LLE_FREE(lle);
	flow_free(fle, ft);
}

/*
 * Scan the table's occupied buckets (via the allocation bitmask) and
 * unlink entries that are stale, or — when rt is non-NULL — entries
 * referencing that specific route.  Unlinked entries are collected on
 * a private list and freed after all bucket locks are dropped.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	/* Work on a snapshot of the occupancy mask; clear bits as we go. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			/* Keep entries that don't match the flush criterion. */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the unlinked entry to the to-free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Bucket emptied: clear its bit in the live mask too. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* Free collected entries outside the bucket locks. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}

/*
 * Flush all entries referencing rt from the table.  For per-CPU
 * tables the scan is run once per CPU, bound to that CPU so the
 * correct per-CPU bucket array is visited.
 */
void
flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
{
	int i;

	if (ft->ft_flags & FL_PCPU) {
		CPU_FOREACH(i) {
			if (smp_started == 1) {
				thread_lock(curthread);
				sched_bind(curthread, i);
				thread_unlock(curthread);
			}

			flowtable_free_stale(ft, rt);

			if (smp_started == 1) {
				thread_lock(curthread);
				sched_unbind(curthread);
				thread_unlock(curthread);
			}
		}
	} else {
		flowtable_free_stale(ft, rt);
	}
}

/*
 * Run a stale-entry sweep over every flow table in the current vnet,
 * binding to each CPU in turn for per-CPU tables (same scheme as
 * flowtable_route_flush).
 */
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			CPU_FOREACH(i) {
				if (smp_started == 1) {
					thread_lock(curthread);
					sched_bind(curthread, i);
					thread_unlock(curthread);
				}

				flowtable_free_stale(ft, NULL);

				if (smp_started == 1) {
					thread_lock(curthread);
					sched_unbind(curthread);
					thread_unlock(curthread);
				}
			}
		} else {
			flowtable_free_stale(ft, NULL);
		}
		ft = ft->ft_next;
	}
}

/*
 * Kernel-process loop: periodically sweep every vnet's flow tables.
 * flowclean_cycles and the condvar broadcast let flowtable_flush()
 * synchronize with a completed sweep.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		flowclean_cycles++;
		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		cv_broadcast(&flowclean_cv);
		cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}

/*
 * Wake the cleaner and block until it has completed at least one full
 * cycle (observed via flowclean_cycles).  Registered as an ifnet
 * departure handler below.
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_cv);
		cv_wait(&flowclean_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}

static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

/*
 * Per-vnet initialization: size the flow limit from maxusers/ncpus,
 * create the v4/v6 UMA zones, and mark the subsystem ready.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);

/*
 * Global (non-vnet) initialization: cleaner synchronization primitives
 * and the ifnet-departure hook that forces a flush.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event,
flowtable_flush, NULL, 1619 EVENTHANDLER_PRI_ANY); 1620 flowclean_freq = 20*hz; 1621} 1622SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, 1623 flowtable_init, NULL); 1624 1625 1626#ifdef VIMAGE 1627static void 1628flowtable_uninit(const void *unused __unused) 1629{ 1630 1631 V_flowtable_ready = 0; 1632 uma_zdestroy(V_flow_ipv4_zone); 1633 uma_zdestroy(V_flow_ipv6_zone); 1634} 1635 1636VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, 1637 flowtable_uninit, NULL); 1638#endif 1639 1640#ifdef DDB 1641static uint32_t * 1642flowtable_get_hashkey(struct flentry *fle) 1643{ 1644 uint32_t *hashkey; 1645 1646 if (fle->f_flags & FL_IPV6) 1647 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; 1648 else 1649 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; 1650 1651 return (hashkey); 1652} 1653 1654static bitstr_t * 1655flowtable_mask_pcpu(struct flowtable *ft, int cpuid) 1656{ 1657 bitstr_t *mask; 1658 1659 if (ft->ft_flags & FL_PCPU) 1660 mask = ft->ft_masks[cpuid]; 1661 else 1662 mask = ft->ft_masks[0]; 1663 1664 return (mask); 1665} 1666 1667static struct flentry ** 1668flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) 1669{ 1670 struct flentry **fle; 1671 int index = (hash % ft->ft_size); 1672 1673 if (ft->ft_flags & FL_PCPU) { 1674 fle = &ft->ft_table.pcpu[cpuid][index]; 1675 } else { 1676 fle = &ft->ft_table.global[index]; 1677 } 1678 1679 return (fle); 1680} 1681 1682static void 1683flow_show(struct flowtable *ft, struct flentry *fle) 1684{ 1685 int idle_time; 1686 int rt_valid, ifp_valid; 1687 uint16_t sport, dport; 1688 uint32_t *hashkey; 1689 char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; 1690 volatile struct rtentry *rt; 1691 struct ifnet *ifp = NULL; 1692 1693 idle_time = (int)(time_uptime - fle->f_uptime); 1694 rt = fle->f_rt; 1695 rt_valid = rt != NULL; 1696 if (rt_valid) 1697 ifp = rt->rt_ifp; 1698 ifp_valid = ifp != NULL; 1699 hashkey = flowtable_get_hashkey(fle); 1700 if (fle->f_flags & FL_IPV6) 
		goto skipaddr;

	/* IPv4 key layout: word 1 = source addr, word 2 = dest addr; */
	/* word 0 holds the ports when FL_HASH_ALL is set — TODO confirm order. */
	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else
		db_printf("%s ", daddr);

skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	/* Dump the raw key words: 9 for IPv6 entries, 3 for IPv4. */
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}

/*
 * DDB helper: walk one table's occupied buckets (cpuid == -1 selects
 * the global table) and print every entry.  Runs lockless — DDB only.
 */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle,  **flehead;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	/* Snapshot the occupancy mask; consume bits as buckets are visited. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
			continue;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

/* DDB helper: dump every flow table in the current vnet. */
static void
flowtable_show_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		printf("name: %s\n", ft->ft_name);
		if (ft->ft_flags & FL_PCPU) {
			CPU_FOREACH(i) {
				flowtable_show(ft, i);
			}
		} else {
			flowtable_show(ft, -1);
		}
		ft = ft->ft_next;
	}
}

/* "show flowtables" DDB command: dump tables across all vnets. */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
#endif
|