1/* 2 * Copyright (c) 2011 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to The NetBSD Foundation 6 * by Coyote Point Systems, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 * POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30/* 31 * Vestigial time-wait. 32 * 33 * This implementation uses cache-efficient techniques, which will 34 * appear somewhat peculiar. The main philosophy is to optimise the 35 * amount of information available within a cache line. Cache miss is 36 * expensive. So we employ ad-hoc techniques to pull a series of 37 * linked-list follows into a cache line. One cache line, multiple 38 * linked-list equivalents. 39 * 40 * One such ad-hoc technique is fat pointers. Additional degrees of 41 * ad-hoqueness result from having to hand tune it for pointer size 42 * and for cache line size. 43 * 44 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list 45 * data structures into one cache line. The additional 32 bits in the 46 * cache line are used for linking fat pointers, and for 47 * allocation/bookkeeping. 48 * 49 * The 15 32-bit tags encode the pointers to the linked list elements, 50 * and also encode the results of a search comparison. 51 * 52 * First, some more assumptions/restrictions. 53 * 54 * All the fat pointers are from a contiguous allocation arena. Thus, 55 * we can refer to them by offset from a base, not as full pointers. 56 * 57 * All the linked list data elements are also from a contiguous 58 * allocation arena, again so that we can refer to them as offset from 59 * a base. 60 * 61 * In order to add a data element to a fat pointer, a key value is 62 * computed, based on unique data within the data element. It is the 63 * linear searching of the linked lists of these elements based on 64 * these unique data that are being optimised here. 65 * 66 * Lets call the function that computes the key k(e), where e is the 67 * data element. In this example, k(e) returns 32-bits. 68 * 69 * Consider a set E (say of order 15) of data elements. Let K be 70 * the set of the k(e) for e in E. 71 * 72 * Let O be the set of the offsets from the base of the data elements in E. 73 * 74 * For each x in K, for each matching o in O, let t be x ^ o. These 75 * are the tags. (More or less). 76 * 77 * In order to search all the data elements in E, we compute the 78 * search key, and one at a time, XOR the key into the tags. If any 79 * result is a valid data element index, we have a possible match. If 80 * not, there is no match. 81 * 82 * The no-match cases mean we do not have to de-reference the pointer 83 * to the data element in question. We save cache miss penalty and 84 * cache load decreases. Only in the case of a valid looking data 85 * element index, do we have to look closer. 86 * 87 * Thus, in the absence of false positives, 15 data elements can be 88 * searched with one cache line fill, as opposed to 15 cache line 89 * fills for the usual implementation. 90 * 91 * The vestigial time waits (vtw_t), the data elements in the above, are 92 * searched by faddr, fport, laddr, lport. The key is a function of 93 * these values. 94 * 95 * We hash these keys into the traditional hash chains to reduce the 96 * search time, and use fat pointers to reduce the cache impacts of 97 * searching. 98 * 99 * The vtw_t are, per requirement, in a contiguous chunk. Allocation 100 * is done with a clock hand, and all vtw_t within one allocation 101 * domain have the same lifetime, so they will always be sorted by 102 * age. 103 * 104 * A vtw_t will be allocated, timestamped, and have a fixed future 105 * expiration. It will be added to a hash bucket implemented with fat 106 * pointers, which means that a cache line will be allocated in the 107 * hash bucket, placed at the head (more recent in time) and the vtw_t 108 * will be added to this. As more entries are added, the fat pointer 109 * cache line will fill, requiring additional cache lines for fat 110 * pointers to be allocated. These will be added at the head, and the 111 * aged entries will hang down, tapeworm like. As the vtw_t entries 112 * expire, the corresponding slot in the fat pointer will be 113 * reclaimed, and eventually the cache line will completely empty and 114 * be re-cycled, if not at the head of the chain. 115 * 116 * At times, a time-wait timer is restarted. This corresponds to 117 * deleting the current entry and re-adding it. 118 * 119 * Most of the time, they are just placed here to die. 120 */ 121#ifndef _NETINET_TCP_VTW_H 122#define _NETINET_TCP_VTW_H 123 124#include <sys/types.h> 125#include <sys/socket.h> 126#include <sys/sysctl.h> 127#include <net/if.h> 128#include <net/route.h> 129#include <netinet/in.h> 130#include <netinet/in_systm.h> 131#include <netinet/ip.h> 132#include <netinet/in_pcb.h> 133#include <netinet/in_var.h> 134#include <netinet/ip_var.h> 135#include <netinet/in.h> 136#include <netinet/tcp.h> 137#include <netinet/tcp_timer.h> 138#include <netinet/tcp_var.h> 139#include <netinet6/in6.h> 140#include <netinet/ip6.h> 141#include <netinet6/ip6_var.h> 142#include <netinet6/in6_pcb.h> 143#include <netinet6/ip6_var.h> 144#include <netinet6/in6_var.h> 145#include <netinet/icmp6.h> 146#include <netinet6/nd6.h> 147 148#define VTW_NCLASS (1+3) /* # different classes */ 149 150/* 151 * fat pointers, MI. 152 */ 153struct fatp_mi; 154 155typedef uint32_t fatp_word_t; 156 157typedef struct fatp_mi fatp_t; 158 159/* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(), 160 * fatp_slot_from_key(), fatp_xtra[]. 161 */ 162#define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1) 163#define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS) 164 165#define FATP_MAX (1 << FATP_NXT_WIDTH) 166 167/* Worked example: ULP32 with 64-byte cacheline (32-bit x86): 168 * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t. 169 * The comments on the fatp_mi members, below, correspond to the worked 170 * example. 171 */ 172struct fatp_mi { 173 fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */ 174 fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */ 175 fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */ 176}; 177 178static inline int 179fatp_ntags(void) 180{ 181 return FATP_NTAGS; 182} 183 184static inline int 185fatp_full(fatp_t *fp) 186{ 187 fatp_t full; 188 189 full.inuse = ~0; 190 191 return (fp->inuse == full.inuse); 192} 193 194struct vtw_common; 195struct vtw_v4; 196struct vtw_v6; 197struct vtw_ctl; 198 199/*!\brief common to all vtw 200 */ 201typedef struct vtw_common { 202 struct timeval expire; /* date of birth+msl */ 203 uint32_t key; /* hash key: full hash */ 204 uint32_t port_key; /* hash key: local port hash */ 205 uint32_t rcv_nxt; 206 uint32_t rcv_wnd; 207 uint32_t snd_nxt; 208 uint32_t snd_scale : 8; /* window scaling for send win */ 209 uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */ 210 uint32_t reuse_port : 1; 211 uint32_t reuse_addr : 1; 212 uint32_t v6only : 1; 213 uint32_t hashed : 1; /* reachable via FATP */ 214 uint32_t uid; 215} vtw_t; 216 217/*!\brief vestigial timewait for IPv4 218 */ 219typedef struct vtw_v4 { 220 vtw_t common; /* must be first */ 221 uint16_t lport; 222 uint16_t fport; 223 uint32_t laddr; 224 uint32_t faddr; 225} vtw_v4_t; 226 227/*!\brief vestigial timewait for IPv6 228 */ 229typedef struct vtw_v6 { 230 vtw_t common; /* must be first */ 231 uint16_t lport; 232 uint16_t fport; 233 struct in6_addr laddr; 234 struct in6_addr faddr; 235} vtw_v6_t; 236 237struct fatp_ctl; 238typedef struct vtw_ctl vtw_ctl_t; 239typedef struct fatp_ctl fatp_ctl_t; 240 241/* 242 * The vestigial time waits are kept in a contiguous chunk. 243 * Allocation and free pointers run as clock hands thru this array. 244 */ 245struct vtw_ctl { 246 fatp_ctl_t *fat; /* collection of fatp to use */ 247 vtw_ctl_t *ctl; /* <! controller's controller */ 248 union { 249 vtw_t *v; /* common */ 250 struct vtw_v4 *v4; /* IPv4 resources */ 251 struct vtw_v6 *v6; /* IPv6 resources */ 252 } base, /* base of vtw_t array */ 253 /**/ lim, /* extent of vtw_t array */ 254 /**/ alloc, /* allocation pointer */ 255 /**/ oldest; /* ^ to oldest */ 256 uint32_t nfree; /* # free */ 257 uint32_t nalloc; /* # allocated */ 258 uint32_t idx_mask; /* mask capturing all index bits*/ 259 uint32_t is_v4 : 1; 260 uint32_t is_v6 : 1; 261 uint32_t idx_bits: 6; 262 uint32_t clidx : 3; /* <! class index */ 263}; 264 265/*!\brief Collections of fat pointers. 266 */ 267struct fatp_ctl { 268 vtw_ctl_t *vtw; /* associated VTWs */ 269 fatp_t *base; /* base of fatp_t array */ 270 fatp_t *lim; /* extent of fatp_t array */ 271 fatp_t *free; /* free list */ 272 uint32_t mask; /* hash mask */ 273 uint32_t nfree; /* # free */ 274 uint32_t nalloc; /* # allocated */ 275 fatp_t **hash; /* hash anchors */ 276 fatp_t **port; /* port hash anchors */ 277}; 278 279/*!\brief stats 280 */ 281struct vtw_stats { 282 uint64_t ins; /* <! inserts */ 283 uint64_t del; /* <! deleted */ 284 uint64_t kill; /* <! assassination */ 285 uint64_t look[2]; /* <! lookup: full hash, port hash */ 286 uint64_t hit[2]; /* <! lookups that hit */ 287 uint64_t miss[2]; /* <! lookups that miss */ 288 uint64_t probe[2]; /* <! hits+miss */ 289 uint64_t losing[2]; /* <! misses requiring dereference */ 290 uint64_t max_chain[2]; /* <! max fatp chain traversed */ 291 uint64_t max_probe[2]; /* <! max probes in any one chain */ 292 uint64_t max_loss[2]; /* <! max losing probes in any one 293 * chain 294 */ 295}; 296 297typedef struct vtw_stats vtw_stats_t; 298 299/*!\brief follow fatp next 'pointer' 300 */ 301static inline fatp_t * 302fatp_next(fatp_ctl_t *fat, fatp_t *fp) 303{ 304 return fp->nxt ? fat->base + fp->nxt-1 : 0; 305} 306 307/*!\brief determine a collection-relative fat pointer index. 308 */ 309static inline uint32_t 310fatp_index(fatp_ctl_t *fat, fatp_t *fp) 311{ 312 return fp ? 1 + (fp - fat->base) : 0; 313} 314 315 316static inline uint32_t 317v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport) 318{ 319 return (ntohl(faddr) + ntohs(fport) 320 + ntohl(laddr) + ntohs(lport)); 321} 322 323static inline uint32_t 324v6_tag(const struct in6_addr *faddr, uint16_t fport, 325 const struct in6_addr *laddr, uint16_t lport) 326{ 327#ifdef IN6_HASH 328 return IN6_HASH(faddr, fport, laddr, lport); 329#else 330 return 0; 331#endif 332} 333 334static inline uint32_t 335v4_port_tag(uint16_t lport) 336{ 337 uint32_t tag = lport ^ (lport << 11); 338 339 tag ^= tag << 3; 340 tag += tag >> 5; 341 tag ^= tag << 4; 342 tag += tag >> 17; 343 tag ^= tag << 25; 344 tag += tag >> 6; 345 346 return tag; 347} 348 349static inline uint32_t 350v6_port_tag(uint16_t lport) 351{ 352 return v4_port_tag(lport); 353} 354 355struct tcpcb; 356struct tcphdr; 357 358int vtw_add(int, struct tcpcb *); 359void vtw_del(vtw_ctl_t *, vtw_t *); 360int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th, 361 uint32_t faddr, uint16_t fport, 362 uint32_t laddr, uint16_t lport); 363struct ip6_hdr; 364struct in6_addr; 365 366int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th, 367 const struct in6_addr *faddr, uint16_t fport, 368 const struct in6_addr *laddr, uint16_t lport); 369 370typedef struct vestigial_inpcb { 371 union { 372 struct in_addr v4; 373 struct in6_addr v6; 374 } faddr, laddr; 375 uint16_t fport, lport; 376 uint32_t valid : 1; 377 uint32_t v4 : 1; 378 uint32_t reuse_addr : 1; 379 uint32_t reuse_port : 1; 380 uint32_t v6only : 1; 381 uint32_t more_tbd : 1; 382 uint32_t uid; 383 uint32_t rcv_nxt; 384 uint32_t rcv_wnd; 385 uint32_t snd_nxt; 386 struct vtw_common *vtw; 387 struct vtw_ctl *ctl; 388} vestigial_inpcb_t; 389 390#ifdef _KERNEL 391void vtw_restart(vestigial_inpcb_t*); 392int vtw_earlyinit(void); 393int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO); 394#endif /* _KERNEL */ 395 396#ifdef VTW_DEBUG 397typedef struct sin_either { 398 uint8_t sin_len; 399 uint8_t sin_family; 400 uint16_t sin_port; 401 union { 402 struct in_addr v4; 403 struct in6_addr v6; 404 } sin_addr; 405} sin_either_t; 406 407int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int); 408 409typedef struct vtw_sysargs { 410 uint32_t op; 411 sin_either_t fa; 412 sin_either_t la; 413} vtw_sysargs_t; 414 415#endif /* VTW_DEBUG */ 416 417#endif /* _NETINET_TCP_VTW_H */ 418