/* in_pcbgroup.c revision 268479 */
/*-
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * This software was developed by Robert N. M. Watson under contract
 * to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

__FBSDID("$FreeBSD: head/sys/netinet/in_pcbgroup.c 268479 2014-07-10 03:10:56Z adrian $");

#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/socketvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif /* INET6 */

/*
 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
 * Strategies in Modern Operating Systems".  This implementation differs
 * significantly from that described in the paper, in that it attempts to
 * introduce not just notions of affinity for connections and distribute work
 * so as to reduce lock contention, but also align those notions with
 * hardware work distribution strategies such as RSS.  In this construction,
 * connection groups supplement, rather than replace, existing reservation
 * tables for protocol 4-tuples, offering CPU-affine lookup tables with
 * minimal cache line migration and lock contention during steady state
 * operation.
 *
 * Hardware-offloaded checksums are often inefficient in software -- for
 * example, Toeplitz, specified by RSS, introduced a significant overhead if
 * performed during per-packet processing.  It is therefore desirable to fall
 * back on traditional reservation table lookups without affinity where
 * hardware-offloaded checksums aren't available, such as for traffic over
 * non-RSS interfaces.
 *
 * Internet protocols, such as UDP and TCP, register to use connection groups
 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
 * indicates to the connection group code whether a 2-tuple or 4-tuple is
 * used as an argument to hashes that assign a connection to a particular
 * group.  This must be aligned with any hardware offloaded distribution
 * model, such as RSS or similar approaches taken in embedded network boards.
 * Wildcard sockets require special handling, as in Willman 2006, and are
 * shared between connection groups -- while being protected by group-local
 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
 * steady-state processing can be significantly faster.
 *
 * When RSS is used, certain connection group parameters, such as the number
 * of groups, are provided by the RSS implementation, found in in_rss.c.
 * Otherwise, in_pcbgroup.c selects possible sensible parameters
 * corresponding to the degree of parallelism exposed by netisr.
 *
 * Most of the implementation of connection groups is in this file; however,
 * connection group lookup is implemented in in_pcb.c alongside reservation
 * table lookups -- see in_pcblookup_group().
 *
 * TODO:
 *
 * Implement dynamic rebalancing of buckets with connection groups; when
 * load is unevenly distributed, search for more optimal balancing on
 * demand.  This might require scaling up the number of connection groups
 * by <<1.
 *
 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
 * groups for ip_input and ip6_input, allowing non-offloaded work
 * distribution.
 *
 * Expose effective CPU affinity of connections to userspace using socket
 * options.
106 * 107 * Investigate per-connection affinity overrides based on socket options; an 108 * option could be set, certainly resulting in work being distributed 109 * differently in software, and possibly propagated to supporting hardware 110 * with TCAMs or hardware hash tables. This might require connections to 111 * exist in more than one connection group at a time. 112 * 113 * Hook netisr thread reconfiguration events, and propagate those to RSS so 114 * that rebalancing can occur when the thread pool grows or shrinks. 115 * 116 * Expose per-pcbgroup statistics to userspace monitoring tools such as 117 * netstat, in order to allow better debugging and profiling. 118 */ 119 120void 121in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, 122 int hash_nelements) 123{ 124 struct inpcbgroup *pcbgroup; 125 u_int numpcbgroups, pgn; 126 127 /* 128 * Only enable connection groups for a protocol if it has been 129 * specifically requested. 130 */ 131 if (hashfields == IPI_HASHFIELDS_NONE) 132 return; 133 134 /* 135 * Connection groups are about multi-processor load distribution, 136 * lock contention, and connection CPU affinity. As such, no point 137 * in turning them on for a uniprocessor machine, it only wastes 138 * memory. 139 */ 140 if (mp_ncpus == 1) 141 return; 142 143#ifdef RSS 144 /* 145 * If we're using RSS, then RSS determines the number of connection 146 * groups to use: one connection group per RSS bucket. If for some 147 * reason RSS isn't able to provide a number of buckets, disable 148 * connection groups entirely. 149 * 150 * XXXRW: Can this ever happen? 151 */ 152 numpcbgroups = rss_getnumbuckets(); 153 if (numpcbgroups == 0) 154 return; 155#else 156 /* 157 * Otherwise, we'll just use one per CPU for now. If we decide to 158 * do dynamic rebalancing a la RSS, we'll need similar logic here. 
159 */ 160 numpcbgroups = mp_ncpus; 161#endif 162 163 pcbinfo->ipi_hashfields = hashfields; 164 pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * 165 sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); 166 pcbinfo->ipi_npcbgroups = numpcbgroups; 167 pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, 168 &pcbinfo->ipi_wildmask); 169 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 170 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 171 pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, 172 &pcbgroup->ipg_hashmask); 173 INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); 174 175 /* 176 * Initialise notional affinity of the pcbgroup -- for RSS, 177 * we want the same notion of affinity as NICs to be used. In 178 * the non-RSS case, just round robin for the time being. 179 * 180 * XXXRW: The notion of a bucket to CPU mapping is common at 181 * both pcbgroup and RSS layers -- does that mean that we 182 * should migrate it all from RSS to here, and just leave RSS 183 * responsible only for providing hashing and mapping funtions? 184 */ 185#ifdef RSS 186 pcbgroup->ipg_cpu = rss_getcpu(pgn); 187#else 188 pcbgroup->ipg_cpu = (pgn % mp_ncpus); 189#endif 190 } 191} 192 193void 194in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) 195{ 196 struct inpcbgroup *pcbgroup; 197 u_int pgn; 198 199 if (pcbinfo->ipi_npcbgroups == 0) 200 return; 201 202 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 203 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 204 KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), 205 ("in_pcbinfo_destroy: listhead not empty")); 206 INP_GROUP_LOCK_DESTROY(pcbgroup); 207 hashdestroy(pcbgroup->ipg_hashbase, M_PCB, 208 pcbgroup->ipg_hashmask); 209 } 210 hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); 211 free(pcbinfo->ipi_pcbgroups, M_PCB); 212 pcbinfo->ipi_pcbgroups = NULL; 213 pcbinfo->ipi_npcbgroups = 0; 214 pcbinfo->ipi_hashfields = 0; 215} 216 217/* 218 * Given a hash of whatever the covered tuple might be, return a pcbgroup 219 * index. 
 * Where RSS is supported, try to align bucket selection with RSS CPU
 * affinity strategy.
 */
static __inline u_int
in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
{

#ifdef RSS
	return (rss_getbucket(hash));
#else
	return (hash % pcbinfo->ipi_npcbgroups);
#endif
}

/*
 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
 * information is insufficient to identify the pcbgroup.  This might occur if
 * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
 * RSS is not compiled into the kernel.
 */
struct inpcbgroup *
in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
{

#ifdef RSS
	/*
	 * Only trust the mbuf-supplied hash when its type matches the
	 * tuple layout this protocol registered with pcbgroups.
	 */
	if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
	    hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
	    hashtype == M_HASHTYPE_RSS_IPV4))
		return (&pcbinfo->ipi_pcbgroups[
		    in_pcbgroup_getbucket(pcbinfo, hash)]);
#endif
	return (NULL);
}

/*
 * Convenience wrapper: look up a pcbgroup using the flow ID and hash type
 * carried in an mbuf's packet header.
 */
static struct inpcbgroup *
in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
{

	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
	    m->m_pkthdr.flowid));
}

/*
 * Select the pcbgroup for an explicit IPv4 2-/4-tuple by hashing it in
 * software.  Unlike in_pcbgroup_byhash(), this always returns a group:
 * unknown ipi_hashfields values hash to 0 and hence to a fixed group.
 */
struct inpcbgroup *
in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, struct in_addr faddr, u_short fport)
{
	uint32_t hash;

	/*
	 * RSS note: we pass foreign addr/port as source, and local addr/port
	 * as destination, as we want to align with what the hardware is
	 * doing.
	 */
	switch (pcbinfo->ipi_hashfields) {
	case IPI_HASHFIELDS_4TUPLE:
#ifdef RSS
		hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
#else
		/* Cheap non-RSS stand-in; only intra-host consistency
		 * matters here, not hash quality. */
		hash = faddr.s_addr ^ fport;
#endif
		break;

	case IPI_HASHFIELDS_2TUPLE:
#ifdef RSS
		hash = rss_hash_ip4_2tuple(faddr, laddr);
#else
		hash = faddr.s_addr ^ laddr.s_addr;
#endif
		break;

	default:
		hash = 0;
	}
	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
	    hash)]);
}

/*
 * Select the pcbgroup for an existing inpcb from its stored 4-tuple (or,
 * under RSS, from a pre-assigned listen bucket).
 */
struct inpcbgroup *
in_pcbgroup_byinpcb(struct inpcb *inp)
{
#ifdef RSS
	/*
	 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
	 * RSS bucket and thus we should use this pcbgroup, rather than
	 * using a tuple or hash.
	 *
	 * XXX should verify that there's actually pcbgroups and
	 * inp_rss_listen_bucket fits in that!
	 */
	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
		return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
#endif

	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
}

/*
 * Insert an inpcb into the shared wildcard hash and mark it
 * INP_PCBGROUPWILD.  All group locks are taken (in index order) because the
 * wildcard table is shared between groups but protected only by the
 * group-local locks.
 */
static void
in_pcbwild_add(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcbhead *head;
	u_int pgn;

	INP_WLOCK_ASSERT(inp);
	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
	    ("%s: is wild",__func__));

	pcbinfo = inp->inp_pcbinfo;
	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
	/* Wildcard entries hash on (INADDR_ANY, lport, fport 0). */
	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
	    0, pcbinfo->ipi_wildmask)];
	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
	inp->inp_flags2 |= INP_PCBGROUPWILD;
	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
}

/*
 * Remove an inpcb from the shared wildcard hash and clear
 * INP_PCBGROUPWILD; as with in_pcbwild_add(), all group locks are held for
 * the list manipulation.
 */
static void
in_pcbwild_remove(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	u_int pgn;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
	    ("%s: not wild", __func__));

	pcbinfo = inp->inp_pcbinfo;
	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
	LIST_REMOVE(inp, inp_pcbgroup_wild);
	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
}

/*
 * Decide whether an inpcb belongs in the wildcard table: true when its
 * foreign address is unspecified (no connected peer), unless RSS has
 * already pinned it to a specific bucket.
 */
static __inline int
in_pcbwild_needed(struct inpcb *inp)
{
#ifdef RSS
	/*
	 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
	 * it's a wildcard socket _but_ it's in a specific pcbgroup.
	 * Thus we don't treat it as a pcbwild inp.
	 */
	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
		return (0);
#endif

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
	else
#endif
		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
}

/*
 * Reconcile an inpcb's wildcard-table membership with what
 * in_pcbwild_needed() currently says, adding or removing as required.
 */
static void
in_pcbwild_update_internal(struct inpcb *inp)
{
	int wildcard_needed;

	wildcard_needed = in_pcbwild_needed(inp);
	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
		in_pcbwild_add(inp);
	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
		in_pcbwild_remove(inp);
}

/*
 * Update the pcbgroup of an inpcb, which might include removing an old
 * pcbgroup reference and/or adding a new one.  Wildcard processing is not
 * performed here, although ideally we'll never install a pcbgroup for a
 * wildcard inpcb (asserted below).
 */
static void
in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
{
	struct inpcbgroup *oldpcbgroup;
	struct inpcbhead *pcbhash;
	uint32_t hashkey_faddr;

	INP_WLOCK_ASSERT(inp);

	/* Unhook from the old group first, if it differs from the new one. */
	oldpcbgroup = inp->inp_pcbgroup;
	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
		INP_GROUP_LOCK(oldpcbgroup);
		LIST_REMOVE(inp, inp_pcbgrouphash);
		inp->inp_pcbgroup = NULL;
		INP_GROUP_UNLOCK(oldpcbgroup);
	}
	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
#ifdef INET6
		/*
		 * For IPv6, fold the foreign address down to its last
		 * 32-bit word as the hash key (marked XXX upstream -- a
		 * known approximation, not a full v6 hash).
		 */
		if (inp->inp_vflag & INP_IPV6)
			hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */
		else
#endif
			hashkey_faddr = inp->inp_faddr.s_addr;
		INP_GROUP_LOCK(newpcbgroup);
		/*
		 * If the inp is an RSS bucket wildcard entry, ensure
		 * that the PCB hash is calculated correctly.
		 *
		 * The wildcard hash calculation differs from the
		 * non-wildcard definition.  The source address is
		 * INADDR_ANY and the far port is 0.
		 */
		if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
			pcbhash = &newpcbgroup->ipg_hashbase[
			    INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
			    newpcbgroup->ipg_hashmask)];
		} else {
			pcbhash = &newpcbgroup->ipg_hashbase[
			    INP_PCBHASH(hashkey_faddr, inp->inp_lport,
			    inp->inp_fport,
			    newpcbgroup->ipg_hashmask)];
		}
		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
		inp->inp_pcbgroup = newpcbgroup;
		INP_GROUP_UNLOCK(newpcbgroup);
	}

	/* A wildcard inpcb must never end up with a pcbgroup installed. */
	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
	    ("%s: pcbgroup and wildcard!", __func__));
}

/*
 * Two update paths: one in which the 4-tuple on an inpcb has been updated
 * and therefore connection groups may need to change (or a wildcard entry
 * may need to be installed), and another in which the 4-tuple has been
 * set as a result of a packet received, in which case we may be able to use
 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
 *
 * In each case: first, let the wildcard code have a go at placing it as a
 * wildcard socket.  If it was a wildcard, or if the connection has been
 * dropped, then no pcbgroup is required (so potentially clear it);
 * otherwise, calculate and update the pcbgroup for the inpcb.
 */
void
in_pcbgroup_update(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcbgroup *newpcbgroup;

	INP_WLOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	if (!in_pcbgroup_enabled(pcbinfo))
		return;

	in_pcbwild_update_internal(inp);
	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
	    !(inp->inp_flags & INP_DROPPED)) {
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6)
			newpcbgroup = in6_pcbgroup_byinpcb(inp);
		else
#endif
			newpcbgroup = in_pcbgroup_byinpcb(inp);
	} else
		newpcbgroup = NULL;
	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
}

/*
 * As in_pcbgroup_update(), but prefer the hash already carried on the mbuf
 * (if usable) over recomputing one in software; fall back to the
 * tuple-based lookup when the mbuf hash is insufficient.
 */
void
in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
{
	struct inpcbinfo *pcbinfo;
	struct inpcbgroup *newpcbgroup;

	INP_WLOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	if (!in_pcbgroup_enabled(pcbinfo))
		return;

	/*
	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
	 * it; presumably this function should never be called for anything
	 * other than non-wildcard socket?
	 */
	in_pcbwild_update_internal(inp);
	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
	    !(inp->inp_flags & INP_DROPPED)) {
		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6) {
			if (newpcbgroup == NULL)
				newpcbgroup = in6_pcbgroup_byinpcb(inp);
		} else {
#endif
			if (newpcbgroup == NULL)
				newpcbgroup = in_pcbgroup_byinpcb(inp);
#ifdef INET6
		}
#endif
	} else
		newpcbgroup = NULL;
	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
}

/*
 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
 */
void
in_pcbgroup_remove(struct inpcb *inp)
{
	struct inpcbgroup *pcbgroup;

	INP_WLOCK_ASSERT(inp);

	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
		return;

	if (inp->inp_flags2 & INP_PCBGROUPWILD)
		in_pcbwild_remove(inp);

	pcbgroup = inp->inp_pcbgroup;
	if (pcbgroup != NULL) {
		INP_GROUP_LOCK(pcbgroup);
		LIST_REMOVE(inp, inp_pcbgrouphash);
		inp->inp_pcbgroup = NULL;
		INP_GROUP_UNLOCK(pcbgroup);
	}
}

/*
 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
 * for a protocol.
 */
int
in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
{

	return (pcbinfo->ipi_npcbgroups > 0);
}