1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010-2011 Juniper Networks, Inc. 5 * All rights reserved. 6 * 7 * This software was developed by Robert N. M. Watson under contract 8 * to Juniper Networks, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 
30 */ 31 32#include <sys/cdefs.h> 33 34__FBSDID("$FreeBSD$"); 35 36#include "opt_inet6.h" 37#include "opt_rss.h" 38 39#include <sys/param.h> 40#include <sys/lock.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/mutex.h> 44#include <sys/smp.h> 45#include <sys/socket.h> 46#include <sys/socketvar.h> 47 48#include <net/rss_config.h> 49 50#include <netinet/in.h> 51 52#include <netinet/in_pcb.h> 53#include <netinet/in_rss.h> 54#ifdef INET6 55#include <netinet6/in6_pcb.h> 56#endif /* INET6 */ 57 58/* 59 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's 60 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization 61 * Strategies in Modern Operating Systems". This implementation differs 62 * significantly from that described in the paper, in that it attempts to 63 * introduce not just notions of affinity for connections and distribute work 64 * so as to reduce lock contention, but also align those notions with 65 * hardware work distribution strategies such as RSS. In this construction, 66 * connection groups supplement, rather than replace, existing reservation 67 * tables for protocol 4-tuples, offering CPU-affine lookup tables with 68 * minimal cache line migration and lock contention during steady state 69 * operation. 70 * 71 * Hardware-offloaded checksums are often inefficient in software -- for 72 * example, Toeplitz, specified by RSS, introduced a significant overhead if 73 * performed during per-packet processing. It is therefore desirable to fall 74 * back on traditional reservation table lookups without affinity where 75 * hardware-offloaded checksums aren't available, such as for traffic over 76 * non-RSS interfaces. 
77 * 78 * Internet protocols, such as UDP and TCP, register to use connection groups 79 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this 80 * indicates to the connection group code whether a 2-tuple or 4-tuple is 81 * used as an argument to hashes that assign a connection to a particular 82 * group. This must be aligned with any hardware offloaded distribution 83 * model, such as RSS or similar approaches taken in embedded network boards. 84 * Wildcard sockets require special handling, as in Willman 2006, and are 85 * shared between connection groups -- while being protected by group-local 86 * locks. This means that connection establishment and teardown can be 87 * significantly more expensive than without connection groups, but that 88 * steady-state processing can be significantly faster. 89 * 90 * When RSS is used, certain connection group parameters, such as the number 91 * of groups, are provided by the RSS implementation, found in in_rss.c. 92 * Otherwise, in_pcbgroup.c selects possible sensible parameters 93 * corresponding to the degree of parallelism exposed by netisr. 94 * 95 * Most of the implementation of connection groups is in this file; however, 96 * connection group lookup is implemented in in_pcb.c alongside reservation 97 * table lookups -- see in_pcblookup_group(). 98 * 99 * TODO: 100 * 101 * Implement dynamic rebalancing of buckets with connection groups; when 102 * load is unevenly distributed, search for more optimal balancing on 103 * demand. This might require scaling up the number of connection groups 104 * by <<1. 105 * 106 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection 107 * groups for ip_input and ip6_input, allowing non-offloaded work 108 * distribution. 109 * 110 * Expose effective CPU affinity of connections to userspace using socket 111 * options. 
112 * 113 * Investigate per-connection affinity overrides based on socket options; an 114 * option could be set, certainly resulting in work being distributed 115 * differently in software, and possibly propagated to supporting hardware 116 * with TCAMs or hardware hash tables. This might require connections to 117 * exist in more than one connection group at a time. 118 * 119 * Hook netisr thread reconfiguration events, and propagate those to RSS so 120 * that rebalancing can occur when the thread pool grows or shrinks. 121 * 122 * Expose per-pcbgroup statistics to userspace monitoring tools such as 123 * netstat, in order to allow better debugging and profiling. 124 */ 125 126void 127in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, 128 int hash_nelements) 129{ 130 struct inpcbgroup *pcbgroup; 131 u_int numpcbgroups, pgn; 132 133 /* 134 * Only enable connection groups for a protocol if it has been 135 * specifically requested. 136 */ 137 if (hashfields == IPI_HASHFIELDS_NONE) 138 return; 139 140 /* 141 * Connection groups are about multi-processor load distribution, 142 * lock contention, and connection CPU affinity. As such, no point 143 * in turning them on for a uniprocessor machine, it only wastes 144 * memory. 145 */ 146 if (mp_ncpus == 1) 147 return; 148 149#ifdef RSS 150 /* 151 * If we're using RSS, then RSS determines the number of connection 152 * groups to use: one connection group per RSS bucket. If for some 153 * reason RSS isn't able to provide a number of buckets, disable 154 * connection groups entirely. 155 * 156 * XXXRW: Can this ever happen? 157 */ 158 numpcbgroups = rss_getnumbuckets(); 159 if (numpcbgroups == 0) 160 return; 161#else 162 /* 163 * Otherwise, we'll just use one per CPU for now. If we decide to 164 * do dynamic rebalancing a la RSS, we'll need similar logic here. 
165 */ 166 numpcbgroups = mp_ncpus; 167#endif 168 169 pcbinfo->ipi_hashfields = hashfields; 170 pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * 171 sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); 172 pcbinfo->ipi_npcbgroups = numpcbgroups; 173 pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, 174 &pcbinfo->ipi_wildmask); 175 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 176 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 177 pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, 178 &pcbgroup->ipg_hashmask); 179 INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); 180 181 /* 182 * Initialise notional affinity of the pcbgroup -- for RSS, 183 * we want the same notion of affinity as NICs to be used. In 184 * the non-RSS case, just round robin for the time being. 185 * 186 * XXXRW: The notion of a bucket to CPU mapping is common at 187 * both pcbgroup and RSS layers -- does that mean that we 188 * should migrate it all from RSS to here, and just leave RSS 189 * responsible only for providing hashing and mapping funtions? 190 */ 191#ifdef RSS 192 pcbgroup->ipg_cpu = rss_getcpu(pgn); 193#else 194 pcbgroup->ipg_cpu = (pgn % mp_ncpus); 195#endif 196 } 197} 198 199void 200in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) 201{ 202 struct inpcbgroup *pcbgroup; 203 u_int pgn; 204 205 if (pcbinfo->ipi_npcbgroups == 0) 206 return; 207 208 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 209 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 210 KASSERT(CK_LIST_EMPTY(pcbinfo->ipi_listhead), 211 ("in_pcbinfo_destroy: listhead not empty")); 212 INP_GROUP_LOCK_DESTROY(pcbgroup); 213 hashdestroy(pcbgroup->ipg_hashbase, M_PCB, 214 pcbgroup->ipg_hashmask); 215 } 216 hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); 217 free(pcbinfo->ipi_pcbgroups, M_PCB); 218 pcbinfo->ipi_pcbgroups = NULL; 219 pcbinfo->ipi_npcbgroups = 0; 220 pcbinfo->ipi_hashfields = 0; 221} 222 223/* 224 * Given a hash of whatever the covered tuple might be, return a pcbgroup 225 * index. 
Where RSS is supported, try to align bucket selection with RSS CPU 226 * affinity strategy. 227 */ 228static __inline u_int 229in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) 230{ 231 232#ifdef RSS 233 return (rss_getbucket(hash)); 234#else 235 return (hash % pcbinfo->ipi_npcbgroups); 236#endif 237} 238 239/* 240 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash 241 * information is insufficient to identify the pcbgroup. This might occur if 242 * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but 243 * RSS is not compiled into the kernel. 244 */ 245struct inpcbgroup * 246in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) 247{ 248 249#ifdef RSS 250 if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && 251 hashtype == M_HASHTYPE_RSS_TCP_IPV4) || 252 (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && 253 hashtype == M_HASHTYPE_RSS_UDP_IPV4) || 254 (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && 255 hashtype == M_HASHTYPE_RSS_IPV4)) 256 return (&pcbinfo->ipi_pcbgroups[ 257 in_pcbgroup_getbucket(pcbinfo, hash)]); 258#endif 259 return (NULL); 260} 261 262static struct inpcbgroup * 263in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) 264{ 265 266 return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), 267 m->m_pkthdr.flowid)); 268} 269 270struct inpcbgroup * 271in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, 272 u_short lport, struct in_addr faddr, u_short fport) 273{ 274 uint32_t hash; 275 276 /* 277 * RSS note: we pass foreign addr/port as source, and local addr/port 278 * as destination, as we want to align with what the hardware is 279 * doing. 
280 */ 281 switch (pcbinfo->ipi_hashfields) { 282 case IPI_HASHFIELDS_4TUPLE: 283#ifdef RSS 284 hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); 285#else 286 hash = faddr.s_addr ^ fport; 287#endif 288 break; 289 290 case IPI_HASHFIELDS_2TUPLE: 291#ifdef RSS 292 hash = rss_hash_ip4_2tuple(faddr, laddr); 293#else 294 hash = faddr.s_addr ^ laddr.s_addr; 295#endif 296 break; 297 298 default: 299 hash = 0; 300 } 301 return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, 302 hash)]); 303} 304 305struct inpcbgroup * 306in_pcbgroup_byinpcb(struct inpcb *inp) 307{ 308#ifdef RSS 309 /* 310 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined 311 * RSS bucket and thus we should use this pcbgroup, rather than 312 * using a tuple or hash. 313 * 314 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket 315 * fits in that! 316 */ 317 if (inp->inp_flags2 & INP_RSS_BUCKET_SET) 318 return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); 319#endif 320 321 return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, 322 inp->inp_lport, inp->inp_faddr, inp->inp_fport)); 323} 324 325static void 326in_pcbwild_add(struct inpcb *inp) 327{ 328 struct inpcbinfo *pcbinfo; 329 struct inpcbhead *head; 330 u_int pgn; 331 332 INP_WLOCK_ASSERT(inp); 333 KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), 334 ("%s: is wild",__func__)); 335 336 pcbinfo = inp->inp_pcbinfo; 337 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 338 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 339 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, 340 0, pcbinfo->ipi_wildmask)]; 341 CK_LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); 342 inp->inp_flags2 |= INP_PCBGROUPWILD; 343 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 344 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 345} 346 347static void 348in_pcbwild_remove(struct inpcb *inp) 349{ 350 struct inpcbinfo *pcbinfo; 351 u_int pgn; 352 353 INP_WLOCK_ASSERT(inp); 354 
KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), 355 ("%s: not wild", __func__)); 356 357 pcbinfo = inp->inp_pcbinfo; 358 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 359 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 360 CK_LIST_REMOVE(inp, inp_pcbgroup_wild); 361 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 362 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 363 inp->inp_flags2 &= ~INP_PCBGROUPWILD; 364} 365 366static __inline int 367in_pcbwild_needed(struct inpcb *inp) 368{ 369#ifdef RSS 370 /* 371 * If it's a listen socket and INP_RSS_BUCKET_SET is set, 372 * it's a wildcard socket _but_ it's in a specific pcbgroup. 373 * Thus we don't treat it as a pcbwild inp. 374 */ 375 if (inp->inp_flags2 & INP_RSS_BUCKET_SET) 376 return (0); 377#endif 378 379#ifdef INET6 380 if (inp->inp_vflag & INP_IPV6) 381 return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); 382 else 383#endif 384 return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); 385} 386 387static void 388in_pcbwild_update_internal(struct inpcb *inp) 389{ 390 int wildcard_needed; 391 392 wildcard_needed = in_pcbwild_needed(inp); 393 if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) 394 in_pcbwild_add(inp); 395 else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) 396 in_pcbwild_remove(inp); 397} 398 399/* 400 * Update the pcbgroup of an inpcb, which might include removing an old 401 * pcbgroup reference and/or adding a new one. Wildcard processing is not 402 * performed here, although ideally we'll never install a pcbgroup for a 403 * wildcard inpcb (asserted below). 
404 */ 405static void 406in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, 407 struct inpcbgroup *newpcbgroup, struct inpcb *inp) 408{ 409 struct inpcbgroup *oldpcbgroup; 410 struct inpcbhead *pcbhash; 411 uint32_t hashkey_faddr; 412 413 INP_WLOCK_ASSERT(inp); 414 415 oldpcbgroup = inp->inp_pcbgroup; 416 if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 417 INP_GROUP_LOCK(oldpcbgroup); 418 CK_LIST_REMOVE(inp, inp_pcbgrouphash); 419 inp->inp_pcbgroup = NULL; 420 INP_GROUP_UNLOCK(oldpcbgroup); 421 } 422 if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 423#ifdef INET6 424 if (inp->inp_vflag & INP_IPV6) 425 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 426 else 427#endif 428 hashkey_faddr = inp->inp_faddr.s_addr; 429 INP_GROUP_LOCK(newpcbgroup); 430 /* 431 * If the inp is an RSS bucket wildcard entry, ensure 432 * that the PCB hash is calculated correctly. 433 * 434 * The wildcard hash calculation differs from the 435 * non-wildcard definition. The source address is 436 * INADDR_ANY and the far port is 0. 
437 */ 438 if (inp->inp_flags2 & INP_RSS_BUCKET_SET) { 439 pcbhash = &newpcbgroup->ipg_hashbase[ 440 INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0, 441 newpcbgroup->ipg_hashmask)]; 442 } else { 443 pcbhash = &newpcbgroup->ipg_hashbase[ 444 INP_PCBHASH(hashkey_faddr, inp->inp_lport, 445 inp->inp_fport, 446 newpcbgroup->ipg_hashmask)]; 447 } 448 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); 449 inp->inp_pcbgroup = newpcbgroup; 450 INP_GROUP_UNLOCK(newpcbgroup); 451 } 452 453 KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), 454 ("%s: pcbgroup and wildcard!", __func__)); 455} 456 457/* 458 * Two update paths: one in which the 4-tuple on an inpcb has been updated 459 * and therefore connection groups may need to change (or a wildcard entry 460 * may needed to be installed), and another in which the 4-tuple has been 461 * set as a result of a packet received, in which case we may be able to use 462 * the hash on the mbuf to avoid doing a software hash calculation for RSS. 463 * 464 * In each case: first, let the wildcard code have a go at placing it as a 465 * wildcard socket. If it was a wildcard, or if the connection has been 466 * dropped, then no pcbgroup is required (so potentially clear it); 467 * otherwise, calculate and update the pcbgroup for the inpcb. 
468 */ 469void 470in_pcbgroup_update(struct inpcb *inp) 471{ 472 struct inpcbinfo *pcbinfo; 473 struct inpcbgroup *newpcbgroup; 474 475 INP_WLOCK_ASSERT(inp); 476 477 pcbinfo = inp->inp_pcbinfo; 478 if (!in_pcbgroup_enabled(pcbinfo)) 479 return; 480 481 in_pcbwild_update_internal(inp); 482 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 483 !(inp->inp_flags & INP_DROPPED)) { 484#ifdef INET6 485 if (inp->inp_vflag & INP_IPV6) 486 newpcbgroup = in6_pcbgroup_byinpcb(inp); 487 else 488#endif 489 newpcbgroup = in_pcbgroup_byinpcb(inp); 490 } else 491 newpcbgroup = NULL; 492 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 493} 494 495void 496in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) 497{ 498 struct inpcbinfo *pcbinfo; 499 struct inpcbgroup *newpcbgroup; 500 501 INP_WLOCK_ASSERT(inp); 502 503 pcbinfo = inp->inp_pcbinfo; 504 if (!in_pcbgroup_enabled(pcbinfo)) 505 return; 506 507 /* 508 * Possibly should assert !INP_PCBGROUPWILD rather than testing for 509 * it; presumably this function should never be called for anything 510 * other than non-wildcard socket? 511 */ 512 in_pcbwild_update_internal(inp); 513 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 514 !(inp->inp_flags & INP_DROPPED)) { 515 newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); 516#ifdef INET6 517 if (inp->inp_vflag & INP_IPV6) { 518 if (newpcbgroup == NULL) 519 newpcbgroup = in6_pcbgroup_byinpcb(inp); 520 } else { 521#endif 522 if (newpcbgroup == NULL) 523 newpcbgroup = in_pcbgroup_byinpcb(inp); 524#ifdef INET6 525 } 526#endif 527 } else 528 newpcbgroup = NULL; 529 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 530} 531 532/* 533 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 
534 */ 535void 536in_pcbgroup_remove(struct inpcb *inp) 537{ 538 struct inpcbgroup *pcbgroup; 539 540 INP_WLOCK_ASSERT(inp); 541 542 if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) 543 return; 544 545 if (inp->inp_flags2 & INP_PCBGROUPWILD) 546 in_pcbwild_remove(inp); 547 548 pcbgroup = inp->inp_pcbgroup; 549 if (pcbgroup != NULL) { 550 INP_GROUP_LOCK(pcbgroup); 551 CK_LIST_REMOVE(inp, inp_pcbgrouphash); 552 inp->inp_pcbgroup = NULL; 553 INP_GROUP_UNLOCK(pcbgroup); 554 } 555} 556 557/* 558 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs 559 * for a protocol. 560 */ 561int 562in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) 563{ 564 565 return (pcbinfo->ipi_npcbgroups > 0); 566} 567