1222748Srwatson/*- 2222748Srwatson * Copyright (c) 2010-2011 Juniper Networks, Inc. 3222748Srwatson * All rights reserved. 4222748Srwatson * 5222748Srwatson * This software was developed by Robert N. M. Watson under contract 6222748Srwatson * to Juniper Networks, Inc. 7222748Srwatson * 8222748Srwatson * Redistribution and use in source and binary forms, with or without 9222748Srwatson * modification, are permitted provided that the following conditions 10222748Srwatson * are met: 11222748Srwatson * 1. Redistributions of source code must retain the above copyright 12222748Srwatson * notice, this list of conditions and the following disclaimer. 13222748Srwatson * 2. Redistributions in binary form must reproduce the above copyright 14222748Srwatson * notice, this list of conditions and the following disclaimer in the 15222748Srwatson * documentation and/or other materials provided with the distribution. 16222748Srwatson * 17222748Srwatson * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18222748Srwatson * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19222748Srwatson * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20222748Srwatson * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21222748Srwatson * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22222748Srwatson * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23222748Srwatson * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24222748Srwatson * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25222748Srwatson * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26222748Srwatson * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27222748Srwatson * SUCH DAMAGE. 
28222748Srwatson */ 29222748Srwatson 30222748Srwatson#include <sys/cdefs.h> 31222748Srwatson 32222748Srwatson__FBSDID("$FreeBSD$"); 33222748Srwatson 34222748Srwatson#include "opt_inet6.h" 35263198Srwatson#include "opt_rss.h" 36222748Srwatson 37222748Srwatson#include <sys/param.h> 38222748Srwatson#include <sys/lock.h> 39222748Srwatson#include <sys/malloc.h> 40222748Srwatson#include <sys/mbuf.h> 41222748Srwatson#include <sys/mutex.h> 42222748Srwatson#include <sys/smp.h> 43297439Sgnn#include <sys/socket.h> 44222748Srwatson#include <sys/socketvar.h> 45222748Srwatson 46277331Sadrian#include <net/rss_config.h> 47277331Sadrian 48222748Srwatson#include <netinet/in.h> 49277331Sadrian 50222748Srwatson#include <netinet/in_pcb.h> 51263198Srwatson#include <netinet/in_rss.h> 52222748Srwatson#ifdef INET6 53222748Srwatson#include <netinet6/in6_pcb.h> 54222748Srwatson#endif /* INET6 */ 55222748Srwatson 56222748Srwatson/* 57222748Srwatson * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's 58222748Srwatson * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization 59222748Srwatson * Strategies in Modern Operating Systems". This implementation differs 60222748Srwatson * significantly from that described in the paper, in that it attempts to 61222748Srwatson * introduce not just notions of affinity for connections and distribute work 62222748Srwatson * so as to reduce lock contention, but also align those notions with 63222748Srwatson * hardware work distribution strategies such as RSS. In this construction, 64222748Srwatson * connection groups supplement, rather than replace, existing reservation 65222748Srwatson * tables for protocol 4-tuples, offering CPU-affine lookup tables with 66222748Srwatson * minimal cache line migration and lock contention during steady state 67222748Srwatson * operation. 
 *
 * Hardware-offloaded hashes are often inefficient in software -- for
 * example, Toeplitz, specified by RSS, introduces a significant overhead if
 * performed during per-packet processing.  It is therefore desirable to fall
 * back on traditional reservation table lookups without affinity where
 * hardware-offloaded hashes aren't available, such as for traffic over
 * non-RSS interfaces.
 *
 * Internet protocols, such as UDP and TCP, register to use connection groups
 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
 * indicates to the connection group code whether a 2-tuple or 4-tuple is
 * used as an argument to hashes that assign a connection to a particular
 * group.  This must be aligned with any hardware offloaded distribution
 * model, such as RSS or similar approaches taken in embedded network boards.
 * Wildcard sockets require special handling, as in Willman 2006, and are
 * shared between connection groups -- while being protected by group-local
 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
 * steady-state processing can be significantly faster.
 *
 * When RSS is used, certain connection group parameters, such as the number
 * of groups, are provided by the RSS implementation, found in in_rss.c.
 * Otherwise, in_pcbgroup.c selects possible sensible parameters
 * corresponding to the degree of parallelism exposed by netisr.
92263198Srwatson * 93222748Srwatson * Most of the implementation of connection groups is in this file; however, 94222748Srwatson * connection group lookup is implemented in in_pcb.c alongside reservation 95222748Srwatson * table lookups -- see in_pcblookup_group(). 96222748Srwatson * 97222748Srwatson * TODO: 98222748Srwatson * 99222748Srwatson * Implement dynamic rebalancing of buckets with connection groups; when 100222748Srwatson * load is unevenly distributed, search for more optimal balancing on 101222748Srwatson * demand. This might require scaling up the number of connection groups 102222748Srwatson * by <<1. 103222748Srwatson * 104222748Srwatson * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection 105222748Srwatson * groups for ip_input and ip6_input, allowing non-offloaded work 106222748Srwatson * distribution. 107222748Srwatson * 108222748Srwatson * Expose effective CPU affinity of connections to userspace using socket 109222748Srwatson * options. 110222748Srwatson * 111222748Srwatson * Investigate per-connection affinity overrides based on socket options; an 112222748Srwatson * option could be set, certainly resulting in work being distributed 113222748Srwatson * differently in software, and possibly propagated to supporting hardware 114222748Srwatson * with TCAMs or hardware hash tables. This might require connections to 115222748Srwatson * exist in more than one connection group at a time. 116222748Srwatson * 117222748Srwatson * Hook netisr thread reconfiguration events, and propagate those to RSS so 118222748Srwatson * that rebalancing can occur when the thread pool grows or shrinks. 119222748Srwatson * 120222748Srwatson * Expose per-pcbgroup statistics to userspace monitoring tools such as 121222748Srwatson * netstat, in order to allow better debugging and profiling. 
122222748Srwatson */ 123222748Srwatson 124222748Srwatsonvoid 125222748Srwatsonin_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, 126222748Srwatson int hash_nelements) 127222748Srwatson{ 128222748Srwatson struct inpcbgroup *pcbgroup; 129222748Srwatson u_int numpcbgroups, pgn; 130222748Srwatson 131222748Srwatson /* 132222748Srwatson * Only enable connection groups for a protocol if it has been 133222748Srwatson * specifically requested. 134222748Srwatson */ 135222748Srwatson if (hashfields == IPI_HASHFIELDS_NONE) 136222748Srwatson return; 137222748Srwatson 138222748Srwatson /* 139222748Srwatson * Connection groups are about multi-processor load distribution, 140222748Srwatson * lock contention, and connection CPU affinity. As such, no point 141222748Srwatson * in turning them on for a uniprocessor machine, it only wastes 142222748Srwatson * memory. 143222748Srwatson */ 144222748Srwatson if (mp_ncpus == 1) 145222748Srwatson return; 146222748Srwatson 147263198Srwatson#ifdef RSS 148222748Srwatson /* 149263198Srwatson * If we're using RSS, then RSS determines the number of connection 150263198Srwatson * groups to use: one connection group per RSS bucket. If for some 151263198Srwatson * reason RSS isn't able to provide a number of buckets, disable 152263198Srwatson * connection groups entirely. 153263198Srwatson * 154263198Srwatson * XXXRW: Can this ever happen? 155222748Srwatson */ 156263198Srwatson numpcbgroups = rss_getnumbuckets(); 157263198Srwatson if (numpcbgroups == 0) 158263198Srwatson return; 159263198Srwatson#else 160263198Srwatson /* 161263198Srwatson * Otherwise, we'll just use one per CPU for now. If we decide to 162263198Srwatson * do dynamic rebalancing a la RSS, we'll need similar logic here. 
163263198Srwatson */ 164222748Srwatson numpcbgroups = mp_ncpus; 165263198Srwatson#endif 166222748Srwatson 167222748Srwatson pcbinfo->ipi_hashfields = hashfields; 168222748Srwatson pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * 169222748Srwatson sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); 170222748Srwatson pcbinfo->ipi_npcbgroups = numpcbgroups; 171222748Srwatson pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, 172222748Srwatson &pcbinfo->ipi_wildmask); 173222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 174222748Srwatson pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 175222748Srwatson pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, 176222748Srwatson &pcbgroup->ipg_hashmask); 177222748Srwatson INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); 178222748Srwatson 179222748Srwatson /* 180222748Srwatson * Initialise notional affinity of the pcbgroup -- for RSS, 181263198Srwatson * we want the same notion of affinity as NICs to be used. In 182263198Srwatson * the non-RSS case, just round robin for the time being. 183263198Srwatson * 184263198Srwatson * XXXRW: The notion of a bucket to CPU mapping is common at 185263198Srwatson * both pcbgroup and RSS layers -- does that mean that we 186263198Srwatson * should migrate it all from RSS to here, and just leave RSS 187263198Srwatson * responsible only for providing hashing and mapping funtions? 
188222748Srwatson */ 189263198Srwatson#ifdef RSS 190263198Srwatson pcbgroup->ipg_cpu = rss_getcpu(pgn); 191263198Srwatson#else 192222748Srwatson pcbgroup->ipg_cpu = (pgn % mp_ncpus); 193263198Srwatson#endif 194222748Srwatson } 195222748Srwatson} 196222748Srwatson 197222748Srwatsonvoid 198222748Srwatsonin_pcbgroup_destroy(struct inpcbinfo *pcbinfo) 199222748Srwatson{ 200222748Srwatson struct inpcbgroup *pcbgroup; 201222748Srwatson u_int pgn; 202222748Srwatson 203222748Srwatson if (pcbinfo->ipi_npcbgroups == 0) 204222748Srwatson return; 205222748Srwatson 206222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 207222748Srwatson pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 208222748Srwatson KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), 209222748Srwatson ("in_pcbinfo_destroy: listhead not empty")); 210222748Srwatson INP_GROUP_LOCK_DESTROY(pcbgroup); 211222748Srwatson hashdestroy(pcbgroup->ipg_hashbase, M_PCB, 212222748Srwatson pcbgroup->ipg_hashmask); 213222748Srwatson } 214222748Srwatson hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); 215222748Srwatson free(pcbinfo->ipi_pcbgroups, M_PCB); 216222748Srwatson pcbinfo->ipi_pcbgroups = NULL; 217222748Srwatson pcbinfo->ipi_npcbgroups = 0; 218222748Srwatson pcbinfo->ipi_hashfields = 0; 219222748Srwatson} 220222748Srwatson 221222748Srwatson/* 222222748Srwatson * Given a hash of whatever the covered tuple might be, return a pcbgroup 223263198Srwatson * index. Where RSS is supported, try to align bucket selection with RSS CPU 224263198Srwatson * affinity strategy. 
225222748Srwatson */ 226222748Srwatsonstatic __inline u_int 227222748Srwatsonin_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) 228222748Srwatson{ 229222748Srwatson 230263198Srwatson#ifdef RSS 231263198Srwatson return (rss_getbucket(hash)); 232263198Srwatson#else 233222748Srwatson return (hash % pcbinfo->ipi_npcbgroups); 234263198Srwatson#endif 235222748Srwatson} 236222748Srwatson 237222748Srwatson/* 238222748Srwatson * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash 239263198Srwatson * information is insufficient to identify the pcbgroup. This might occur if 240263198Srwatson * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but 241263198Srwatson * RSS is not compiled into the kernel. 242222748Srwatson */ 243222748Srwatsonstruct inpcbgroup * 244222748Srwatsonin_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) 245222748Srwatson{ 246222748Srwatson 247263198Srwatson#ifdef RSS 248263198Srwatson if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && 249263198Srwatson hashtype == M_HASHTYPE_RSS_TCP_IPV4) || 250268913Sadrian (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && 251268913Sadrian hashtype == M_HASHTYPE_RSS_UDP_IPV4) || 252263198Srwatson (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && 253263198Srwatson hashtype == M_HASHTYPE_RSS_IPV4)) 254263198Srwatson return (&pcbinfo->ipi_pcbgroups[ 255263198Srwatson in_pcbgroup_getbucket(pcbinfo, hash)]); 256263198Srwatson#endif 257222748Srwatson return (NULL); 258222748Srwatson} 259222748Srwatson 260222748Srwatsonstatic struct inpcbgroup * 261222748Srwatsonin_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) 262222748Srwatson{ 263222748Srwatson 264222748Srwatson return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), 265222748Srwatson m->m_pkthdr.flowid)); 266222748Srwatson} 267222748Srwatson 268222748Srwatsonstruct inpcbgroup * 269222748Srwatsonin_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, 
270222748Srwatson u_short lport, struct in_addr faddr, u_short fport) 271222748Srwatson{ 272222748Srwatson uint32_t hash; 273222748Srwatson 274263198Srwatson /* 275263198Srwatson * RSS note: we pass foreign addr/port as source, and local addr/port 276263198Srwatson * as destination, as we want to align with what the hardware is 277263198Srwatson * doing. 278263198Srwatson */ 279222748Srwatson switch (pcbinfo->ipi_hashfields) { 280222748Srwatson case IPI_HASHFIELDS_4TUPLE: 281263198Srwatson#ifdef RSS 282263198Srwatson hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); 283263198Srwatson#else 284222748Srwatson hash = faddr.s_addr ^ fport; 285263198Srwatson#endif 286222748Srwatson break; 287222748Srwatson 288222748Srwatson case IPI_HASHFIELDS_2TUPLE: 289263198Srwatson#ifdef RSS 290263198Srwatson hash = rss_hash_ip4_2tuple(faddr, laddr); 291263198Srwatson#else 292222748Srwatson hash = faddr.s_addr ^ laddr.s_addr; 293263198Srwatson#endif 294222748Srwatson break; 295222748Srwatson 296222748Srwatson default: 297222748Srwatson hash = 0; 298222748Srwatson } 299222748Srwatson return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, 300222748Srwatson hash)]); 301222748Srwatson} 302222748Srwatson 303222748Srwatsonstruct inpcbgroup * 304222748Srwatsonin_pcbgroup_byinpcb(struct inpcb *inp) 305222748Srwatson{ 306268479Sadrian#ifdef RSS 307268479Sadrian /* 308268479Sadrian * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined 309268479Sadrian * RSS bucket and thus we should use this pcbgroup, rather than 310268479Sadrian * using a tuple or hash. 311268479Sadrian * 312268479Sadrian * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket 313268479Sadrian * fits in that! 
314268479Sadrian */ 315268479Sadrian if (inp->inp_flags2 & INP_RSS_BUCKET_SET) 316268479Sadrian return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); 317268479Sadrian#endif 318222748Srwatson 319222748Srwatson return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, 320222748Srwatson inp->inp_lport, inp->inp_faddr, inp->inp_fport)); 321222748Srwatson} 322222748Srwatson 323222748Srwatsonstatic void 324222748Srwatsonin_pcbwild_add(struct inpcb *inp) 325222748Srwatson{ 326222748Srwatson struct inpcbinfo *pcbinfo; 327222748Srwatson struct inpcbhead *head; 328222748Srwatson u_int pgn; 329222748Srwatson 330222748Srwatson INP_WLOCK_ASSERT(inp); 331222748Srwatson KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), 332222748Srwatson ("%s: is wild",__func__)); 333222748Srwatson 334222748Srwatson pcbinfo = inp->inp_pcbinfo; 335222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 336222748Srwatson INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 337222748Srwatson head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, 338222748Srwatson 0, pcbinfo->ipi_wildmask)]; 339222748Srwatson LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); 340222748Srwatson inp->inp_flags2 |= INP_PCBGROUPWILD; 341222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 342222748Srwatson INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 343222748Srwatson} 344222748Srwatson 345222748Srwatsonstatic void 346222748Srwatsonin_pcbwild_remove(struct inpcb *inp) 347222748Srwatson{ 348222748Srwatson struct inpcbinfo *pcbinfo; 349222748Srwatson u_int pgn; 350222748Srwatson 351222748Srwatson INP_WLOCK_ASSERT(inp); 352222748Srwatson KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), 353222748Srwatson ("%s: not wild", __func__)); 354222748Srwatson 355222748Srwatson pcbinfo = inp->inp_pcbinfo; 356222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 357222748Srwatson INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 358222748Srwatson LIST_REMOVE(inp, 
inp_pcbgroup_wild); 359222748Srwatson for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 360222748Srwatson INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 361222748Srwatson inp->inp_flags2 &= ~INP_PCBGROUPWILD; 362222748Srwatson} 363222748Srwatson 364222748Srwatsonstatic __inline int 365222748Srwatsonin_pcbwild_needed(struct inpcb *inp) 366222748Srwatson{ 367268479Sadrian#ifdef RSS 368268479Sadrian /* 369268479Sadrian * If it's a listen socket and INP_RSS_BUCKET_SET is set, 370268479Sadrian * it's a wildcard socket _but_ it's in a specific pcbgroup. 371268479Sadrian * Thus we don't treat it as a pcbwild inp. 372268479Sadrian */ 373268479Sadrian if (inp->inp_flags2 & INP_RSS_BUCKET_SET) 374268479Sadrian return (0); 375268479Sadrian#endif 376222748Srwatson 377222748Srwatson#ifdef INET6 378222748Srwatson if (inp->inp_vflag & INP_IPV6) 379222748Srwatson return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); 380222748Srwatson else 381222748Srwatson#endif 382222748Srwatson return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); 383222748Srwatson} 384222748Srwatson 385222748Srwatsonstatic void 386222748Srwatsonin_pcbwild_update_internal(struct inpcb *inp) 387222748Srwatson{ 388222748Srwatson int wildcard_needed; 389222748Srwatson 390222748Srwatson wildcard_needed = in_pcbwild_needed(inp); 391222748Srwatson if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) 392222748Srwatson in_pcbwild_add(inp); 393222748Srwatson else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) 394222748Srwatson in_pcbwild_remove(inp); 395222748Srwatson} 396222748Srwatson 397222748Srwatson/* 398222748Srwatson * Update the pcbgroup of an inpcb, which might include removing an old 399222748Srwatson * pcbgroup reference and/or adding a new one. Wildcard processing is not 400222748Srwatson * performed here, although ideally we'll never install a pcbgroup for a 401222748Srwatson * wildcard inpcb (asserted below). 
402222748Srwatson */ 403222748Srwatsonstatic void 404222748Srwatsonin_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, 405222748Srwatson struct inpcbgroup *newpcbgroup, struct inpcb *inp) 406222748Srwatson{ 407222748Srwatson struct inpcbgroup *oldpcbgroup; 408222748Srwatson struct inpcbhead *pcbhash; 409222748Srwatson uint32_t hashkey_faddr; 410222748Srwatson 411222748Srwatson INP_WLOCK_ASSERT(inp); 412222748Srwatson 413222748Srwatson oldpcbgroup = inp->inp_pcbgroup; 414222748Srwatson if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 415222748Srwatson INP_GROUP_LOCK(oldpcbgroup); 416222748Srwatson LIST_REMOVE(inp, inp_pcbgrouphash); 417222748Srwatson inp->inp_pcbgroup = NULL; 418222748Srwatson INP_GROUP_UNLOCK(oldpcbgroup); 419222748Srwatson } 420222748Srwatson if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 421222748Srwatson#ifdef INET6 422222748Srwatson if (inp->inp_vflag & INP_IPV6) 423271386Sae hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 424222748Srwatson else 425222748Srwatson#endif 426222748Srwatson hashkey_faddr = inp->inp_faddr.s_addr; 427222748Srwatson INP_GROUP_LOCK(newpcbgroup); 428268479Sadrian /* 429268479Sadrian * If the inp is an RSS bucket wildcard entry, ensure 430268479Sadrian * that the PCB hash is calculated correctly. 431268479Sadrian * 432268479Sadrian * The wildcard hash calculation differs from the 433268479Sadrian * non-wildcard definition. The source address is 434268479Sadrian * INADDR_ANY and the far port is 0. 
435268479Sadrian */ 436268479Sadrian if (inp->inp_flags2 & INP_RSS_BUCKET_SET) { 437268479Sadrian pcbhash = &newpcbgroup->ipg_hashbase[ 438268479Sadrian INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0, 439268479Sadrian newpcbgroup->ipg_hashmask)]; 440268479Sadrian } else { 441268479Sadrian pcbhash = &newpcbgroup->ipg_hashbase[ 442268479Sadrian INP_PCBHASH(hashkey_faddr, inp->inp_lport, 443268479Sadrian inp->inp_fport, 444268479Sadrian newpcbgroup->ipg_hashmask)]; 445268479Sadrian } 446222748Srwatson LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); 447222748Srwatson inp->inp_pcbgroup = newpcbgroup; 448222748Srwatson INP_GROUP_UNLOCK(newpcbgroup); 449222748Srwatson } 450222748Srwatson 451222748Srwatson KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), 452222748Srwatson ("%s: pcbgroup and wildcard!", __func__)); 453222748Srwatson} 454222748Srwatson 455222748Srwatson/* 456222748Srwatson * Two update paths: one in which the 4-tuple on an inpcb has been updated 457222748Srwatson * and therefore connection groups may need to change (or a wildcard entry 458222748Srwatson * may needed to be installed), and another in which the 4-tuple has been 459222748Srwatson * set as a result of a packet received, in which case we may be able to use 460222748Srwatson * the hash on the mbuf to avoid doing a software hash calculation for RSS. 461222748Srwatson * 462222748Srwatson * In each case: first, let the wildcard code have a go at placing it as a 463222748Srwatson * wildcard socket. If it was a wildcard, or if the connection has been 464222748Srwatson * dropped, then no pcbgroup is required (so potentially clear it); 465222748Srwatson * otherwise, calculate and update the pcbgroup for the inpcb. 
466222748Srwatson */ 467222748Srwatsonvoid 468222748Srwatsonin_pcbgroup_update(struct inpcb *inp) 469222748Srwatson{ 470222748Srwatson struct inpcbinfo *pcbinfo; 471222748Srwatson struct inpcbgroup *newpcbgroup; 472222748Srwatson 473222748Srwatson INP_WLOCK_ASSERT(inp); 474222748Srwatson 475222748Srwatson pcbinfo = inp->inp_pcbinfo; 476222748Srwatson if (!in_pcbgroup_enabled(pcbinfo)) 477222748Srwatson return; 478222748Srwatson 479222748Srwatson in_pcbwild_update_internal(inp); 480222748Srwatson if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 481222748Srwatson !(inp->inp_flags & INP_DROPPED)) { 482222748Srwatson#ifdef INET6 483222748Srwatson if (inp->inp_vflag & INP_IPV6) 484222748Srwatson newpcbgroup = in6_pcbgroup_byinpcb(inp); 485222748Srwatson else 486222748Srwatson#endif 487222748Srwatson newpcbgroup = in_pcbgroup_byinpcb(inp); 488222748Srwatson } else 489222748Srwatson newpcbgroup = NULL; 490222748Srwatson in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 491222748Srwatson} 492222748Srwatson 493222748Srwatsonvoid 494222748Srwatsonin_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) 495222748Srwatson{ 496222748Srwatson struct inpcbinfo *pcbinfo; 497222748Srwatson struct inpcbgroup *newpcbgroup; 498222748Srwatson 499222748Srwatson INP_WLOCK_ASSERT(inp); 500222748Srwatson 501222748Srwatson pcbinfo = inp->inp_pcbinfo; 502222748Srwatson if (!in_pcbgroup_enabled(pcbinfo)) 503222748Srwatson return; 504222748Srwatson 505222748Srwatson /* 506222748Srwatson * Possibly should assert !INP_PCBGROUPWILD rather than testing for 507222748Srwatson * it; presumably this function should never be called for anything 508222748Srwatson * other than non-wildcard socket? 
509222748Srwatson */ 510222748Srwatson in_pcbwild_update_internal(inp); 511222748Srwatson if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 512222748Srwatson !(inp->inp_flags & INP_DROPPED)) { 513222748Srwatson newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); 514222748Srwatson#ifdef INET6 515222748Srwatson if (inp->inp_vflag & INP_IPV6) { 516222748Srwatson if (newpcbgroup == NULL) 517222748Srwatson newpcbgroup = in6_pcbgroup_byinpcb(inp); 518222748Srwatson } else { 519222748Srwatson#endif 520222748Srwatson if (newpcbgroup == NULL) 521222748Srwatson newpcbgroup = in_pcbgroup_byinpcb(inp); 522222748Srwatson#ifdef INET6 523222748Srwatson } 524222748Srwatson#endif 525222748Srwatson } else 526222748Srwatson newpcbgroup = NULL; 527222748Srwatson in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 528222748Srwatson} 529222748Srwatson 530222748Srwatson/* 531222748Srwatson * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 532222748Srwatson */ 533222748Srwatsonvoid 534222748Srwatsonin_pcbgroup_remove(struct inpcb *inp) 535222748Srwatson{ 536222748Srwatson struct inpcbgroup *pcbgroup; 537222748Srwatson 538222748Srwatson INP_WLOCK_ASSERT(inp); 539222748Srwatson 540222748Srwatson if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) 541222748Srwatson return; 542222748Srwatson 543222748Srwatson if (inp->inp_flags2 & INP_PCBGROUPWILD) 544222748Srwatson in_pcbwild_remove(inp); 545222748Srwatson 546222748Srwatson pcbgroup = inp->inp_pcbgroup; 547222748Srwatson if (pcbgroup != NULL) { 548222748Srwatson INP_GROUP_LOCK(pcbgroup); 549222748Srwatson LIST_REMOVE(inp, inp_pcbgrouphash); 550222748Srwatson inp->inp_pcbgroup = NULL; 551222748Srwatson INP_GROUP_UNLOCK(pcbgroup); 552222748Srwatson } 553222748Srwatson} 554222748Srwatson 555222748Srwatson/* 556222748Srwatson * Query whether or not it is appropriate to use pcbgroups to look up inpcbs 557222748Srwatson * for a protocol. 
558222748Srwatson */ 559222748Srwatsonint 560222748Srwatsonin_pcbgroup_enabled(struct inpcbinfo *pcbinfo) 561222748Srwatson{ 562222748Srwatson 563222748Srwatson return (pcbinfo->ipi_npcbgroups > 0); 564222748Srwatson} 565