1/*- 2 * Copyright (c) 2010-2011 Juniper Networks, Inc. 3 * All rights reserved. 4 * 5 * This software was developed by Robert N. M. Watson under contract 6 * to Juniper Networks, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include <sys/cdefs.h> 31 32__FBSDID("$FreeBSD$"); 33 34#include "opt_inet6.h" 35 36#include <sys/param.h> 37#include <sys/lock.h> 38#include <sys/malloc.h> 39#include <sys/mbuf.h> 40#include <sys/mutex.h> 41#include <sys/smp.h> 42#include <sys/socketvar.h> 43 44#include <netinet/in.h> 45#include <netinet/in_pcb.h> 46#ifdef INET6 47#include <netinet6/in6_pcb.h> 48#endif /* INET6 */ 49 50/* 51 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's 52 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization 53 * Strategies in Modern Operating Systems". This implementation differs 54 * significantly from that described in the paper, in that it attempts to 55 * introduce not just notions of affinity for connections and distribute work 56 * so as to reduce lock contention, but also align those notions with 57 * hardware work distribution strategies such as RSS. In this construction, 58 * connection groups supplement, rather than replace, existing reservation 59 * tables for protocol 4-tuples, offering CPU-affine lookup tables with 60 * minimal cache line migration and lock contention during steady state 61 * operation. 62 * 63 * Internet protocols, such as UDP and TCP, register to use connection groups 64 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this 65 * indicates to the connection group code whether a 2-tuple or 4-tuple is 66 * used as an argument to hashes that assign a connection to a particular 67 * group. This must be aligned with any hardware offloaded distribution 68 * model, such as RSS or similar approaches taken in embedded network boards. 69 * Wildcard sockets require special handling, as in Willman 2006, and are 70 * shared between connection groups -- while being protected by group-local 71 * locks. This means that connection establishment and teardown can be 72 * signficantly more expensive than without connection groups, but that 73 * steady-state processing can be significantly faster. 74 * 75 * Most of the implementation of connection groups is in this file; however, 76 * connection group lookup is implemented in in_pcb.c alongside reservation 77 * table lookups -- see in_pcblookup_group(). 78 * 79 * TODO: 80 * 81 * Implement dynamic rebalancing of buckets with connection groups; when 82 * load is unevenly distributed, search for more optimal balancing on 83 * demand. This might require scaling up the number of connection groups 84 * by <<1. 85 * 86 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection 87 * groups for ip_input and ip6_input, allowing non-offloaded work 88 * distribution. 89 * 90 * Expose effective CPU affinity of connections to userspace using socket 91 * options. 92 * 93 * Investigate per-connection affinity overrides based on socket options; an 94 * option could be set, certainly resulting in work being distributed 95 * differently in software, and possibly propagated to supporting hardware 96 * with TCAMs or hardware hash tables. This might require connections to 97 * exist in more than one connection group at a time. 98 * 99 * Hook netisr thread reconfiguration events, and propagate those to RSS so 100 * that rebalancing can occur when the thread pool grows or shrinks. 101 * 102 * Expose per-pcbgroup statistics to userspace monitoring tools such as 103 * netstat, in order to allow better debugging and profiling. 104 */ 105 106void 107in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, 108 int hash_nelements) 109{ 110 struct inpcbgroup *pcbgroup; 111 u_int numpcbgroups, pgn; 112 113 /* 114 * Only enable connection groups for a protocol if it has been 115 * specifically requested. 116 */ 117 if (hashfields == IPI_HASHFIELDS_NONE) 118 return; 119 120 /* 121 * Connection groups are about multi-processor load distribution, 122 * lock contention, and connection CPU affinity. As such, no point 123 * in turning them on for a uniprocessor machine, it only wastes 124 * memory. 125 */ 126 if (mp_ncpus == 1) 127 return; 128 129 /* 130 * Use one group per CPU for now. If we decide to do dynamic 131 * rebalancing a la RSS, we'll need to shift left by at least 1. 132 */ 133 numpcbgroups = mp_ncpus; 134 135 pcbinfo->ipi_hashfields = hashfields; 136 pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * 137 sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); 138 pcbinfo->ipi_npcbgroups = numpcbgroups; 139 pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, 140 &pcbinfo->ipi_wildmask); 141 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 142 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 143 pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, 144 &pcbgroup->ipg_hashmask); 145 INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); 146 147 /* 148 * Initialise notional affinity of the pcbgroup -- for RSS, 149 * we want the same notion of affinity as NICs to be used. 150 * Just round robin for the time being. 151 */ 152 pcbgroup->ipg_cpu = (pgn % mp_ncpus); 153 } 154} 155 156void 157in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) 158{ 159 struct inpcbgroup *pcbgroup; 160 u_int pgn; 161 162 if (pcbinfo->ipi_npcbgroups == 0) 163 return; 164 165 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { 166 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; 167 KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), 168 ("in_pcbinfo_destroy: listhead not empty")); 169 INP_GROUP_LOCK_DESTROY(pcbgroup); 170 hashdestroy(pcbgroup->ipg_hashbase, M_PCB, 171 pcbgroup->ipg_hashmask); 172 } 173 hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); 174 free(pcbinfo->ipi_pcbgroups, M_PCB); 175 pcbinfo->ipi_pcbgroups = NULL; 176 pcbinfo->ipi_npcbgroups = 0; 177 pcbinfo->ipi_hashfields = 0; 178} 179 180/* 181 * Given a hash of whatever the covered tuple might be, return a pcbgroup 182 * index. 183 */ 184static __inline u_int 185in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) 186{ 187 188 return (hash % pcbinfo->ipi_npcbgroups); 189} 190 191/* 192 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash 193 * information is insufficient to identify the pcbgroup. 194 */ 195struct inpcbgroup * 196in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) 197{ 198 199 return (NULL); 200} 201 202static struct inpcbgroup * 203in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) 204{ 205 206 return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), 207 m->m_pkthdr.flowid)); 208} 209 210struct inpcbgroup * 211in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, 212 u_short lport, struct in_addr faddr, u_short fport) 213{ 214 uint32_t hash; 215 216 switch (pcbinfo->ipi_hashfields) { 217 case IPI_HASHFIELDS_4TUPLE: 218 hash = faddr.s_addr ^ fport; 219 break; 220 221 case IPI_HASHFIELDS_2TUPLE: 222 hash = faddr.s_addr ^ laddr.s_addr; 223 break; 224 225 default: 226 hash = 0; 227 } 228 return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, 229 hash)]); 230} 231 232struct inpcbgroup * 233in_pcbgroup_byinpcb(struct inpcb *inp) 234{ 235 236 return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, 237 inp->inp_lport, inp->inp_faddr, inp->inp_fport)); 238} 239 240static void 241in_pcbwild_add(struct inpcb *inp) 242{ 243 struct inpcbinfo *pcbinfo; 244 struct inpcbhead *head; 245 u_int pgn; 246 247 INP_WLOCK_ASSERT(inp); 248 KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), 249 ("%s: is wild",__func__)); 250 251 pcbinfo = inp->inp_pcbinfo; 252 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 253 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 254 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, 255 0, pcbinfo->ipi_wildmask)]; 256 LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); 257 inp->inp_flags2 |= INP_PCBGROUPWILD; 258 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 259 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 260} 261 262static void 263in_pcbwild_remove(struct inpcb *inp) 264{ 265 struct inpcbinfo *pcbinfo; 266 u_int pgn; 267 268 INP_WLOCK_ASSERT(inp); 269 KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), 270 ("%s: not wild", __func__)); 271 272 pcbinfo = inp->inp_pcbinfo; 273 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 274 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); 275 LIST_REMOVE(inp, inp_pcbgroup_wild); 276 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) 277 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); 278 inp->inp_flags2 &= ~INP_PCBGROUPWILD; 279} 280 281static __inline int 282in_pcbwild_needed(struct inpcb *inp) 283{ 284 285#ifdef INET6 286 if (inp->inp_vflag & INP_IPV6) 287 return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); 288 else 289#endif 290 return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); 291} 292 293static void 294in_pcbwild_update_internal(struct inpcb *inp) 295{ 296 int wildcard_needed; 297 298 wildcard_needed = in_pcbwild_needed(inp); 299 if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) 300 in_pcbwild_add(inp); 301 else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) 302 in_pcbwild_remove(inp); 303} 304 305/* 306 * Update the pcbgroup of an inpcb, which might include removing an old 307 * pcbgroup reference and/or adding a new one. Wildcard processing is not 308 * performed here, although ideally we'll never install a pcbgroup for a 309 * wildcard inpcb (asserted below). 310 */ 311static void 312in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, 313 struct inpcbgroup *newpcbgroup, struct inpcb *inp) 314{ 315 struct inpcbgroup *oldpcbgroup; 316 struct inpcbhead *pcbhash; 317 uint32_t hashkey_faddr; 318 319 INP_WLOCK_ASSERT(inp); 320 321 oldpcbgroup = inp->inp_pcbgroup; 322 if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 323 INP_GROUP_LOCK(oldpcbgroup); 324 LIST_REMOVE(inp, inp_pcbgrouphash); 325 inp->inp_pcbgroup = NULL; 326 INP_GROUP_UNLOCK(oldpcbgroup); 327 } 328 if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { 329#ifdef INET6 330 if (inp->inp_vflag & INP_IPV6) 331 hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */ 332 else 333#endif 334 hashkey_faddr = inp->inp_faddr.s_addr; 335 INP_GROUP_LOCK(newpcbgroup); 336 pcbhash = &newpcbgroup->ipg_hashbase[ 337 INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, 338 newpcbgroup->ipg_hashmask)]; 339 LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); 340 inp->inp_pcbgroup = newpcbgroup; 341 INP_GROUP_UNLOCK(newpcbgroup); 342 } 343 344 KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), 345 ("%s: pcbgroup and wildcard!", __func__)); 346} 347 348/* 349 * Two update paths: one in which the 4-tuple on an inpcb has been updated 350 * and therefore connection groups may need to change (or a wildcard entry 351 * may needed to be installed), and another in which the 4-tuple has been 352 * set as a result of a packet received, in which case we may be able to use 353 * the hash on the mbuf to avoid doing a software hash calculation for RSS. 354 * 355 * In each case: first, let the wildcard code have a go at placing it as a 356 * wildcard socket. If it was a wildcard, or if the connection has been 357 * dropped, then no pcbgroup is required (so potentially clear it); 358 * otherwise, calculate and update the pcbgroup for the inpcb. 359 */ 360void 361in_pcbgroup_update(struct inpcb *inp) 362{ 363 struct inpcbinfo *pcbinfo; 364 struct inpcbgroup *newpcbgroup; 365 366 INP_WLOCK_ASSERT(inp); 367 368 pcbinfo = inp->inp_pcbinfo; 369 if (!in_pcbgroup_enabled(pcbinfo)) 370 return; 371 372 in_pcbwild_update_internal(inp); 373 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 374 !(inp->inp_flags & INP_DROPPED)) { 375#ifdef INET6 376 if (inp->inp_vflag & INP_IPV6) 377 newpcbgroup = in6_pcbgroup_byinpcb(inp); 378 else 379#endif 380 newpcbgroup = in_pcbgroup_byinpcb(inp); 381 } else 382 newpcbgroup = NULL; 383 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 384} 385 386void 387in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) 388{ 389 struct inpcbinfo *pcbinfo; 390 struct inpcbgroup *newpcbgroup; 391 392 INP_WLOCK_ASSERT(inp); 393 394 pcbinfo = inp->inp_pcbinfo; 395 if (!in_pcbgroup_enabled(pcbinfo)) 396 return; 397 398 /* 399 * Possibly should assert !INP_PCBGROUPWILD rather than testing for 400 * it; presumably this function should never be called for anything 401 * other than non-wildcard socket? 402 */ 403 in_pcbwild_update_internal(inp); 404 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && 405 !(inp->inp_flags & INP_DROPPED)) { 406 newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); 407#ifdef INET6 408 if (inp->inp_vflag & INP_IPV6) { 409 if (newpcbgroup == NULL) 410 newpcbgroup = in6_pcbgroup_byinpcb(inp); 411 } else { 412#endif 413 if (newpcbgroup == NULL) 414 newpcbgroup = in_pcbgroup_byinpcb(inp); 415#ifdef INET6 416 } 417#endif 418 } else 419 newpcbgroup = NULL; 420 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); 421} 422 423/* 424 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 425 */ 426void 427in_pcbgroup_remove(struct inpcb *inp) 428{ 429 struct inpcbgroup *pcbgroup; 430 431 INP_WLOCK_ASSERT(inp); 432 433 if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) 434 return; 435 436 if (inp->inp_flags2 & INP_PCBGROUPWILD) 437 in_pcbwild_remove(inp); 438 439 pcbgroup = inp->inp_pcbgroup; 440 if (pcbgroup != NULL) { 441 INP_GROUP_LOCK(pcbgroup); 442 LIST_REMOVE(inp, inp_pcbgrouphash); 443 inp->inp_pcbgroup = NULL; 444 INP_GROUP_UNLOCK(pcbgroup); 445 } 446} 447 448/* 449 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs 450 * for a protocol. 451 */ 452int 453in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) 454{ 455 456 return (pcbinfo->ipi_npcbgroups > 0); 457} 458