in_pcbgroup.c revision 268479
1/*-
2 * Copyright (c) 2010-2011 Juniper Networks, Inc.
3 * All rights reserved.
4 *
5 * This software was developed by Robert N. M. Watson under contract
6 * to Juniper Networks, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31
32__FBSDID("$FreeBSD: head/sys/netinet/in_pcbgroup.c 268479 2014-07-10 03:10:56Z adrian $");
33
34#include "opt_inet6.h"
35#include "opt_rss.h"
36
37#include <sys/param.h>
38#include <sys/lock.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/mutex.h>
42#include <sys/smp.h>
43#include <sys/socketvar.h>
44
45#include <netinet/in.h>
46#include <netinet/in_pcb.h>
47#include <netinet/in_rss.h>
48#ifdef INET6
49#include <netinet6/in6_pcb.h>
50#endif /* INET6 */
51
52/*
53 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
54 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
55 * Strategies in Modern Operating Systems".  This implementation differs
56 * significantly from that described in the paper, in that it attempts to
57 * introduce not just notions of affinity for connections and distribute work
58 * so as to reduce lock contention, but also align those notions with
59 * hardware work distribution strategies such as RSS.  In this construction,
60 * connection groups supplement, rather than replace, existing reservation
61 * tables for protocol 4-tuples, offering CPU-affine lookup tables with
62 * minimal cache line migration and lock contention during steady state
63 * operation.
64 *
65 * Hardware-offloaded checksums are often inefficient in software -- for
66 * example, Toeplitz, specified by RSS, introduced a significant overhead if
67 * performed during per-packet processing.  It is therefore desirable to fall
68 * back on traditional reservation table lookups without affinity where
69 * hardware-offloaded checksums aren't available, such as for traffic over
70 * non-RSS interfaces.
71 *
72 * Internet protocols, such as UDP and TCP, register to use connection groups
73 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
74 * indicates to the connection group code whether a 2-tuple or 4-tuple is
75 * used as an argument to hashes that assign a connection to a particular
76 * group.  This must be aligned with any hardware offloaded distribution
77 * model, such as RSS or similar approaches taken in embedded network boards.
78 * Wildcard sockets require special handling, as in Willman 2006, and are
79 * shared between connection groups -- while being protected by group-local
80 * locks.  This means that connection establishment and teardown can be
81 * significantly more expensive than without connection groups, but that
82 * steady-state processing can be significantly faster.
83 *
84 * When RSS is used, certain connection group parameters, such as the number
85 * of groups, are provided by the RSS implementation, found in in_rss.c.
86 * Otherwise, in_pcbgroup.c selects possible sensible parameters
87 * corresponding to the degree of parallelism exposed by netisr.
88 *
89 * Most of the implementation of connection groups is in this file; however,
90 * connection group lookup is implemented in in_pcb.c alongside reservation
91 * table lookups -- see in_pcblookup_group().
92 *
93 * TODO:
94 *
95 * Implement dynamic rebalancing of buckets with connection groups; when
96 * load is unevenly distributed, search for more optimal balancing on
97 * demand.  This might require scaling up the number of connection groups
98 * by <<1.
99 *
100 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
101 * groups for ip_input and ip6_input, allowing non-offloaded work
102 * distribution.
103 *
104 * Expose effective CPU affinity of connections to userspace using socket
105 * options.
106 *
107 * Investigate per-connection affinity overrides based on socket options; an
108 * option could be set, certainly resulting in work being distributed
109 * differently in software, and possibly propagated to supporting hardware
110 * with TCAMs or hardware hash tables.  This might require connections to
111 * exist in more than one connection group at a time.
112 *
113 * Hook netisr thread reconfiguration events, and propagate those to RSS so
114 * that rebalancing can occur when the thread pool grows or shrinks.
115 *
116 * Expose per-pcbgroup statistics to userspace monitoring tools such as
117 * netstat, in order to allow better debugging and profiling.
118 */
119
120void
121in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
122    int hash_nelements)
123{
124	struct inpcbgroup *pcbgroup;
125	u_int numpcbgroups, pgn;
126
127	/*
128	 * Only enable connection groups for a protocol if it has been
129	 * specifically requested.
130	 */
131	if (hashfields == IPI_HASHFIELDS_NONE)
132		return;
133
134	/*
135	 * Connection groups are about multi-processor load distribution,
136	 * lock contention, and connection CPU affinity.  As such, no point
137	 * in turning them on for a uniprocessor machine, it only wastes
138	 * memory.
139	 */
140	if (mp_ncpus == 1)
141		return;
142
143#ifdef RSS
144	/*
145	 * If we're using RSS, then RSS determines the number of connection
146	 * groups to use: one connection group per RSS bucket.  If for some
147	 * reason RSS isn't able to provide a number of buckets, disable
148	 * connection groups entirely.
149	 *
150	 * XXXRW: Can this ever happen?
151	 */
152	numpcbgroups = rss_getnumbuckets();
153	if (numpcbgroups == 0)
154		return;
155#else
156	/*
157	 * Otherwise, we'll just use one per CPU for now.  If we decide to
158	 * do dynamic rebalancing a la RSS, we'll need similar logic here.
159	 */
160	numpcbgroups = mp_ncpus;
161#endif
162
163	pcbinfo->ipi_hashfields = hashfields;
164	pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
165	    sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
166	pcbinfo->ipi_npcbgroups = numpcbgroups;
167	pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
168	    &pcbinfo->ipi_wildmask);
169	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
170		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
171		pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
172		    &pcbgroup->ipg_hashmask);
173		INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
174
175		/*
176		 * Initialise notional affinity of the pcbgroup -- for RSS,
177		 * we want the same notion of affinity as NICs to be used.  In
178		 * the non-RSS case, just round robin for the time being.
179		 *
180		 * XXXRW: The notion of a bucket to CPU mapping is common at
181		 * both pcbgroup and RSS layers -- does that mean that we
182		 * should migrate it all from RSS to here, and just leave RSS
183		 * responsible only for providing hashing and mapping funtions?
184		 */
185#ifdef RSS
186		pcbgroup->ipg_cpu = rss_getcpu(pgn);
187#else
188		pcbgroup->ipg_cpu = (pgn % mp_ncpus);
189#endif
190	}
191}
192
193void
194in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
195{
196	struct inpcbgroup *pcbgroup;
197	u_int pgn;
198
199	if (pcbinfo->ipi_npcbgroups == 0)
200		return;
201
202	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
203		pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
204		KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
205		    ("in_pcbinfo_destroy: listhead not empty"));
206		INP_GROUP_LOCK_DESTROY(pcbgroup);
207		hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
208		    pcbgroup->ipg_hashmask);
209	}
210	hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
211	free(pcbinfo->ipi_pcbgroups, M_PCB);
212	pcbinfo->ipi_pcbgroups = NULL;
213	pcbinfo->ipi_npcbgroups = 0;
214	pcbinfo->ipi_hashfields = 0;
215}
216
217/*
218 * Given a hash of whatever the covered tuple might be, return a pcbgroup
219 * index.  Where RSS is supported, try to align bucket selection with RSS CPU
220 * affinity strategy.
221 */
222static __inline u_int
223in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
224{
225
226#ifdef RSS
227	return (rss_getbucket(hash));
228#else
229	return (hash % pcbinfo->ipi_npcbgroups);
230#endif
231}
232
233/*
234 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
235 * information is insufficient to identify the pcbgroup.  This might occur if
236 * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
237 * RSS is not compiled into the kernel.
238 */
239struct inpcbgroup *
240in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
241{
242
243#ifdef RSS
244	if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
245	    hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
246	    (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
247	    hashtype == M_HASHTYPE_RSS_IPV4))
248		return (&pcbinfo->ipi_pcbgroups[
249		    in_pcbgroup_getbucket(pcbinfo, hash)]);
250#endif
251	return (NULL);
252}
253
254static struct inpcbgroup *
255in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
256{
257
258	return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
259	    m->m_pkthdr.flowid));
260}
261
262struct inpcbgroup *
263in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
264    u_short lport, struct in_addr faddr, u_short fport)
265{
266	uint32_t hash;
267
268	/*
269	 * RSS note: we pass foreign addr/port as source, and local addr/port
270	 * as destination, as we want to align with what the hardware is
271	 * doing.
272	 */
273	switch (pcbinfo->ipi_hashfields) {
274	case IPI_HASHFIELDS_4TUPLE:
275#ifdef RSS
276		hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
277#else
278		hash = faddr.s_addr ^ fport;
279#endif
280		break;
281
282	case IPI_HASHFIELDS_2TUPLE:
283#ifdef RSS
284		hash = rss_hash_ip4_2tuple(faddr, laddr);
285#else
286		hash = faddr.s_addr ^ laddr.s_addr;
287#endif
288		break;
289
290	default:
291		hash = 0;
292	}
293	return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
294	    hash)]);
295}
296
297struct inpcbgroup *
298in_pcbgroup_byinpcb(struct inpcb *inp)
299{
300#ifdef	RSS
301	/*
302	 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
303	 * RSS bucket and thus we should use this pcbgroup, rather than
304	 * using a tuple or hash.
305	 *
306	 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
307	 * fits in that!
308	 */
309	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
310		return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
311#endif
312
313	return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
314	    inp->inp_lport, inp->inp_faddr, inp->inp_fport));
315}
316
317static void
318in_pcbwild_add(struct inpcb *inp)
319{
320	struct inpcbinfo *pcbinfo;
321	struct inpcbhead *head;
322	u_int pgn;
323
324	INP_WLOCK_ASSERT(inp);
325	KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
326	    ("%s: is wild",__func__));
327
328	pcbinfo = inp->inp_pcbinfo;
329	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
330		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
331	head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
332	    0, pcbinfo->ipi_wildmask)];
333	LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
334	inp->inp_flags2 |= INP_PCBGROUPWILD;
335	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
336		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
337}
338
339static void
340in_pcbwild_remove(struct inpcb *inp)
341{
342	struct inpcbinfo *pcbinfo;
343	u_int pgn;
344
345	INP_WLOCK_ASSERT(inp);
346	KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
347	    ("%s: not wild", __func__));
348
349	pcbinfo = inp->inp_pcbinfo;
350	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
351		INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
352	LIST_REMOVE(inp, inp_pcbgroup_wild);
353	for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
354		INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
355	inp->inp_flags2 &= ~INP_PCBGROUPWILD;
356}
357
358static __inline int
359in_pcbwild_needed(struct inpcb *inp)
360{
361#ifdef	RSS
362	/*
363	 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
364	 * it's a wildcard socket _but_ it's in a specific pcbgroup.
365	 * Thus we don't treat it as a pcbwild inp.
366	 */
367	if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
368		return (0);
369#endif
370
371#ifdef INET6
372	if (inp->inp_vflag & INP_IPV6)
373		return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
374	else
375#endif
376		return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
377}
378
379static void
380in_pcbwild_update_internal(struct inpcb *inp)
381{
382	int wildcard_needed;
383
384	wildcard_needed = in_pcbwild_needed(inp);
385	if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
386		in_pcbwild_add(inp);
387	else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
388		in_pcbwild_remove(inp);
389}
390
/*
 * Update the pcbgroup of an inpcb, which might include removing an old
 * pcbgroup reference and/or adding a new one.  Wildcard processing is not
 * performed here, although ideally we'll never install a pcbgroup for a
 * wildcard inpcb (asserted below).
 *
 * The caller holds the inpcb write lock; per-group locks are taken here
 * as needed.  Passing a NULL newpcbgroup simply removes any existing
 * group membership.
 */
static void
in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
    struct inpcbgroup *newpcbgroup, struct inpcb *inp)
{
	struct inpcbgroup *oldpcbgroup;
	struct inpcbhead *pcbhash;
	uint32_t hashkey_faddr;

	INP_WLOCK_ASSERT(inp);

	/* Drop the old group membership first, if it is changing. */
	oldpcbgroup = inp->inp_pcbgroup;
	if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
		INP_GROUP_LOCK(oldpcbgroup);
		LIST_REMOVE(inp, inp_pcbgrouphash);
		inp->inp_pcbgroup = NULL;
		INP_GROUP_UNLOCK(oldpcbgroup);
	}
	if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
#ifdef INET6
		/* For IPv6, only the low 32 bits of faddr feed the hash. */
		if (inp->inp_vflag & INP_IPV6)
			hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */
		else
#endif
			hashkey_faddr = inp->inp_faddr.s_addr;
		INP_GROUP_LOCK(newpcbgroup);
		/*
		 * If the inp is an RSS bucket wildcard entry, ensure
		 * that the PCB hash is calculated correctly.
		 *
		 * The wildcard hash calculation differs from the
		 * non-wildcard definition.  The source address is
		 * INADDR_ANY and the far port is 0.
		 */
		if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
			pcbhash = &newpcbgroup->ipg_hashbase[
			    INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
			    newpcbgroup->ipg_hashmask)];
		} else {
			pcbhash = &newpcbgroup->ipg_hashbase[
			    INP_PCBHASH(hashkey_faddr, inp->inp_lport,
			    inp->inp_fport,
			    newpcbgroup->ipg_hashmask)];
		}
		LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
		inp->inp_pcbgroup = newpcbgroup;
		INP_GROUP_UNLOCK(newpcbgroup);
	}

	/* A wildcard inpcb must never end up owning a pcbgroup. */
	KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
	    ("%s: pcbgroup and wildcard!", __func__));
}
448
449/*
450 * Two update paths: one in which the 4-tuple on an inpcb has been updated
451 * and therefore connection groups may need to change (or a wildcard entry
452 * may needed to be installed), and another in which the 4-tuple has been
453 * set as a result of a packet received, in which case we may be able to use
454 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
455 *
456 * In each case: first, let the wildcard code have a go at placing it as a
457 * wildcard socket.  If it was a wildcard, or if the connection has been
458 * dropped, then no pcbgroup is required (so potentially clear it);
459 * otherwise, calculate and update the pcbgroup for the inpcb.
460 */
461void
462in_pcbgroup_update(struct inpcb *inp)
463{
464	struct inpcbinfo *pcbinfo;
465	struct inpcbgroup *newpcbgroup;
466
467	INP_WLOCK_ASSERT(inp);
468
469	pcbinfo = inp->inp_pcbinfo;
470	if (!in_pcbgroup_enabled(pcbinfo))
471		return;
472
473	in_pcbwild_update_internal(inp);
474	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
475	    !(inp->inp_flags & INP_DROPPED)) {
476#ifdef INET6
477		if (inp->inp_vflag & INP_IPV6)
478			newpcbgroup = in6_pcbgroup_byinpcb(inp);
479		else
480#endif
481			newpcbgroup = in_pcbgroup_byinpcb(inp);
482	} else
483		newpcbgroup = NULL;
484	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
485}
486
487void
488in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
489{
490	struct inpcbinfo *pcbinfo;
491	struct inpcbgroup *newpcbgroup;
492
493	INP_WLOCK_ASSERT(inp);
494
495	pcbinfo = inp->inp_pcbinfo;
496	if (!in_pcbgroup_enabled(pcbinfo))
497		return;
498
499	/*
500	 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
501	 * it; presumably this function should never be called for anything
502	 * other than non-wildcard socket?
503	 */
504	in_pcbwild_update_internal(inp);
505	if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
506	    !(inp->inp_flags & INP_DROPPED)) {
507		newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
508#ifdef INET6
509		if (inp->inp_vflag & INP_IPV6) {
510			if (newpcbgroup == NULL)
511				newpcbgroup = in6_pcbgroup_byinpcb(inp);
512		} else {
513#endif
514			if (newpcbgroup == NULL)
515				newpcbgroup = in_pcbgroup_byinpcb(inp);
516#ifdef INET6
517		}
518#endif
519	} else
520		newpcbgroup = NULL;
521	in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
522}
523
524/*
525 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
526 */
527void
528in_pcbgroup_remove(struct inpcb *inp)
529{
530	struct inpcbgroup *pcbgroup;
531
532	INP_WLOCK_ASSERT(inp);
533
534	if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
535		return;
536
537	if (inp->inp_flags2 & INP_PCBGROUPWILD)
538		in_pcbwild_remove(inp);
539
540	pcbgroup = inp->inp_pcbgroup;
541	if (pcbgroup != NULL) {
542		INP_GROUP_LOCK(pcbgroup);
543		LIST_REMOVE(inp, inp_pcbgrouphash);
544		inp->inp_pcbgroup = NULL;
545		INP_GROUP_UNLOCK(pcbgroup);
546	}
547}
548
549/*
550 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
551 * for a protocol.
552 */
553int
554in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
555{
556
557	return (pcbinfo->ipi_npcbgroups > 0);
558}
559